//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
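//
// Illustrative sketch (editor's note, not part of the original source): with a
// vectorization factor VF=4, a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 1;
//
// is conceptually rewritten so that each 'wide' iteration processes four
// elements at once and the induction variable advances by VF:
//
//   for (int i = 0; i + 4 <= n; i += 4)
//     a[i..i+3] = b[i..i+3] + <1, 1, 1, 1>;   // one SIMD iteration
//   // the leftover n % 4 iterations run in the scalar epilogue loop
//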

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks with a " 204 "vectorize(enable) pragma.")); 205 206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 207 // that predication is preferred, and this lists all options. I.e., the 208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 209 // and predicate the instructions accordingly. 
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));
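
// Editor's illustration (not part of the original source): to ask the
// vectorizer to fold the tail into the vector body and only fall back to a
// scalar epilogue if tail-folding fails, one could pass e.g.
//
//   opt -passes=loop-vectorize \
//       -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue ...
//
// The enumerator names above are the authoritative reference; the exact opt
// invocation is illustrative only.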
Mostly " 279 "useful for getting consistent testing.")); 280 281 static cl::opt<bool> ForceTargetSupportsScalableVectors( 282 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 283 cl::desc( 284 "Pretend that scalable vectors are supported, even if the target does " 285 "not support them. This flag should only be used for testing.")); 286 287 static cl::opt<unsigned> SmallLoopCost( 288 "small-loop-cost", cl::init(20), cl::Hidden, 289 cl::desc( 290 "The cost of a loop that is considered 'small' by the interleaver.")); 291 292 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 293 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 294 cl::desc("Enable the use of the block frequency analysis to access PGO " 295 "heuristics minimizing code growth in cold regions and being more " 296 "aggressive in hot regions.")); 297 298 // Runtime interleave loops for load/store throughput. 299 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 300 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 301 cl::desc( 302 "Enable runtime interleaving until load/store ports are saturated")); 303 304 /// Interleave small loops with scalar reductions. 305 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 306 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 307 cl::desc("Enable interleaving for loops with small iteration counts that " 308 "contain scalar reductions to expose ILP.")); 309 310 /// The number of stores in a loop that are allowed to need predication. 311 static cl::opt<unsigned> NumberOfStoresToPredicate( 312 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 313 cl::desc("Max number of stores to be predicated behind an if.")); 314 315 static cl::opt<bool> EnableIndVarRegisterHeur( 316 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 317 cl::desc("Count the induction variable only once when interleaving")); 318 319 static cl::opt<bool> EnableCondStoresVectorization( 320 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 321 cl::desc("Enable if predication of stores during vectorization.")); 322 323 static cl::opt<unsigned> MaxNestedScalarReductionIC( 324 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 325 cl::desc("The maximum interleave count to use when interleaving a scalar " 326 "reduction in a nested loop.")); 327 328 static cl::opt<bool> 329 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 330 cl::Hidden, 331 cl::desc("Prefer in-loop vector reductions, " 332 "overriding the targets preference.")); 333 334 static cl::opt<bool> ForceOrderedReductions( 335 "force-ordered-reductions", cl::init(false), cl::Hidden, 336 cl::desc("Enable the vectorisation of loops with in-order (strict) " 337 "FP reductions")); 338 339 static cl::opt<bool> PreferPredicatedReductionSelect( 340 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 341 cl::desc( 342 "Prefer predicating a reduction operation over an after loop select.")); 343 344 cl::opt<bool> EnableVPlanNativePath( 345 "enable-vplan-native-path", cl::init(false), cl::Hidden, 346 cl::desc("Enable VPlan-native vectorization path with " 347 "support for outer loop vectorization.")); 348 349 // FIXME: Remove this switch once we have divergence analysis. Currently we 350 // assume divergent non-backedge branches when this switch is true. 
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
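
// Editor's illustrative example (not part of the original source): under a
// typical x86-64 data layout, x86_fp80 has a type size of 80 bits but an
// alloc size of 128 bits, so an array of x86_fp80 contains padding between
// elements and is not bitcast compatible with <N x x86_fp80>; such a type is
// "irregular" in the sense checked above. Types like i32 or float, whose
// alloc size equals their type size, are regular.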

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID,
                             Value *Start, TruncInst *Trunc, VPValue *Def,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Set the debug location in the builder \p CustomBuilder using the debug
  /// location in \p V. If \p CustomBuilder is None then it uses the class
  /// member's Builder.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilder<> *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we are
  /// able to vectorize with strict in-order reductions for the given RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *
  getStepVector(Value *Val, Value *StartIdx, Value *Step,
                Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd);
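
  // Editor's illustrative example (not part of the original source): for an
  // integer induction with Val = <i32 0, 0, 0, 0>, StartIdx = 0 and Step = 2,
  // getStepVector produces <i32 0, 2, 4, 6>; with StartIdx = 4 it produces
  // <i32 8, 10, 12, 14>.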

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Collect poison-generating recipes that may generate a poison value that is
  /// used after vectorization, even when their operands are not poison. Those
  /// recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(
      Value *Val, Value *StartIdx, Value *Step,
      Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

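// Editor's illustrative sketch (hypothetical values, not part of the original
// source): a caller planning a VF=16 main loop unrolled by 2 with a VF=8
// epilogue would construct the state object as
//
//   EpilogueLoopVectorizationInfo EPI(ElementCount::getFixed(16), /*MUF=*/2,
//                                     ElementCount::getFixed(8), /*EUF=*/1);
//
// The constructor asserts EUF == 1, since a high unroll factor for the
// epilogue loop is not expected to be beneficial.
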
/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
  IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When an FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
                              int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}
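
// Editor's illustrative example (not part of the original source): for a
// fixed-width factor, createStepForVF(B, Int64Ty, ElementCount::getFixed(4),
// /*Step=*/2) returns the constant i64 8; for a scalable factor such as
// ElementCount::getScalable(4) it returns 8 * vscale, materialized via
// B.CreateVScale().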

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
            Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
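
// Editor's illustrative note (not part of the original source): this ordering
// sorts all fixed-width factors before all scalable ones, each group by its
// minimum element count, e.g.
//   2 < 4 < 8 < vscale x 2 < vscale x 4.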

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way, and
  /// the form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
1363 void collectInLoopReductions(); 1364 1365 /// Returns true if we should use strict in-order reductions for the given 1366 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1367 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1368 /// of FP operations. 1369 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { 1370 return !Hints->allowReordering() && RdxDesc.isOrdered(); 1371 } 1372 1373 /// \returns The smallest bitwidth each instruction can be represented with. 1374 /// The vector equivalents of these instructions should be truncated to this 1375 /// type. 1376 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1377 return MinBWs; 1378 } 1379 1380 /// \returns True if it is more profitable to scalarize instruction \p I for 1381 /// vectorization factor \p VF. 1382 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1383 assert(VF.isVector() && 1384 "Profitable to scalarize relevant only for VF > 1."); 1385 1386 // Cost model is not run in the VPlan-native path - return conservative 1387 // result until this changes. 1388 if (EnableVPlanNativePath) 1389 return false; 1390 1391 auto Scalars = InstsToScalarize.find(VF); 1392 assert(Scalars != InstsToScalarize.end() && 1393 "VF not yet analyzed for scalarization profitability"); 1394 return Scalars->second.find(I) != Scalars->second.end(); 1395 } 1396 1397 /// Returns true if \p I is known to be uniform after vectorization. 1398 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1399 if (VF.isScalar()) 1400 return true; 1401 1402 // Cost model is not run in the VPlan-native path - return conservative 1403 // result until this changes. 1404 if (EnableVPlanNativePath) 1405 return false; 1406 1407 auto UniformsPerVF = Uniforms.find(VF); 1408 assert(UniformsPerVF != Uniforms.end() && 1409 "VF not yet analyzed for uniformity"); 1410 return UniformsPerVF->second.count(I); 1411 } 1412 1413 /// Returns true if \p I is known to be scalar after vectorization. 1414 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1415 if (VF.isScalar()) 1416 return true; 1417 1418 // Cost model is not run in the VPlan-native path - return conservative 1419 // result until this changes. 1420 if (EnableVPlanNativePath) 1421 return false; 1422 1423 auto ScalarsPerVF = Scalars.find(VF); 1424 assert(ScalarsPerVF != Scalars.end() && 1425 "Scalar values are not calculated for VF"); 1426 return ScalarsPerVF->second.count(I); 1427 } 1428 1429 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1430 /// for vectorization factor \p VF. 1431 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1432 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1433 !isProfitableToScalarize(I, VF) && 1434 !isScalarAfterVectorization(I, VF); 1435 } 1436 1437 /// Decision that was taken during cost calculation for memory instruction. 1438 enum InstWidening { 1439 CM_Unknown, 1440 CM_Widen, // For consecutive accesses with stride +1. 1441 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1442 CM_Interleave, 1443 CM_GatherScatter, 1444 CM_Scalarize 1445 }; 1446 1447 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1448 /// instruction \p I and vector width \p VF. 
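  /// As a rough illustration of the possible decisions (the actual choice is
  /// cost-driven and target-dependent; A, B and c are made-up names), for a
  /// loop over i:
  ///   x = A[i];          // consecutive, stride +1       -> CM_Widen
  ///   y = A[N - i];      // consecutive, stride -1       -> CM_Widen_Reverse
  ///   z = A[B[i]];       // arbitrary indices            -> CM_GatherScatter
  ///   if (c[i])
  ///     A[i] = z;        // predicated store with no legal masked form
  ///                      //                              -> CM_Scalarize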
1449 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, 1450 InstructionCost Cost) { 1451 assert(VF.isVector() && "Expected VF >=2"); 1452 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1453 } 1454 1455 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1456 /// interleaving group \p Grp and vector width \p VF. 1457 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, 1458 ElementCount VF, InstWidening W, 1459 InstructionCost Cost) { 1460 assert(VF.isVector() && "Expected VF >=2"); 1461 /// Broadcast this decicion to all instructions inside the group. 1462 /// But the cost will be assigned to one instruction only. 1463 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1464 if (auto *I = Grp->getMember(i)) { 1465 if (Grp->getInsertPos() == I) 1466 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1467 else 1468 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1469 } 1470 } 1471 } 1472 1473 /// Return the cost model decision for the given instruction \p I and vector 1474 /// width \p VF. Return CM_Unknown if this instruction did not pass 1475 /// through the cost modeling. 1476 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1477 assert(VF.isVector() && "Expected VF to be a vector VF"); 1478 // Cost model is not run in the VPlan-native path - return conservative 1479 // result until this changes. 1480 if (EnableVPlanNativePath) 1481 return CM_GatherScatter; 1482 1483 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1484 auto Itr = WideningDecisions.find(InstOnVF); 1485 if (Itr == WideningDecisions.end()) 1486 return CM_Unknown; 1487 return Itr->second.first; 1488 } 1489 1490 /// Return the vectorization cost for the given instruction \p I and vector 1491 /// width \p VF. 1492 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1493 assert(VF.isVector() && "Expected VF >=2"); 1494 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1495 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1496 "The cost is not calculated"); 1497 return WideningDecisions[InstOnVF].second; 1498 } 1499 1500 /// Return True if instruction \p I is an optimizable truncate whose operand 1501 /// is an induction variable. Such a truncate will be removed by adding a new 1502 /// induction variable with the destination type. 1503 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1504 // If the instruction is not a truncate, return false. 1505 auto *Trunc = dyn_cast<TruncInst>(I); 1506 if (!Trunc) 1507 return false; 1508 1509 // Get the source and destination types of the truncate. 1510 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1511 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1512 1513 // If the truncate is free for the given types, return false. Replacing a 1514 // free truncate with an induction variable would add an induction variable 1515 // update instruction to each iteration of the loop. We exclude from this 1516 // check the primary induction variable since it will need an update 1517 // instruction regardless. 1518 Value *Op = Trunc->getOperand(0); 1519 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1520 return false; 1521 1522 // If the truncated value is not an induction variable, return false. 
1523 return Legal->isInductionPhi(Op); 1524 } 1525 1526 /// Collects the instructions to scalarize for each predicated instruction in 1527 /// the loop. 1528 void collectInstsToScalarize(ElementCount VF); 1529 1530 /// Collect Uniform and Scalar values for the given \p VF. 1531 /// The sets depend on CM decision for Load/Store instructions 1532 /// that may be vectorized as interleave, gather-scatter or scalarized. 1533 void collectUniformsAndScalars(ElementCount VF) { 1534 // Do the analysis once. 1535 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1536 return; 1537 setCostBasedWideningDecision(VF); 1538 collectLoopUniforms(VF); 1539 collectLoopScalars(VF); 1540 } 1541 1542 /// Returns true if the target machine supports masked store operation 1543 /// for the given \p DataType and kind of access to \p Ptr. 1544 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1545 return Legal->isConsecutivePtr(DataType, Ptr) && 1546 TTI.isLegalMaskedStore(DataType, Alignment); 1547 } 1548 1549 /// Returns true if the target machine supports masked load operation 1550 /// for the given \p DataType and kind of access to \p Ptr. 1551 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1552 return Legal->isConsecutivePtr(DataType, Ptr) && 1553 TTI.isLegalMaskedLoad(DataType, Alignment); 1554 } 1555 1556 /// Returns true if the target machine can represent \p V as a masked gather 1557 /// or scatter operation. 1558 bool isLegalGatherOrScatter(Value *V) { 1559 bool LI = isa<LoadInst>(V); 1560 bool SI = isa<StoreInst>(V); 1561 if (!LI && !SI) 1562 return false; 1563 auto *Ty = getLoadStoreType(V); 1564 Align Align = getLoadStoreAlignment(V); 1565 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1566 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1567 } 1568 1569 /// Returns true if the target machine supports all of the reduction 1570 /// variables found for the given VF. 1571 bool canVectorizeReductions(ElementCount VF) const { 1572 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1573 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1574 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1575 })); 1576 } 1577 1578 /// Returns true if \p I is an instruction that will be scalarized with 1579 /// predication. Such instructions include conditional stores and 1580 /// instructions that may divide by zero. 1581 /// If a non-zero VF has been calculated, we check if I will be scalarized 1582 /// predication for that VF. 1583 bool isScalarWithPredication(Instruction *I) const; 1584 1585 // Returns true if \p I is an instruction that will be predicated either 1586 // through scalar predication or masked load/store or masked gather/scatter. 1587 // Superset of instructions that return true for isScalarWithPredication. 1588 bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) { 1589 // When we know the load is uniform and the original scalar loop was not 1590 // predicated we don't need to mark it as a predicated instruction. Any 1591 // vectorised blocks created when tail-folding are something artificial we 1592 // have introduced and we know there is always at least one active lane. 1593 // That's why we call Legal->blockNeedsPredication here because it doesn't 1594 // query tail-folding. 
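    // For example (illustrative only): in
    //   for (i = 0; i < n; ++i)
    //     if (c[i])
    //       A[i] = B[i] / D[i];
    // both the division (it may divide by zero in a lane whose condition is
    // false) and the store are predicated instructions here, while an
    // unconditional load of a loop-invariant value stays unpredicated even
    // under tail-folding, per the check below.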
1595 if (IsKnownUniform && isa<LoadInst>(I) && 1596 !Legal->blockNeedsPredication(I->getParent())) 1597 return false; 1598 if (!blockNeedsPredicationForAnyReason(I->getParent())) 1599 return false; 1600 // Loads and stores that need some form of masked operation are predicated 1601 // instructions. 1602 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1603 return Legal->isMaskRequired(I); 1604 return isScalarWithPredication(I); 1605 } 1606 1607 /// Returns true if \p I is a memory instruction with consecutive memory 1608 /// access that can be widened. 1609 bool 1610 memoryInstructionCanBeWidened(Instruction *I, 1611 ElementCount VF = ElementCount::getFixed(1)); 1612 1613 /// Returns true if \p I is a memory instruction in an interleaved-group 1614 /// of memory accesses that can be vectorized with wide vector loads/stores 1615 /// and shuffles. 1616 bool 1617 interleavedAccessCanBeWidened(Instruction *I, 1618 ElementCount VF = ElementCount::getFixed(1)); 1619 1620 /// Check if \p Instr belongs to any interleaved access group. 1621 bool isAccessInterleaved(Instruction *Instr) { 1622 return InterleaveInfo.isInterleaved(Instr); 1623 } 1624 1625 /// Get the interleaved access group that \p Instr belongs to. 1626 const InterleaveGroup<Instruction> * 1627 getInterleavedAccessGroup(Instruction *Instr) { 1628 return InterleaveInfo.getInterleaveGroup(Instr); 1629 } 1630 1631 /// Returns true if we're required to use a scalar epilogue for at least 1632 /// the final iteration of the original loop. 1633 bool requiresScalarEpilogue(ElementCount VF) const { 1634 if (!isScalarEpilogueAllowed()) 1635 return false; 1636 // If we might exit from anywhere but the latch, must run the exiting 1637 // iteration in scalar form. 1638 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1639 return true; 1640 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); 1641 } 1642 1643 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1644 /// loop hint annotation. 1645 bool isScalarEpilogueAllowed() const { 1646 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1647 } 1648 1649 /// Returns true if all loop blocks should be masked to fold tail loop. 1650 bool foldTailByMasking() const { return FoldTailByMasking; } 1651 1652 /// Returns true if the instructions in this block requires predication 1653 /// for any reason, e.g. because tail folding now requires a predicate 1654 /// or because the block in the original loop was predicated. 1655 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1656 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1657 } 1658 1659 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1660 /// nodes to the chain of instructions representing the reductions. Uses a 1661 /// MapVector to ensure deterministic iteration order. 1662 using ReductionChainMap = 1663 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1664 1665 /// Return the chain of instructions representing an inloop reduction. 1666 const ReductionChainMap &getInLoopReductionChains() const { 1667 return InLoopReductionChains; 1668 } 1669 1670 /// Returns true if the Phi is part of an inloop reduction. 1671 bool isInLoopReduction(PHINode *Phi) const { 1672 return InLoopReductionChains.count(Phi); 1673 } 1674 1675 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1676 /// with factor VF. Return the cost of the instruction, including 1677 /// scalarization overhead if it's needed. 
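  /// Roughly speaking (an illustration, not a guarantee about any target):
  /// for a call to llvm.sqrt.f32 with VF = 4 this asks TTI for the cost of
  /// the equivalent llvm.sqrt.v4f32 operation; if the target has no vector
  /// form, the returned cost instead reflects VF scalar calls plus the
  /// extract/insert overhead needed to scalarize the operands and results.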
1678 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1679 1680 /// Estimate cost of a call instruction CI if it were vectorized with factor 1681 /// VF. Return the cost of the instruction, including scalarization overhead 1682 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1683 /// scalarized, 1684 /// i.e. either a vector version isn't available or it is too expensive. 1685 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1686 bool &NeedToScalarize) const; 1687 1688 /// Returns true if the per-lane cost of VectorizationFactor A is lower than 1689 /// that of B. 1690 bool isMoreProfitable(const VectorizationFactor &A, 1691 const VectorizationFactor &B) const; 1692 1693 /// Invalidates decisions already taken by the cost model. 1694 void invalidateCostModelingDecisions() { 1695 WideningDecisions.clear(); 1696 Uniforms.clear(); 1697 Scalars.clear(); 1698 } 1699 1700 private: 1701 unsigned NumPredStores = 0; 1702 1703 /// \return An upper bound for the vectorization factors for both 1704 /// fixed and scalable vectorization, where the minimum-known number of 1705 /// elements is a power-of-2 larger than zero. If scalable vectorization is 1706 /// disabled or unsupported, then the scalable part will be equal to 1707 /// ElementCount::getScalable(0). 1708 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, 1709 ElementCount UserVF, 1710 bool FoldTailByMasking); 1711 1712 /// \return the maximized element count based on the target's vector 1713 /// registers and the loop trip-count, but limited to a maximum safe VF. 1714 /// This is a helper function of computeFeasibleMaxVF. 1715 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure 1716 /// issue that occurred on one of the buildbots which cannot be reproduced 1717 /// without having access to the proprietary compiler (see comments on 1718 /// D98509). The issue is currently under investigation and this workaround 1719 /// will be removed as soon as possible. 1720 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, 1721 unsigned SmallestType, 1722 unsigned WidestType, 1723 const ElementCount &MaxSafeVF, 1724 bool FoldTailByMasking); 1725 1726 /// \return the maximum legal scalable VF, based on the safe max number 1727 /// of elements. 1728 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1729 1730 /// The vectorization cost is a combination of the cost itself and a boolean 1731 /// indicating whether any of the contributing operations will actually 1732 /// operate on vector values after type legalization in the backend. If this 1733 /// latter value is false, then all operations will be scalarized (i.e. no 1734 /// vectorization has actually taken place). 1735 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1736 1737 /// Returns the expected execution cost. The unit of the cost does 1738 /// not matter because we use the 'cost' units to compare different 1739 /// vector widths. The cost that is returned is *not* normalized by 1740 /// the factor width. If \p Invalid is not nullptr, this function 1741 /// will add a pair(Instruction*, ElementCount) to \p Invalid for 1742 /// each instruction that has an Invalid cost for the given VF.
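  /// A sketch of how the result is typically used (the numbers are made up):
  /// if the scalar loop body costs 8 per iteration and expectedCost(VF=4)
  /// returns {24, true}, the per-lane cost is 24 / 4 = 6 < 8, so VF=4 looks
  /// profitable. If the returned bool were false, no operation would remain a
  /// vector operation after legalization, i.e. the "vectorized" loop would in
  /// fact be fully scalarized.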
1743 using InstructionVFPair = std::pair<Instruction *, ElementCount>; 1744 VectorizationCostTy 1745 expectedCost(ElementCount VF, 1746 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); 1747 1748 /// Returns the execution time cost of an instruction for a given vector 1749 /// width. Vector width of one means scalar. 1750 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1751 1752 /// The cost-computation logic from getInstructionCost which provides 1753 /// the vector type as an output parameter. 1754 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1755 Type *&VectorTy); 1756 1757 /// Return the cost of instructions in an inloop reduction pattern, if I is 1758 /// part of that pattern. 1759 Optional<InstructionCost> 1760 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, 1761 TTI::TargetCostKind CostKind); 1762 1763 /// Calculate vectorization cost of memory instruction \p I. 1764 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1765 1766 /// The cost computation for scalarized memory instruction. 1767 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1768 1769 /// The cost computation for interleaving group of memory instructions. 1770 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1771 1772 /// The cost computation for Gather/Scatter instruction. 1773 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1774 1775 /// The cost computation for widening instruction \p I with consecutive 1776 /// memory access. 1777 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1778 1779 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1780 /// Load: scalar load + broadcast. 1781 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1782 /// element) 1783 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1784 1785 /// Estimate the overhead of scalarizing an instruction. This is a 1786 /// convenience wrapper for the type-based getScalarizationOverhead API. 1787 InstructionCost getScalarizationOverhead(Instruction *I, 1788 ElementCount VF) const; 1789 1790 /// Returns whether the instruction is a load or store and will be emitted 1791 /// as a vector operation. 1792 bool isConsecutiveLoadOrStore(Instruction *I); 1793 1794 /// Returns true if an artificially high cost for emulated masked memrefs 1795 /// should be used. 1796 bool useEmulatedMaskMemRefHack(Instruction *I); 1797 1798 /// Map of scalar integer values to the smallest bitwidth they can be legally 1799 /// represented as. The vector equivalents of these values should be truncated 1800 /// to this type. 1801 MapVector<Instruction *, uint64_t> MinBWs; 1802 1803 /// A type representing the costs for instructions if they were to be 1804 /// scalarized rather than vectorized. The entries are Instruction-Cost 1805 /// pairs. 1806 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; 1807 1808 /// A set containing all BasicBlocks that are known to be present after 1809 /// vectorization as a predicated block. 1810 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; 1811 1812 /// Records whether it is allowed to have the original scalar loop execute at 1813 /// least once.
This may be needed as a fallback loop in case runtime 1814 /// aliasing/dependence checks fail, or to handle the tail/remainder 1815 /// iterations when the trip count is unknown or doesn't divide by the VF, 1816 /// or as a peel-loop to handle gaps in interleave-groups. 1817 /// Under optsize and when the trip count is very small we don't allow any 1818 /// iterations to execute in the scalar loop. 1819 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1820 1821 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1822 bool FoldTailByMasking = false; 1823 1824 /// A map holding scalar costs for different vectorization factors. The 1825 /// presence of a cost for an instruction in the mapping indicates that the 1826 /// instruction will be scalarized when vectorizing with the associated 1827 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1828 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1829 1830 /// Holds the instructions known to be uniform after vectorization. 1831 /// The data is collected per VF. 1832 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1833 1834 /// Holds the instructions known to be scalar after vectorization. 1835 /// The data is collected per VF. 1836 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1837 1838 /// Holds the instructions (address computations) that are forced to be 1839 /// scalarized. 1840 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1841 1842 /// PHINodes of the reductions that should be expanded in-loop along with 1843 /// their associated chains of reduction operations, in program order from top 1844 /// (PHI) to bottom 1845 ReductionChainMap InLoopReductionChains; 1846 1847 /// A Map of inloop reduction operations and their immediate chain operand. 1848 /// FIXME: This can be removed once reductions can be costed correctly in 1849 /// vplan. This was added to allow quick lookup to the inloop operations, 1850 /// without having to loop through InLoopReductionChains. 1851 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1852 1853 /// Returns the expected difference in cost from scalarizing the expression 1854 /// feeding a predicated instruction \p PredInst. The instructions to 1855 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1856 /// non-negative return value implies the expression will be scalarized. 1857 /// Currently, only single-use chains are considered for scalarization. 1858 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1859 ElementCount VF); 1860 1861 /// Collect the instructions that are uniform after vectorization. An 1862 /// instruction is uniform if we represent it with a single scalar value in 1863 /// the vectorized loop corresponding to each vector iteration. Examples of 1864 /// uniform instructions include pointer operands of consecutive or 1865 /// interleaved memory accesses. Note that although uniformity implies an 1866 /// instruction will be scalar, the reverse is not true. In general, a 1867 /// scalarized instruction will be represented by VF scalar values in the 1868 /// vectorized loop, each corresponding to an iteration of the original 1869 /// scalar loop. 1870 void collectLoopUniforms(ElementCount VF); 1871 1872 /// Collect the instructions that are scalar after vectorization. An 1873 /// instruction is scalar if it is known to be uniform or will be scalarized 1874 /// during vectorization. 
collectLoopScalars should only add non-uniform nodes 1875 /// to the list if they are used by a load/store instruction that is marked as 1876 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1877 /// VF values in the vectorized loop, each corresponding to an iteration of 1878 /// the original scalar loop. 1879 void collectLoopScalars(ElementCount VF); 1880 1881 /// Keeps cost model vectorization decision and cost for instructions. 1882 /// Right now it is used for memory instructions only. 1883 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1884 std::pair<InstWidening, InstructionCost>>; 1885 1886 DecisionList WideningDecisions; 1887 1888 /// Returns true if \p V is expected to be vectorized and it needs to be 1889 /// extracted. 1890 bool needsExtract(Value *V, ElementCount VF) const { 1891 Instruction *I = dyn_cast<Instruction>(V); 1892 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1893 TheLoop->isLoopInvariant(I)) 1894 return false; 1895 1896 // Assume we can vectorize V (and hence we need extraction) if the 1897 // scalars are not computed yet. This can happen, because it is called 1898 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1899 // the scalars are collected. That should be a safe assumption in most 1900 // cases, because we check if the operands have vectorizable types 1901 // beforehand in LoopVectorizationLegality. 1902 return Scalars.find(VF) == Scalars.end() || 1903 !isScalarAfterVectorization(I, VF); 1904 }; 1905 1906 /// Returns a range containing only operands needing to be extracted. 1907 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1908 ElementCount VF) const { 1909 return SmallVector<Value *, 4>(make_filter_range( 1910 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1911 } 1912 1913 /// Determines if we have the infrastructure to vectorize loop \p L and its 1914 /// epilogue, assuming the main loop is vectorized by \p VF. 1915 bool isCandidateForEpilogueVectorization(const Loop &L, 1916 const ElementCount VF) const; 1917 1918 /// Returns true if epilogue vectorization is considered profitable, and 1919 /// false otherwise. 1920 /// \p VF is the vectorization factor chosen for the original loop. 1921 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1922 1923 public: 1924 /// The loop that we evaluate. 1925 Loop *TheLoop; 1926 1927 /// Predicated scalar evolution analysis. 1928 PredicatedScalarEvolution &PSE; 1929 1930 /// Loop Info analysis. 1931 LoopInfo *LI; 1932 1933 /// Vectorization legality. 1934 LoopVectorizationLegality *Legal; 1935 1936 /// Vector target information. 1937 const TargetTransformInfo &TTI; 1938 1939 /// Target Library Info. 1940 const TargetLibraryInfo *TLI; 1941 1942 /// Demanded bits analysis. 1943 DemandedBits *DB; 1944 1945 /// Assumption cache. 1946 AssumptionCache *AC; 1947 1948 /// Interface to emit optimization remarks. 1949 OptimizationRemarkEmitter *ORE; 1950 1951 const Function *TheFunction; 1952 1953 /// Loop Vectorize Hint. 1954 const LoopVectorizeHints *Hints; 1955 1956 /// The interleave access information contains groups of interleaved accesses 1957 /// with the same stride and close to each other. 1958 InterleavedAccessInfo &InterleaveInfo; 1959 1960 /// Values to ignore in the cost model. 1961 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1962 1963 /// Values to ignore in the cost model when VF > 1. 
1964 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1965 1966 /// All element types found in the loop. 1967 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1968 1969 /// Profitable vector factors. 1970 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1971 }; 1972 } // end namespace llvm 1973 1974 /// Helper struct to manage generating runtime checks for vectorization. 1975 /// 1976 /// The runtime checks are created up-front in temporary blocks to allow better 1977 /// estimating the cost and un-linked from the existing IR. After deciding to 1978 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1979 /// temporary blocks are completely removed. 1980 class GeneratedRTChecks { 1981 /// Basic block which contains the generated SCEV checks, if any. 1982 BasicBlock *SCEVCheckBlock = nullptr; 1983 1984 /// The value representing the result of the generated SCEV checks. If it is 1985 /// nullptr, either no SCEV checks have been generated or they have been used. 1986 Value *SCEVCheckCond = nullptr; 1987 1988 /// Basic block which contains the generated memory runtime checks, if any. 1989 BasicBlock *MemCheckBlock = nullptr; 1990 1991 /// The value representing the result of the generated memory runtime checks. 1992 /// If it is nullptr, either no memory runtime checks have been generated or 1993 /// they have been used. 1994 Value *MemRuntimeCheckCond = nullptr; 1995 1996 DominatorTree *DT; 1997 LoopInfo *LI; 1998 1999 SCEVExpander SCEVExp; 2000 SCEVExpander MemCheckExp; 2001 2002 public: 2003 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 2004 const DataLayout &DL) 2005 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), 2006 MemCheckExp(SE, DL, "scev.check") {} 2007 2008 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 2009 /// accurately estimate the cost of the runtime checks. The blocks are 2010 /// un-linked from the IR and is added back during vector code generation. If 2011 /// there is no vector code generation, the check blocks are removed 2012 /// completely. 2013 void Create(Loop *L, const LoopAccessInfo &LAI, 2014 const SCEVUnionPredicate &UnionPred) { 2015 2016 BasicBlock *LoopHeader = L->getHeader(); 2017 BasicBlock *Preheader = L->getLoopPreheader(); 2018 2019 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 2020 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 2021 // may be used by SCEVExpander. The blocks will be un-linked from their 2022 // predecessors and removed from LI & DT at the end of the function. 2023 if (!UnionPred.isAlwaysTrue()) { 2024 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 2025 nullptr, "vector.scevcheck"); 2026 2027 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 2028 &UnionPred, SCEVCheckBlock->getTerminator()); 2029 } 2030 2031 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 2032 if (RtPtrChecking.Need) { 2033 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 2034 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 2035 "vector.memcheck"); 2036 2037 MemRuntimeCheckCond = 2038 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 2039 RtPtrChecking.getChecks(), MemCheckExp); 2040 assert(MemRuntimeCheckCond && 2041 "no RT checks generated although RtPtrChecking " 2042 "claimed checks are required"); 2043 } 2044 2045 if (!MemCheckBlock && !SCEVCheckBlock) 2046 return; 2047 2048 // Unhook the temporary block with the checks, update various places 2049 // accordingly. 
2050 if (SCEVCheckBlock) 2051 SCEVCheckBlock->replaceAllUsesWith(Preheader); 2052 if (MemCheckBlock) 2053 MemCheckBlock->replaceAllUsesWith(Preheader); 2054 2055 if (SCEVCheckBlock) { 2056 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2057 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 2058 Preheader->getTerminator()->eraseFromParent(); 2059 } 2060 if (MemCheckBlock) { 2061 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2062 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2063 Preheader->getTerminator()->eraseFromParent(); 2064 } 2065 2066 DT->changeImmediateDominator(LoopHeader, Preheader); 2067 if (MemCheckBlock) { 2068 DT->eraseNode(MemCheckBlock); 2069 LI->removeBlock(MemCheckBlock); 2070 } 2071 if (SCEVCheckBlock) { 2072 DT->eraseNode(SCEVCheckBlock); 2073 LI->removeBlock(SCEVCheckBlock); 2074 } 2075 } 2076 2077 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2078 /// unused. 2079 ~GeneratedRTChecks() { 2080 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 2081 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 2082 if (!SCEVCheckCond) 2083 SCEVCleaner.markResultUsed(); 2084 2085 if (!MemRuntimeCheckCond) 2086 MemCheckCleaner.markResultUsed(); 2087 2088 if (MemRuntimeCheckCond) { 2089 auto &SE = *MemCheckExp.getSE(); 2090 // Memory runtime check generation creates compares that use expanded 2091 // values. Remove them before running the SCEVExpanderCleaners. 2092 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2093 if (MemCheckExp.isInsertedInstruction(&I)) 2094 continue; 2095 SE.forgetValue(&I); 2096 I.eraseFromParent(); 2097 } 2098 } 2099 MemCheckCleaner.cleanup(); 2100 SCEVCleaner.cleanup(); 2101 2102 if (SCEVCheckCond) 2103 SCEVCheckBlock->eraseFromParent(); 2104 if (MemRuntimeCheckCond) 2105 MemCheckBlock->eraseFromParent(); 2106 } 2107 2108 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2109 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2110 /// depending on the generated condition. 2111 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2112 BasicBlock *LoopVectorPreHeader, 2113 BasicBlock *LoopExitBlock) { 2114 if (!SCEVCheckCond) 2115 return nullptr; 2116 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2117 if (C->isZero()) 2118 return nullptr; 2119 2120 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2121 2122 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2123 // Create new preheader for vector loop. 2124 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2125 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2126 2127 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2128 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2129 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2130 SCEVCheckBlock); 2131 2132 DT->addNewBlock(SCEVCheckBlock, Pred); 2133 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2134 2135 ReplaceInstWithInst( 2136 SCEVCheckBlock->getTerminator(), 2137 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2138 // Mark the check as used, to prevent it from being removed during cleanup. 2139 SCEVCheckCond = nullptr; 2140 return SCEVCheckBlock; 2141 } 2142 2143 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2144 /// the branches to branch to the vector preheader or \p Bypass, depending on 2145 /// the generated condition. 
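  /// The resulting control flow is, roughly:
  ///
  ///   Pred
  ///     |
  ///   MemCheckBlock --(conflict detected)--> Bypass (e.g. the scalar loop)
  ///     |
  ///   (no conflict)
  ///     |
  ///   LoopVectorPreHeader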
2146 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2147 BasicBlock *LoopVectorPreHeader) { 2148 // Check if we generated code that checks in runtime if arrays overlap. 2149 if (!MemRuntimeCheckCond) 2150 return nullptr; 2151 2152 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2153 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2154 MemCheckBlock); 2155 2156 DT->addNewBlock(MemCheckBlock, Pred); 2157 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2158 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2159 2160 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2161 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2162 2163 ReplaceInstWithInst( 2164 MemCheckBlock->getTerminator(), 2165 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2166 MemCheckBlock->getTerminator()->setDebugLoc( 2167 Pred->getTerminator()->getDebugLoc()); 2168 2169 // Mark the check as used, to prevent it from being removed during cleanup. 2170 MemRuntimeCheckCond = nullptr; 2171 return MemCheckBlock; 2172 } 2173 }; 2174 2175 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2176 // vectorization. The loop needs to be annotated with #pragma omp simd 2177 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2178 // vector length information is not provided, vectorization is not considered 2179 // explicit. Interleave hints are not allowed either. These limitations will be 2180 // relaxed in the future. 2181 // Please, note that we are currently forced to abuse the pragma 'clang 2182 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2183 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2184 // provides *explicit vectorization hints* (LV can bypass legal checks and 2185 // assume that vectorization is legal). However, both hints are implemented 2186 // using the same metadata (llvm.loop.vectorize, processed by 2187 // LoopVectorizeHints). This will be fixed in the future when the native IR 2188 // representation for pragma 'omp simd' is introduced. 2189 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2190 OptimizationRemarkEmitter *ORE) { 2191 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2192 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2193 2194 // Only outer loops with an explicit vectorization hint are supported. 2195 // Unannotated outer loops are ignored. 2196 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2197 return false; 2198 2199 Function *Fn = OuterLp->getHeader()->getParent(); 2200 if (!Hints.allowVectorization(Fn, OuterLp, 2201 true /*VectorizeOnlyWhenForced*/)) { 2202 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2203 return false; 2204 } 2205 2206 if (Hints.getInterleave() > 1) { 2207 // TODO: Interleave support is future work. 2208 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2209 "outer loops.\n"); 2210 Hints.emitRemarkWithHints(); 2211 return false; 2212 } 2213 2214 return true; 2215 } 2216 2217 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2218 OptimizationRemarkEmitter *ORE, 2219 SmallVectorImpl<Loop *> &V) { 2220 // Collect inner loops and outer loops without irreducible control flow. For 2221 // now, only collect outer loops that have explicit vectorization hints. If we 2222 // are stress testing the VPlan H-CFG construction, we collect the outermost 2223 // loop of every loop nest. 
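  // For example (assuming the VPlan-native path is enabled), in a nest such as
  //   #pragma clang loop vectorize(enable) vectorize_width(4)
  //   for (i = 0; i < N; ++i)      // explicitly annotated outer loop
  //     for (j = 0; j < M; ++j)
  //       A[i][j] = 0;
  // the annotated outer loop is collected; without such an annotation only
  // the innermost loop would be considered.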
2224 if (L.isInnermost() || VPlanBuildStressTest || 2225 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2226 LoopBlocksRPO RPOT(&L); 2227 RPOT.perform(LI); 2228 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2229 V.push_back(&L); 2230 // TODO: Collect inner loops inside marked outer loops in case 2231 // vectorization fails for the outer loop. Do not invoke 2232 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2233 // already known to be reducible. We can use an inherited attribute for 2234 // that. 2235 return; 2236 } 2237 } 2238 for (Loop *InnerL : L) 2239 collectSupportedLoops(*InnerL, LI, ORE, V); 2240 } 2241 2242 namespace { 2243 2244 /// The LoopVectorize Pass. 2245 struct LoopVectorize : public FunctionPass { 2246 /// Pass identification, replacement for typeid 2247 static char ID; 2248 2249 LoopVectorizePass Impl; 2250 2251 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2252 bool VectorizeOnlyWhenForced = false) 2253 : FunctionPass(ID), 2254 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2255 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2256 } 2257 2258 bool runOnFunction(Function &F) override { 2259 if (skipFunction(F)) 2260 return false; 2261 2262 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2263 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2264 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2265 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2266 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2267 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2268 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2269 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2270 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2271 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2272 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2273 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2274 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2275 2276 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2277 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2278 2279 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2280 GetLAA, *ORE, PSI).MadeAnyChange; 2281 } 2282 2283 void getAnalysisUsage(AnalysisUsage &AU) const override { 2284 AU.addRequired<AssumptionCacheTracker>(); 2285 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2286 AU.addRequired<DominatorTreeWrapperPass>(); 2287 AU.addRequired<LoopInfoWrapperPass>(); 2288 AU.addRequired<ScalarEvolutionWrapperPass>(); 2289 AU.addRequired<TargetTransformInfoWrapperPass>(); 2290 AU.addRequired<AAResultsWrapperPass>(); 2291 AU.addRequired<LoopAccessLegacyAnalysis>(); 2292 AU.addRequired<DemandedBitsWrapperPass>(); 2293 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2294 AU.addRequired<InjectTLIMappingsLegacy>(); 2295 2296 // We currently do not preserve loopinfo/dominator analyses with outer loop 2297 // vectorization. Until this is addressed, mark these analyses as preserved 2298 // only for non-VPlan-native path. 2299 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
2300 if (!EnableVPlanNativePath) { 2301 AU.addPreserved<LoopInfoWrapperPass>(); 2302 AU.addPreserved<DominatorTreeWrapperPass>(); 2303 } 2304 2305 AU.addPreserved<BasicAAWrapperPass>(); 2306 AU.addPreserved<GlobalsAAWrapperPass>(); 2307 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2308 } 2309 }; 2310 2311 } // end anonymous namespace 2312 2313 //===----------------------------------------------------------------------===// 2314 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2315 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2316 //===----------------------------------------------------------------------===// 2317 2318 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2319 // We need to place the broadcast of invariant variables outside the loop, 2320 // but only if it's proven safe to do so. Else, broadcast will be inside 2321 // vector loop body. 2322 Instruction *Instr = dyn_cast<Instruction>(V); 2323 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2324 (!Instr || 2325 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2326 // Place the code for broadcasting invariant variables in the new preheader. 2327 IRBuilder<>::InsertPointGuard Guard(Builder); 2328 if (SafeToHoist) 2329 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2330 2331 // Broadcast the scalar into all locations in the vector. 2332 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2333 2334 return Shuf; 2335 } 2336 2337 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2338 const InductionDescriptor &II, Value *Step, Value *Start, 2339 Instruction *EntryVal, VPValue *Def, VPTransformState &State) { 2340 IRBuilder<> &Builder = State.Builder; 2341 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2342 "Expected either an induction phi-node or a truncate of it!"); 2343 2344 // Construct the initial value of the vector IV in the vector loop preheader 2345 auto CurrIP = Builder.saveIP(); 2346 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2347 if (isa<TruncInst>(EntryVal)) { 2348 assert(Start->getType()->isIntegerTy() && 2349 "Truncation requires an integer type"); 2350 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2351 Step = Builder.CreateTrunc(Step, TruncType); 2352 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2353 } 2354 2355 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 2356 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 2357 Value *SteppedStart = 2358 getStepVector(SplatStart, Zero, Step, II.getInductionOpcode()); 2359 2360 // We create vector phi nodes for both integer and floating-point induction 2361 // variables. Here, we determine the kind of arithmetic we will perform. 2362 Instruction::BinaryOps AddOp; 2363 Instruction::BinaryOps MulOp; 2364 if (Step->getType()->isIntegerTy()) { 2365 AddOp = Instruction::Add; 2366 MulOp = Instruction::Mul; 2367 } else { 2368 AddOp = II.getInductionOpcode(); 2369 MulOp = Instruction::FMul; 2370 } 2371 2372 // Multiply the vectorization factor by the step using integer or 2373 // floating-point arithmetic as appropriate. 2374 Type *StepType = Step->getType(); 2375 Value *RuntimeVF; 2376 if (Step->getType()->isFloatingPointTy()) 2377 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 2378 else 2379 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 2380 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2381 2382 // Create a vector splat to use in the induction update. 
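  // As a concrete example (fixed VF = 4, UF = 2, integer step S): the vector
  // IV starts as <Start, Start+S, Start+2*S, Start+3*S>, SplatVF below is
  // splat(4 * S), and each unroll part advances by one such step, with the
  // final "step.add" feeding back into the PHI.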
2383 // 2384 // FIXME: If the step is non-constant, we create the vector splat with 2385 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2386 // handle a constant vector splat. 2387 Value *SplatVF = isa<Constant>(Mul) 2388 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 2389 : Builder.CreateVectorSplat(State.VF, Mul); 2390 Builder.restoreIP(CurrIP); 2391 2392 // We may need to add the step a number of times, depending on the unroll 2393 // factor. The last of those goes into the PHI. 2394 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2395 &*LoopVectorBody->getFirstInsertionPt()); 2396 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2397 Instruction *LastInduction = VecInd; 2398 for (unsigned Part = 0; Part < UF; ++Part) { 2399 State.set(Def, LastInduction, Part); 2400 2401 if (isa<TruncInst>(EntryVal)) 2402 addMetadata(LastInduction, EntryVal); 2403 2404 LastInduction = cast<Instruction>( 2405 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2406 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2407 } 2408 2409 // Move the last step to the end of the latch block. This ensures consistent 2410 // placement of all induction updates. 2411 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2412 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2413 auto *ICmp = cast<Instruction>(Br->getCondition()); 2414 LastInduction->moveBefore(ICmp); 2415 LastInduction->setName("vec.ind.next"); 2416 2417 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2418 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2419 } 2420 2421 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2422 return Cost->isScalarAfterVectorization(I, VF) || 2423 Cost->isProfitableToScalarize(I, VF); 2424 } 2425 2426 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2427 if (shouldScalarizeInstruction(IV)) 2428 return true; 2429 auto isScalarInst = [&](User *U) -> bool { 2430 auto *I = cast<Instruction>(U); 2431 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2432 }; 2433 return llvm::any_of(IV->users(), isScalarInst); 2434 } 2435 2436 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, 2437 const InductionDescriptor &ID, 2438 Value *Start, TruncInst *Trunc, 2439 VPValue *Def, 2440 VPTransformState &State) { 2441 IRBuilder<> &Builder = State.Builder; 2442 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2443 "Primary induction variable must have an integer type"); 2444 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2445 2446 // The value from the original loop to which we are mapping the new induction 2447 // variable. 2448 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2449 2450 auto &DL = EntryVal->getModule()->getDataLayout(); 2451 2452 // Generate code for the induction step. Note that induction steps are 2453 // required to be loop-invariant 2454 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2455 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2456 "Induction step should be loop invariant"); 2457 if (PSE.getSE()->isSCEVable(IV->getType())) { 2458 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2459 return Exp.expandCodeFor(Step, Step->getType(), 2460 State.CFG.VectorPreHeader->getTerminator()); 2461 } 2462 return cast<SCEVUnknown>(Step)->getValue(); 2463 }; 2464 2465 // The scalar value to broadcast. This is derived from the canonical 2466 // induction variable. 
If a truncation type is given, truncate the canonical 2467 // induction variable and step. Otherwise, derive these values from the 2468 // induction descriptor. 2469 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2470 Value *ScalarIV = Induction; 2471 if (IV != OldInduction) { 2472 ScalarIV = IV->getType()->isIntegerTy() 2473 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2474 : Builder.CreateCast(Instruction::SIToFP, Induction, 2475 IV->getType()); 2476 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2477 ScalarIV->setName("offset.idx"); 2478 } 2479 if (Trunc) { 2480 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2481 assert(Step->getType()->isIntegerTy() && 2482 "Truncation requires an integer step"); 2483 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2484 Step = Builder.CreateTrunc(Step, TruncType); 2485 } 2486 return ScalarIV; 2487 }; 2488 2489 // Create the vector values from the scalar IV, in the absence of creating a 2490 // vector IV. 2491 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2492 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2493 for (unsigned Part = 0; Part < UF; ++Part) { 2494 assert(!State.VF.isScalable() && "scalable vectors not yet supported."); 2495 Value *StartIdx; 2496 if (Step->getType()->isFloatingPointTy()) 2497 StartIdx = 2498 getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part); 2499 else 2500 StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part); 2501 2502 Value *EntryPart = 2503 getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode()); 2504 State.set(Def, EntryPart, Part); 2505 if (Trunc) 2506 addMetadata(EntryPart, Trunc); 2507 } 2508 }; 2509 2510 // Fast-math-flags propagate from the original induction instruction. 2511 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2512 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2513 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2514 2515 // Now do the actual transformations, and start with creating the step value. 2516 Value *Step = CreateStepValue(ID.getStep()); 2517 if (State.VF.isZero() || State.VF.isScalar()) { 2518 Value *ScalarIV = CreateScalarIV(Step); 2519 CreateSplatIV(ScalarIV, Step); 2520 return; 2521 } 2522 2523 // Determine if we want a scalar version of the induction variable. This is 2524 // true if the induction variable itself is not widened, or if it has at 2525 // least one user in the loop that is not widened. 2526 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2527 if (!NeedsScalarIV) { 2528 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); 2529 return; 2530 } 2531 2532 // Try to create a new independent vector induction variable. If we can't 2533 // create the phi node, we will splat the scalar induction variable in each 2534 // loop iteration. 2535 if (!shouldScalarizeInstruction(EntryVal)) { 2536 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); 2537 Value *ScalarIV = CreateScalarIV(Step); 2538 // Create scalar steps that can be used by instructions we will later 2539 // scalarize. Note that the addition of the scalar steps will not increase 2540 // the number of instructions in the loop in the common case prior to 2541 // InstCombine. We will be trading one vector extract for each scalar step. 2542 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); 2543 return; 2544 } 2545 2546 // All IV users are scalar instructions, so only emit a scalar IV, not a 2547 // vectorised IV. 
Except when we tail-fold, then the splat IV feeds the 2548 // predicate used by the masked loads/stores. 2549 Value *ScalarIV = CreateScalarIV(Step); 2550 if (!Cost->isScalarEpilogueAllowed()) 2551 CreateSplatIV(ScalarIV, Step); 2552 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); 2553 } 2554 2555 Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, 2556 Value *Step, 2557 Instruction::BinaryOps BinOp) { 2558 // Create and check the types. 2559 auto *ValVTy = cast<VectorType>(Val->getType()); 2560 ElementCount VLen = ValVTy->getElementCount(); 2561 2562 Type *STy = Val->getType()->getScalarType(); 2563 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2564 "Induction Step must be an integer or FP"); 2565 assert(Step->getType() == STy && "Step has wrong type"); 2566 2567 SmallVector<Constant *, 8> Indices; 2568 2569 // Create a vector of consecutive numbers from zero to VF. 2570 VectorType *InitVecValVTy = ValVTy; 2571 Type *InitVecValSTy = STy; 2572 if (STy->isFloatingPointTy()) { 2573 InitVecValSTy = 2574 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2575 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2576 } 2577 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2578 2579 // Splat the StartIdx 2580 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2581 2582 if (STy->isIntegerTy()) { 2583 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2584 Step = Builder.CreateVectorSplat(VLen, Step); 2585 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2586 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2587 // which can be found from the original scalar operations. 2588 Step = Builder.CreateMul(InitVec, Step); 2589 return Builder.CreateAdd(Val, Step, "induction"); 2590 } 2591 2592 // Floating point induction. 2593 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2594 "Binary Opcode should be specified for FP induction"); 2595 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2596 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2597 2598 Step = Builder.CreateVectorSplat(VLen, Step); 2599 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2600 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2601 } 2602 2603 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2604 Instruction *EntryVal, 2605 const InductionDescriptor &ID, 2606 VPValue *Def, 2607 VPTransformState &State) { 2608 IRBuilder<> &Builder = State.Builder; 2609 // We shouldn't have to build scalar steps if we aren't vectorizing. 2610 assert(State.VF.isVector() && "VF should be greater than one"); 2611 // Get the value type and ensure it and the step have the same integer type. 2612 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2613 assert(ScalarIVTy == Step->getType() && 2614 "Val and Step should have the same type"); 2615 2616 // We build scalar steps for both integer and floating-point induction 2617 // variables. Here, we determine the kind of arithmetic we will perform. 2618 Instruction::BinaryOps AddOp; 2619 Instruction::BinaryOps MulOp; 2620 if (ScalarIVTy->isIntegerTy()) { 2621 AddOp = Instruction::Add; 2622 MulOp = Instruction::Mul; 2623 } else { 2624 AddOp = ID.getInductionOpcode(); 2625 MulOp = Instruction::FMul; 2626 } 2627 2628 // Determine the number of scalars we need to generate for each unroll 2629 // iteration. If EntryVal is uniform, we only need to generate the first 2630 // lane. Otherwise, we generate all VF values. 
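  // For instance (fixed VF = 4, UF = 2, scalar IV with step S): a non-uniform
  // EntryVal needs 4 lane values per part, ScalarIV + (Part * 4 + Lane) * S
  // for Lane = 0..3, while a uniform EntryVal (e.g. an address used only by
  // consecutive accesses) only needs the lane-0 value of each part.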
2631 bool IsUniform = 2632 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF); 2633 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); 2634 // Compute the scalar steps and save the results in State. 2635 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2636 ScalarIVTy->getScalarSizeInBits()); 2637 Type *VecIVTy = nullptr; 2638 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2639 if (!IsUniform && State.VF.isScalable()) { 2640 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2641 UnitStepVec = 2642 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2643 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2644 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2645 } 2646 2647 for (unsigned Part = 0; Part < State.UF; ++Part) { 2648 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2649 2650 if (!IsUniform && State.VF.isScalable()) { 2651 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2652 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2653 if (ScalarIVTy->isFloatingPointTy()) 2654 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2655 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2656 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2657 State.set(Def, Add, Part); 2658 // It's useful to record the lane values too for the known minimum number 2659 // of elements so we do those below. This improves the code quality when 2660 // trying to extract the first element, for example. 2661 } 2662 2663 if (ScalarIVTy->isFloatingPointTy()) 2664 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2665 2666 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2667 Value *StartIdx = Builder.CreateBinOp( 2668 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2669 // The step returned by `createStepForVF` is a runtime-evaluated value 2670 // when VF is scalable. Otherwise, it should be folded into a Constant. 2671 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2672 "Expected StartIdx to be folded to a constant when VF is not " 2673 "scalable"); 2674 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2675 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2676 State.set(Def, Add, VPIteration(Part, Lane)); 2677 } 2678 } 2679 } 2680 2681 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2682 const VPIteration &Instance, 2683 VPTransformState &State) { 2684 Value *ScalarInst = State.get(Def, Instance); 2685 Value *VectorValue = State.get(Def, Instance.Part); 2686 VectorValue = Builder.CreateInsertElement( 2687 VectorValue, ScalarInst, 2688 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2689 State.set(Def, VectorValue, Instance.Part); 2690 } 2691 2692 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2693 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2694 return Builder.CreateVectorReverse(Vec, "reverse"); 2695 } 2696 2697 // Return whether we allow using masked interleave-groups (for dealing with 2698 // strided loads/stores that reside in predicated blocks, or for dealing 2699 // with gaps). 2700 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2701 // If an override option has been passed in for interleaved accesses, use it. 
2702 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2703 return EnableMaskedInterleavedMemAccesses; 2704 2705 return TTI.enableMaskedInterleavedAccessVectorization(); 2706 } 2707 2708 // Try to vectorize the interleave group that \p Instr belongs to. 2709 // 2710 // E.g. Translate following interleaved load group (factor = 3): 2711 // for (i = 0; i < N; i+=3) { 2712 // R = Pic[i]; // Member of index 0 2713 // G = Pic[i+1]; // Member of index 1 2714 // B = Pic[i+2]; // Member of index 2 2715 // ... // do something to R, G, B 2716 // } 2717 // To: 2718 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2719 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2720 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2721 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2722 // 2723 // Or translate following interleaved store group (factor = 3): 2724 // for (i = 0; i < N; i+=3) { 2725 // ... do something to R, G, B 2726 // Pic[i] = R; // Member of index 0 2727 // Pic[i+1] = G; // Member of index 1 2728 // Pic[i+2] = B; // Member of index 2 2729 // } 2730 // To: 2731 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2732 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2733 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2734 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2735 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2736 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2737 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2738 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2739 VPValue *BlockInMask) { 2740 Instruction *Instr = Group->getInsertPos(); 2741 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2742 2743 // Prepare for the vector type of the interleaved load/store. 2744 Type *ScalarTy = getLoadStoreType(Instr); 2745 unsigned InterleaveFactor = Group->getFactor(); 2746 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2747 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2748 2749 // Prepare for the new pointers. 2750 SmallVector<Value *, 2> AddrParts; 2751 unsigned Index = Group->getIndex(Instr); 2752 2753 // TODO: extend the masked interleaved-group support to reversed access. 2754 assert((!BlockInMask || !Group->isReverse()) && 2755 "Reversed masked interleave-group not supported."); 2756 2757 // If the group is reverse, adjust the index to refer to the last vector lane 2758 // instead of the first. We adjust the index from the first vector lane, 2759 // rather than directly getting the pointer for lane VF - 1, because the 2760 // pointer operand of the interleaved access is supposed to be uniform. For 2761 // uniform instructions, we're only required to generate a value for the 2762 // first vector lane in each unroll iteration. 2763 if (Group->isReverse()) 2764 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2765 2766 for (unsigned Part = 0; Part < UF; Part++) { 2767 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2768 setDebugLocFromInst(AddrPart); 2769 2770 // Notice current instruction could be any index. Need to adjust the address 2771 // to the member of index 0. 2772 // 2773 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2774 // b = A[i]; // Member of index 0 2775 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2776 // 2777 // E.g. 
A[i+1] = a; // Member of index 1 2778 // A[i] = b; // Member of index 0 2779 // A[i+2] = c; // Member of index 2 (Current instruction) 2780 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2781 2782 bool InBounds = false; 2783 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2784 InBounds = gep->isInBounds(); 2785 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2786 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2787 2788 // Cast to the vector pointer type. 2789 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2790 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2791 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2792 } 2793 2794 setDebugLocFromInst(Instr); 2795 Value *PoisonVec = PoisonValue::get(VecTy); 2796 2797 Value *MaskForGaps = nullptr; 2798 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2799 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2800 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2801 } 2802 2803 // Vectorize the interleaved load group. 2804 if (isa<LoadInst>(Instr)) { 2805 // For each unroll part, create a wide load for the group. 2806 SmallVector<Value *, 2> NewLoads; 2807 for (unsigned Part = 0; Part < UF; Part++) { 2808 Instruction *NewLoad; 2809 if (BlockInMask || MaskForGaps) { 2810 assert(useMaskedInterleavedAccesses(*TTI) && 2811 "masked interleaved groups are not allowed."); 2812 Value *GroupMask = MaskForGaps; 2813 if (BlockInMask) { 2814 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2815 Value *ShuffledMask = Builder.CreateShuffleVector( 2816 BlockInMaskPart, 2817 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2818 "interleaved.mask"); 2819 GroupMask = MaskForGaps 2820 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2821 MaskForGaps) 2822 : ShuffledMask; 2823 } 2824 NewLoad = 2825 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2826 GroupMask, PoisonVec, "wide.masked.vec"); 2827 } 2828 else 2829 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2830 Group->getAlign(), "wide.vec"); 2831 Group->addMetadata(NewLoad); 2832 NewLoads.push_back(NewLoad); 2833 } 2834 2835 // For each member in the group, shuffle out the appropriate data from the 2836 // wide loads. 2837 unsigned J = 0; 2838 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2839 Instruction *Member = Group->getMember(I); 2840 2841 // Skip the gaps in the group. 2842 if (!Member) 2843 continue; 2844 2845 auto StrideMask = 2846 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2847 for (unsigned Part = 0; Part < UF; Part++) { 2848 Value *StridedVec = Builder.CreateShuffleVector( 2849 NewLoads[Part], StrideMask, "strided.vec"); 2850 2851 // If this member has different type, cast the result type. 2852 if (Member->getType() != ScalarTy) { 2853 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2854 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2855 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2856 } 2857 2858 if (Group->isReverse()) 2859 StridedVec = reverseVector(StridedVec); 2860 2861 State.set(VPDefs[J], StridedVec, Part); 2862 } 2863 ++J; 2864 } 2865 return; 2866 } 2867 2868 // The sub vector type for current instruction. 2869 auto *SubVT = VectorType::get(ScalarTy, VF); 2870 2871 // Vectorize the interleaved store group. 
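  // Illustrative sketch (not from the original source): for a store group with
  // factor 3 whose member at index 2 is missing and a fixed VF = 4, the gap
  // mask computed below conceptually disables every third lane of the wide
  // store:
  //   <1,1,0, 1,1,0, 1,1,0, 1,1,0>
  // so only lanes corresponding to real members of the group are written.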
2872 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2873 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2874 "masked interleaved groups are not allowed."); 2875 assert((!MaskForGaps || !VF.isScalable()) && 2876 "masking gaps for scalable vectors is not yet supported."); 2877 for (unsigned Part = 0; Part < UF; Part++) { 2878 // Collect the stored vector from each member. 2879 SmallVector<Value *, 4> StoredVecs; 2880 for (unsigned i = 0; i < InterleaveFactor; i++) { 2881 assert((Group->getMember(i) || MaskForGaps) && 2882 "Fail to get a member from an interleaved store group"); 2883 Instruction *Member = Group->getMember(i); 2884 2885 // Skip the gaps in the group. 2886 if (!Member) { 2887 Value *Undef = PoisonValue::get(SubVT); 2888 StoredVecs.push_back(Undef); 2889 continue; 2890 } 2891 2892 Value *StoredVec = State.get(StoredValues[i], Part); 2893 2894 if (Group->isReverse()) 2895 StoredVec = reverseVector(StoredVec); 2896 2897 // If this member has different type, cast it to a unified type. 2898 2899 if (StoredVec->getType() != SubVT) 2900 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2901 2902 StoredVecs.push_back(StoredVec); 2903 } 2904 2905 // Concatenate all vectors into a wide vector. 2906 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2907 2908 // Interleave the elements in the wide vector. 2909 Value *IVec = Builder.CreateShuffleVector( 2910 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2911 "interleaved.vec"); 2912 2913 Instruction *NewStoreInstr; 2914 if (BlockInMask || MaskForGaps) { 2915 Value *GroupMask = MaskForGaps; 2916 if (BlockInMask) { 2917 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2918 Value *ShuffledMask = Builder.CreateShuffleVector( 2919 BlockInMaskPart, 2920 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2921 "interleaved.mask"); 2922 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2923 ShuffledMask, MaskForGaps) 2924 : ShuffledMask; 2925 } 2926 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2927 Group->getAlign(), GroupMask); 2928 } else 2929 NewStoreInstr = 2930 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2931 2932 Group->addMetadata(NewStoreInstr); 2933 } 2934 } 2935 2936 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2937 VPReplicateRecipe *RepRecipe, 2938 const VPIteration &Instance, 2939 bool IfPredicateInstr, 2940 VPTransformState &State) { 2941 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2942 2943 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2944 // the first lane and part. 2945 if (isa<NoAliasScopeDeclInst>(Instr)) 2946 if (!Instance.isFirstIteration()) 2947 return; 2948 2949 setDebugLocFromInst(Instr); 2950 2951 // Does this instruction return a value ? 2952 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2953 2954 Instruction *Cloned = Instr->clone(); 2955 if (!IsVoidRetTy) 2956 Cloned->setName(Instr->getName() + ".cloned"); 2957 2958 // If the scalarized instruction contributes to the address computation of a 2959 // widen masked load/store which was in a basic block that needed predication 2960 // and is not predicated after vectorization, we can't propagate 2961 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2962 // instruction could feed a poison value to the base address of the widen 2963 // load/store. 
2964 if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0) 2965 Cloned->dropPoisonGeneratingFlags(); 2966 2967 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 2968 Builder.GetInsertPoint()); 2969 // Replace the operands of the cloned instructions with their scalar 2970 // equivalents in the new loop. 2971 for (auto &I : enumerate(RepRecipe->operands())) { 2972 auto InputInstance = Instance; 2973 VPValue *Operand = I.value(); 2974 if (State.Plan->isUniformAfterVectorization(Operand)) 2975 InputInstance.Lane = VPLane::getFirstLane(); 2976 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2977 } 2978 addNewMetadata(Cloned, Instr); 2979 2980 // Place the cloned scalar in the new loop. 2981 Builder.Insert(Cloned); 2982 2983 State.set(RepRecipe, Cloned, Instance); 2984 2985 // If we just cloned a new assumption, add it the assumption cache. 2986 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2987 AC->registerAssumption(II); 2988 2989 // End if-block. 2990 if (IfPredicateInstr) 2991 PredicatedInstructions.push_back(Cloned); 2992 } 2993 2994 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2995 Value *End, Value *Step, 2996 Instruction *DL) { 2997 BasicBlock *Header = L->getHeader(); 2998 BasicBlock *Latch = L->getLoopLatch(); 2999 // As we're just creating this loop, it's possible no latch exists 3000 // yet. If so, use the header as this will be a single block loop. 3001 if (!Latch) 3002 Latch = Header; 3003 3004 IRBuilder<> B(&*Header->getFirstInsertionPt()); 3005 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3006 setDebugLocFromInst(OldInst, &B); 3007 auto *Induction = B.CreatePHI(Start->getType(), 2, "index"); 3008 3009 B.SetInsertPoint(Latch->getTerminator()); 3010 setDebugLocFromInst(OldInst, &B); 3011 3012 // Create i+1 and fill the PHINode. 3013 // 3014 // If the tail is not folded, we know that End - Start >= Step (either 3015 // statically or through the minimum iteration checks). We also know that both 3016 // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV + 3017 // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned 3018 // overflows and we can mark the induction increment as NUW. 3019 Value *Next = B.CreateAdd(Induction, Step, "index.next", 3020 /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false); 3021 Induction->addIncoming(Start, L->getLoopPreheader()); 3022 Induction->addIncoming(Next, Latch); 3023 // Create the compare. 3024 Value *ICmp = B.CreateICmpEQ(Next, End); 3025 B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3026 3027 // Now we have two terminators. Remove the old one from the block. 3028 Latch->getTerminator()->eraseFromParent(); 3029 3030 return Induction; 3031 } 3032 3033 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3034 if (TripCount) 3035 return TripCount; 3036 3037 assert(L && "Create Trip Count for null loop."); 3038 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3039 // Find the loop boundaries. 3040 ScalarEvolution *SE = PSE.getSE(); 3041 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3042 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3043 "Invalid loop count"); 3044 3045 Type *IdxTy = Legal->getWidestInductionType(); 3046 assert(IdxTy && "No type for induction"); 3047 3048 // The exit count might have the type of i64 while the phi is i32. This can 3049 // happen if we have an induction variable that is sign extended before the 3050 // compare. 
The only way that we get a backedge taken count is that the 3051 // induction variable was signed and as such will not overflow. In such a case 3052 // truncation is legal. 3053 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3054 IdxTy->getPrimitiveSizeInBits()) 3055 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3056 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3057 3058 // Get the total trip count from the count by adding 1. 3059 const SCEV *ExitCount = SE->getAddExpr( 3060 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3061 3062 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3063 3064 // Expand the trip count and place the new instructions in the preheader. 3065 // Notice that the pre-header does not change, only the loop body. 3066 SCEVExpander Exp(*SE, DL, "induction"); 3067 3068 // Count holds the overall loop count (N). 3069 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3070 L->getLoopPreheader()->getTerminator()); 3071 3072 if (TripCount->getType()->isPointerTy()) 3073 TripCount = 3074 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3075 L->getLoopPreheader()->getTerminator()); 3076 3077 return TripCount; 3078 } 3079 3080 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3081 if (VectorTripCount) 3082 return VectorTripCount; 3083 3084 Value *TC = getOrCreateTripCount(L); 3085 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3086 3087 Type *Ty = TC->getType(); 3088 // This is where we can make the step a runtime constant. 3089 Value *Step = createStepForVF(Builder, Ty, VF, UF); 3090 3091 // If the tail is to be folded by masking, round the number of iterations N 3092 // up to a multiple of Step instead of rounding down. This is done by first 3093 // adding Step-1 and then rounding down. Note that it's ok if this addition 3094 // overflows: the vector induction variable will eventually wrap to zero given 3095 // that it starts at zero and its Step is a power of two; the loop will then 3096 // exit, with the last early-exit vector comparison also producing all-true. 3097 if (Cost->foldTailByMasking()) { 3098 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3099 "VF*UF must be a power of 2 when folding tail by masking"); 3100 assert(!VF.isScalable() && 3101 "Tail folding not yet supported for scalable vectors"); 3102 TC = Builder.CreateAdd( 3103 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3104 } 3105 3106 // Now we need to generate the expression for the part of the loop that the 3107 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3108 // iterations are not required for correctness, or N - Step, otherwise. Step 3109 // is equal to the vectorization factor (number of SIMD elements) times the 3110 // unroll factor (number of SIMD instructions). 3111 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3112 3113 // There are cases where we *must* run at least one iteration in the remainder 3114 // loop. See the cost model for when this can happen. If the step evenly 3115 // divides the trip count, we set the remainder to be equal to the step. If 3116 // the step does not evenly divide the trip count, no adjustment is necessary 3117 // since there will already be scalar iterations. Note that the minimum 3118 // iterations check ensures that N >= Step. 
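  // Worked example (illustrative numbers, not from the original source): with
  // VF = 8 and UF = 2 the step is 16. For N = 103, n.mod.vf = 103 % 16 = 7 and
  // n.vec = 96, leaving 7 scalar iterations. If a scalar epilogue is required
  // and N = 96, the remainder of 0 is bumped to the step below, so n.vec
  // becomes 80 and 16 scalar iterations remain. When folding the tail by
  // masking, N = 103 is first rounded up and the vector loop handles all
  // iterations with n.vec = 112.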
3119 if (Cost->requiresScalarEpilogue(VF)) { 3120 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3121 R = Builder.CreateSelect(IsZero, Step, R); 3122 } 3123 3124 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3125 3126 return VectorTripCount; 3127 } 3128 3129 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3130 const DataLayout &DL) { 3131 // Verify that V is a vector type with same number of elements as DstVTy. 3132 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3133 unsigned VF = DstFVTy->getNumElements(); 3134 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3135 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3136 Type *SrcElemTy = SrcVecTy->getElementType(); 3137 Type *DstElemTy = DstFVTy->getElementType(); 3138 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3139 "Vector elements must have same size"); 3140 3141 // Do a direct cast if element types are castable. 3142 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3143 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3144 } 3145 // V cannot be directly casted to desired vector type. 3146 // May happen when V is a floating point vector but DstVTy is a vector of 3147 // pointers or vice-versa. Handle this using a two-step bitcast using an 3148 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3149 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3150 "Only one type should be a pointer type"); 3151 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3152 "Only one type should be a floating point type"); 3153 Type *IntTy = 3154 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3155 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3156 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3157 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3158 } 3159 3160 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3161 BasicBlock *Bypass) { 3162 Value *Count = getOrCreateTripCount(L); 3163 // Reuse existing vector loop preheader for TC checks. 3164 // Note that new preheader block is generated for vector loop. 3165 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3166 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3167 3168 // Generate code to check if the loop's trip count is less than VF * UF, or 3169 // equal to it in case a scalar epilogue is required; this implies that the 3170 // vector trip count is zero. This check also covers the case where adding one 3171 // to the backedge-taken count overflowed leading to an incorrect trip count 3172 // of zero. In this case we will also jump to the scalar loop. 3173 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3174 : ICmpInst::ICMP_ULT; 3175 3176 // If tail is to be folded, vector loop takes care of all iterations. 3177 Value *CheckMinIters = Builder.getFalse(); 3178 if (!Cost->foldTailByMasking()) { 3179 Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); 3180 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3181 } 3182 // Create new preheader for vector loop. 
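  // Illustrative shape of the check built above (not from the original source;
  // assumes an i64 trip count, VF = 8, UF = 2 and no tail folding):
  //   %min.iters.check = icmp ult i64 %count, 16   ; ule if a scalar epilogue
  //                                                ; is required
  //   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
  // The branch itself is only created below, once the new preheader exists.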
3183 LoopVectorPreHeader = 3184 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3185 "vector.ph"); 3186 3187 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3188 DT->getNode(Bypass)->getIDom()) && 3189 "TC check is expected to dominate Bypass"); 3190 3191 // Update dominator for Bypass & LoopExit (if needed). 3192 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3193 if (!Cost->requiresScalarEpilogue(VF)) 3194 // If there is an epilogue which must run, there's no edge from the 3195 // middle block to exit blocks and thus no need to update the immediate 3196 // dominator of the exit blocks. 3197 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3198 3199 ReplaceInstWithInst( 3200 TCCheckBlock->getTerminator(), 3201 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3202 LoopBypassBlocks.push_back(TCCheckBlock); 3203 } 3204 3205 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3206 3207 BasicBlock *const SCEVCheckBlock = 3208 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3209 if (!SCEVCheckBlock) 3210 return nullptr; 3211 3212 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3213 (OptForSizeBasedOnProfile && 3214 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3215 "Cannot SCEV check stride or overflow when optimizing for size"); 3216 3217 3218 // Update dominator only if this is first RT check. 3219 if (LoopBypassBlocks.empty()) { 3220 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3221 if (!Cost->requiresScalarEpilogue(VF)) 3222 // If there is an epilogue which must run, there's no edge from the 3223 // middle block to exit blocks and thus no need to update the immediate 3224 // dominator of the exit blocks. 3225 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3226 } 3227 3228 LoopBypassBlocks.push_back(SCEVCheckBlock); 3229 AddedSafetyChecks = true; 3230 return SCEVCheckBlock; 3231 } 3232 3233 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3234 BasicBlock *Bypass) { 3235 // VPlan-native path does not do any analysis for runtime checks currently. 3236 if (EnableVPlanNativePath) 3237 return nullptr; 3238 3239 BasicBlock *const MemCheckBlock = 3240 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3241 3242 // Check if we generated code that checks in runtime if arrays overlap. We put 3243 // the checks into a separate block to make the more common case of few 3244 // elements faster. 3245 if (!MemCheckBlock) 3246 return nullptr; 3247 3248 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3249 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3250 "Cannot emit memory checks when optimizing for size, unless forced " 3251 "to vectorize."); 3252 ORE->emit([&]() { 3253 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3254 L->getStartLoc(), L->getHeader()) 3255 << "Code-size may be reduced by not forcing " 3256 "vectorization, or by source-code modifications " 3257 "eliminating the need for runtime checks " 3258 "(e.g., adding 'restrict')."; 3259 }); 3260 } 3261 3262 LoopBypassBlocks.push_back(MemCheckBlock); 3263 3264 AddedSafetyChecks = true; 3265 3266 // We currently don't use LoopVersioning for the actual loop cloning but we 3267 // still use it to add the noalias metadata. 
3268 LVer = std::make_unique<LoopVersioning>( 3269 *Legal->getLAI(), 3270 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3271 DT, PSE.getSE()); 3272 LVer->prepareNoAliasMetadata(); 3273 return MemCheckBlock; 3274 } 3275 3276 Value *InnerLoopVectorizer::emitTransformedIndex( 3277 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3278 const InductionDescriptor &ID) const { 3279 3280 SCEVExpander Exp(*SE, DL, "induction"); 3281 auto Step = ID.getStep(); 3282 auto StartValue = ID.getStartValue(); 3283 assert(Index->getType()->getScalarType() == Step->getType() && 3284 "Index scalar type does not match StepValue type"); 3285 3286 // Note: the IR at this point is broken. We cannot use SE to create any new 3287 // SCEV and then expand it, hoping that SCEV's simplification will give us 3288 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3289 // lead to various SCEV crashes. So all we can do is to use builder and rely 3290 // on InstCombine for future simplifications. Here we handle some trivial 3291 // cases only. 3292 auto CreateAdd = [&B](Value *X, Value *Y) { 3293 assert(X->getType() == Y->getType() && "Types don't match!"); 3294 if (auto *CX = dyn_cast<ConstantInt>(X)) 3295 if (CX->isZero()) 3296 return Y; 3297 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3298 if (CY->isZero()) 3299 return X; 3300 return B.CreateAdd(X, Y); 3301 }; 3302 3303 // We allow X to be a vector type, in which case Y will potentially be 3304 // splatted into a vector with the same element count. 3305 auto CreateMul = [&B](Value *X, Value *Y) { 3306 assert(X->getType()->getScalarType() == Y->getType() && 3307 "Types don't match!"); 3308 if (auto *CX = dyn_cast<ConstantInt>(X)) 3309 if (CX->isOne()) 3310 return Y; 3311 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3312 if (CY->isOne()) 3313 return X; 3314 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 3315 if (XVTy && !isa<VectorType>(Y->getType())) 3316 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 3317 return B.CreateMul(X, Y); 3318 }; 3319 3320 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3321 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3322 // the DomTree is not kept up-to-date for additional blocks generated in the 3323 // vector loop. By using the header as insertion point, we guarantee that the 3324 // expanded instructions dominate all their uses. 
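  // Putting the helpers above and the cases below together, the result is
  // (illustrative, not from the original source): for an integer induction
  // with StartValue 5 and Step 3 the transformed index is 5 + 3 * Index; for
  // a pointer induction it is a GEP of StartValue by Step * Index elements;
  // for an FP induction the multiply and add/sub use the original fast-math
  // binary operator.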
3325 auto GetInsertPoint = [this, &B]() { 3326 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3327 if (InsertBB != LoopVectorBody && 3328 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3329 return LoopVectorBody->getTerminator(); 3330 return &*B.GetInsertPoint(); 3331 }; 3332 3333 switch (ID.getKind()) { 3334 case InductionDescriptor::IK_IntInduction: { 3335 assert(!isa<VectorType>(Index->getType()) && 3336 "Vector indices not supported for integer inductions yet"); 3337 assert(Index->getType() == StartValue->getType() && 3338 "Index type does not match StartValue type"); 3339 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3340 return B.CreateSub(StartValue, Index); 3341 auto *Offset = CreateMul( 3342 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3343 return CreateAdd(StartValue, Offset); 3344 } 3345 case InductionDescriptor::IK_PtrInduction: { 3346 assert(isa<SCEVConstant>(Step) && 3347 "Expected constant step for pointer induction"); 3348 return B.CreateGEP( 3349 ID.getElementType(), StartValue, 3350 CreateMul(Index, 3351 Exp.expandCodeFor(Step, Index->getType()->getScalarType(), 3352 GetInsertPoint()))); 3353 } 3354 case InductionDescriptor::IK_FpInduction: { 3355 assert(!isa<VectorType>(Index->getType()) && 3356 "Vector indices not supported for FP inductions yet"); 3357 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3358 auto InductionBinOp = ID.getInductionBinOp(); 3359 assert(InductionBinOp && 3360 (InductionBinOp->getOpcode() == Instruction::FAdd || 3361 InductionBinOp->getOpcode() == Instruction::FSub) && 3362 "Original bin op should be defined for FP induction"); 3363 3364 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3365 Value *MulExp = B.CreateFMul(StepValue, Index); 3366 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3367 "induction"); 3368 } 3369 case InductionDescriptor::IK_NoInduction: 3370 return nullptr; 3371 } 3372 llvm_unreachable("invalid enum"); 3373 } 3374 3375 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3376 LoopScalarBody = OrigLoop->getHeader(); 3377 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3378 assert(LoopVectorPreHeader && "Invalid loop structure"); 3379 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 3380 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && 3381 "multiple exit loop without required epilogue?"); 3382 3383 LoopMiddleBlock = 3384 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3385 LI, nullptr, Twine(Prefix) + "middle.block"); 3386 LoopScalarPreHeader = 3387 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3388 nullptr, Twine(Prefix) + "scalar.ph"); 3389 3390 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3391 3392 // Set up the middle block terminator. Two cases: 3393 // 1) If we know that we must execute the scalar epilogue, emit an 3394 // unconditional branch. 3395 // 2) Otherwise, we must have a single unique exit block (due to how we 3396 // implement the multiple exit case). In this case, set up a conditonal 3397 // branch from the middle block to the loop scalar preheader, and the 3398 // exit block. completeLoopSkeleton will update the condition to use an 3399 // iteration check, if required to decide whether to execute the remainder. 3400 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ? 
3401       BranchInst::Create(LoopScalarPreHeader) :
3402       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3403                          Builder.getTrue());
3404   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3405   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3406
3407   // We intentionally don't let SplitBlock update LoopInfo, since
3408   // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3409   // LoopVectorBody is explicitly added to the correct place a few lines later.
3410   LoopVectorBody =
3411       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3412                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3413
3414   // Update dominator for loop exit.
3415   if (!Cost->requiresScalarEpilogue(VF))
3416     // If there is an epilogue which must run, there's no edge from the
3417     // middle block to exit blocks and thus no need to update the immediate
3418     // dominator of the exit blocks.
3419     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3420
3421   // Create and register the new vector loop.
3422   Loop *Lp = LI->AllocateLoop();
3423   Loop *ParentLoop = OrigLoop->getParentLoop();
3424
3425   // Insert the new loop into the loop nest and register the new basic blocks
3426   // before calling any utilities such as SCEV that require valid LoopInfo.
3427   if (ParentLoop) {
3428     ParentLoop->addChildLoop(Lp);
3429   } else {
3430     LI->addTopLevelLoop(Lp);
3431   }
3432   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3433   return Lp;
3434 }
3435
3436 void InnerLoopVectorizer::createInductionResumeValues(
3437     Loop *L, Value *VectorTripCount,
3438     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3439   assert(VectorTripCount && L && "Expected valid arguments");
3440   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3441           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3442          "Inconsistent information about additional bypass.");
3443   // We are going to resume the execution of the scalar loop.
3444   // Go over all of the induction variables that we found and fix the
3445   // PHIs that are left in the scalar version of the loop.
3446   // The starting values of PHI nodes depend on the counter of the last
3447   // iteration in the vectorized loop.
3448   // If we come from a bypass edge then we need to start from the original
3449   // start value.
3450   for (auto &InductionEntry : Legal->getInductionVars()) {
3451     PHINode *OrigPhi = InductionEntry.first;
3452     InductionDescriptor II = InductionEntry.second;
3453
3454     // Create phi nodes to merge from the backedge-taken check block.
3455     PHINode *BCResumeVal =
3456         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3457                         LoopScalarPreHeader->getTerminator());
3458     // Copy the original phi's DebugLoc over to the new one.
3459     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3460     Value *&EndValue = IVEndValues[OrigPhi];
3461     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3462     if (OrigPhi == OldInduction) {
3463       // We know what the end value is.
3464       EndValue = VectorTripCount;
3465     } else {
3466       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3467
3468       // Fast-math-flags propagate from the original induction instruction.
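      // Illustrative example (made-up values, not from the original source):
      // for a secondary induction j = 7 + 3*i and a vector trip count of 96,
      // the end value computed below is 7 + 3*96 = 295; the scalar loop then
      // resumes with bc.resume.val = 295 when entered from the middle block,
      // or with the original start value 7 when entered from a bypass block.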
3469 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3470 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3471 3472 Type *StepType = II.getStep()->getType(); 3473 Instruction::CastOps CastOp = 3474 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3475 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3476 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3477 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3478 EndValue->setName("ind.end"); 3479 3480 // Compute the end value for the additional bypass (if applicable). 3481 if (AdditionalBypass.first) { 3482 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3483 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3484 StepType, true); 3485 CRD = 3486 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3487 EndValueFromAdditionalBypass = 3488 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3489 EndValueFromAdditionalBypass->setName("ind.end"); 3490 } 3491 } 3492 // The new PHI merges the original incoming value, in case of a bypass, 3493 // or the value at the end of the vectorized loop. 3494 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3495 3496 // Fix the scalar body counter (PHI node). 3497 // The old induction's phi node in the scalar body needs the truncated 3498 // value. 3499 for (BasicBlock *BB : LoopBypassBlocks) 3500 BCResumeVal->addIncoming(II.getStartValue(), BB); 3501 3502 if (AdditionalBypass.first) 3503 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3504 EndValueFromAdditionalBypass); 3505 3506 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3507 } 3508 } 3509 3510 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3511 MDNode *OrigLoopID) { 3512 assert(L && "Expected valid loop."); 3513 3514 // The trip counts should be cached by now. 3515 Value *Count = getOrCreateTripCount(L); 3516 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3517 3518 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3519 3520 // Add a check in the middle block to see if we have completed 3521 // all of the iterations in the first vector loop. Three cases: 3522 // 1) If we require a scalar epilogue, there is no conditional branch as 3523 // we unconditionally branch to the scalar preheader. Do nothing. 3524 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3525 // Thus if tail is to be folded, we know we don't need to run the 3526 // remainder and we can use the previous value for the condition (true). 3527 // 3) Otherwise, construct a runtime check. 3528 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3529 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3530 Count, VectorTripCount, "cmp.n", 3531 LoopMiddleBlock->getTerminator()); 3532 3533 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3534 // of the corresponding compare because they may have ended up with 3535 // different line numbers and we want to avoid awkward line stepping while 3536 // debugging. Eg. if the compare has got a line number inside the loop. 3537 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3538 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3539 } 3540 3541 // Get ready to start creating new instructions into the vectorized body. 
3542 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3543 "Inconsistent vector loop preheader"); 3544 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3545 3546 Optional<MDNode *> VectorizedLoopID = 3547 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3548 LLVMLoopVectorizeFollowupVectorized}); 3549 if (VectorizedLoopID.hasValue()) { 3550 L->setLoopID(VectorizedLoopID.getValue()); 3551 3552 // Do not setAlreadyVectorized if loop attributes have been defined 3553 // explicitly. 3554 return LoopVectorPreHeader; 3555 } 3556 3557 // Keep all loop hints from the original loop on the vector loop (we'll 3558 // replace the vectorizer-specific hints below). 3559 if (MDNode *LID = OrigLoop->getLoopID()) 3560 L->setLoopID(LID); 3561 3562 LoopVectorizeHints Hints(L, true, *ORE); 3563 Hints.setAlreadyVectorized(); 3564 3565 #ifdef EXPENSIVE_CHECKS 3566 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3567 LI->verify(*DT); 3568 #endif 3569 3570 return LoopVectorPreHeader; 3571 } 3572 3573 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3574 /* 3575 In this function we generate a new loop. The new loop will contain 3576 the vectorized instructions while the old loop will continue to run the 3577 scalar remainder. 3578 3579 [ ] <-- loop iteration number check. 3580 / | 3581 / v 3582 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3583 | / | 3584 | / v 3585 || [ ] <-- vector pre header. 3586 |/ | 3587 | v 3588 | [ ] \ 3589 | [ ]_| <-- vector loop. 3590 | | 3591 | v 3592 \ -[ ] <--- middle-block. 3593 \/ | 3594 /\ v 3595 | ->[ ] <--- new preheader. 3596 | | 3597 (opt) v <-- edge from middle to exit iff epilogue is not required. 3598 | [ ] \ 3599 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3600 \ | 3601 \ v 3602 >[ ] <-- exit block(s). 3603 ... 3604 */ 3605 3606 // Get the metadata of the original loop before it gets modified. 3607 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3608 3609 // Workaround! Compute the trip count of the original loop and cache it 3610 // before we start modifying the CFG. This code has a systemic problem 3611 // wherein it tries to run analysis over partially constructed IR; this is 3612 // wrong, and not simply for SCEV. The trip count of the original loop 3613 // simply happens to be prone to hitting this in practice. In theory, we 3614 // can hit the same issue for any SCEV, or ValueTracking query done during 3615 // mutation. See PR49900. 3616 getOrCreateTripCount(OrigLoop); 3617 3618 // Create an empty vector loop, and prepare basic blocks for the runtime 3619 // checks. 3620 Loop *Lp = createVectorLoopSkeleton(""); 3621 3622 // Now, compare the new count to zero. If it is zero skip the vector loop and 3623 // jump to the scalar loop. This check also covers the case where the 3624 // backedge-taken count is uint##_max: adding one to it will overflow leading 3625 // to an incorrect trip count of zero. In this (rare) case we will also jump 3626 // to the scalar loop. 3627 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3628 3629 // Generate the code to check any assumptions that we've made for SCEV 3630 // expressions. 3631 emitSCEVChecks(Lp, LoopScalarPreHeader); 3632 3633 // Generate the code that checks in runtime if arrays overlap. We put the 3634 // checks into a separate block to make the more common case of few elements 3635 // faster. 
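  // Illustrative sketch of such a check (not from the original source): for
  // two i32 arrays %a and %b accessed over %n iterations, the emitted block
  // conceptually tests
  //   %no.conflict = (%a + 4*%n <= %b) || (%b + 4*%n <= %a)
  // and branches to the scalar loop when the accessed ranges may overlap.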
3636 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3637 3638 // Some loops have a single integer induction variable, while other loops 3639 // don't. One example is c++ iterators that often have multiple pointer 3640 // induction variables. In the code below we also support a case where we 3641 // don't have a single induction variable. 3642 // 3643 // We try to obtain an induction variable from the original loop as hard 3644 // as possible. However if we don't find one that: 3645 // - is an integer 3646 // - counts from zero, stepping by one 3647 // - is the size of the widest induction variable type 3648 // then we create a new one. 3649 OldInduction = Legal->getPrimaryInduction(); 3650 Type *IdxTy = Legal->getWidestInductionType(); 3651 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3652 // The loop step is equal to the vectorization factor (num of SIMD elements) 3653 // times the unroll factor (num of SIMD instructions). 3654 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3655 Value *Step = createStepForVF(Builder, IdxTy, VF, UF); 3656 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3657 Induction = 3658 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3659 getDebugLocFromInstOrOperands(OldInduction)); 3660 3661 // Emit phis for the new starting index of the scalar loop. 3662 createInductionResumeValues(Lp, CountRoundDown); 3663 3664 return completeLoopSkeleton(Lp, OrigLoopID); 3665 } 3666 3667 // Fix up external users of the induction variable. At this point, we are 3668 // in LCSSA form, with all external PHIs that use the IV having one input value, 3669 // coming from the remainder loop. We need those PHIs to also have a correct 3670 // value for the IV when arriving directly from the middle block. 3671 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3672 const InductionDescriptor &II, 3673 Value *CountRoundDown, Value *EndValue, 3674 BasicBlock *MiddleBlock) { 3675 // There are two kinds of external IV usages - those that use the value 3676 // computed in the last iteration (the PHI) and those that use the penultimate 3677 // value (the value that feeds into the phi from the loop latch). 3678 // We allow both, but they, obviously, have different values. 3679 3680 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3681 3682 DenseMap<Value *, Value *> MissingVals; 3683 3684 // An external user of the last iteration's value should see the value that 3685 // the remainder loop uses to initialize its own IV. 3686 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3687 for (User *U : PostInc->users()) { 3688 Instruction *UI = cast<Instruction>(U); 3689 if (!OrigLoop->contains(UI)) { 3690 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3691 MissingVals[UI] = EndValue; 3692 } 3693 } 3694 3695 // An external user of the penultimate value need to see EndValue - Step. 3696 // The simplest way to get this is to recompute it from the constituent SCEVs, 3697 // that is Start + (Step * (CRD - 1)). 3698 for (User *U : OrigPhi->users()) { 3699 auto *UI = cast<Instruction>(U); 3700 if (!OrigLoop->contains(UI)) { 3701 const DataLayout &DL = 3702 OrigLoop->getHeader()->getModule()->getDataLayout(); 3703 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3704 3705 IRBuilder<> B(MiddleBlock->getTerminator()); 3706 3707 // Fast-math-flags propagate from the original induction instruction. 
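      // Worked example (illustrative numbers, not from the original source):
      // for an induction with start 0 and step 1 and CountRoundDown = 96, a
      // user of the penultimate value must see 0 + 1 * (96 - 1) = 95, i.e.
      // the value the phi held during the last iteration covered by the
      // vector loop.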
3708 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3709 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3710 3711 Value *CountMinusOne = B.CreateSub( 3712 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3713 Value *CMO = 3714 !II.getStep()->getType()->isIntegerTy() 3715 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3716 II.getStep()->getType()) 3717 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3718 CMO->setName("cast.cmo"); 3719 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3720 Escape->setName("ind.escape"); 3721 MissingVals[UI] = Escape; 3722 } 3723 } 3724 3725 for (auto &I : MissingVals) { 3726 PHINode *PHI = cast<PHINode>(I.first); 3727 // One corner case we have to handle is two IVs "chasing" each-other, 3728 // that is %IV2 = phi [...], [ %IV1, %latch ] 3729 // In this case, if IV1 has an external use, we need to avoid adding both 3730 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3731 // don't already have an incoming value for the middle block. 3732 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3733 PHI->addIncoming(I.second, MiddleBlock); 3734 } 3735 } 3736 3737 namespace { 3738 3739 struct CSEDenseMapInfo { 3740 static bool canHandle(const Instruction *I) { 3741 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3742 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3743 } 3744 3745 static inline Instruction *getEmptyKey() { 3746 return DenseMapInfo<Instruction *>::getEmptyKey(); 3747 } 3748 3749 static inline Instruction *getTombstoneKey() { 3750 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3751 } 3752 3753 static unsigned getHashValue(const Instruction *I) { 3754 assert(canHandle(I) && "Unknown instruction!"); 3755 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3756 I->value_op_end())); 3757 } 3758 3759 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3760 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3761 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3762 return LHS == RHS; 3763 return LHS->isIdenticalTo(RHS); 3764 } 3765 }; 3766 3767 } // end anonymous namespace 3768 3769 ///Perform cse of induction variable instructions. 3770 static void cse(BasicBlock *BB) { 3771 // Perform simple cse. 3772 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3773 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3774 if (!CSEDenseMapInfo::canHandle(&In)) 3775 continue; 3776 3777 // Check if we can replace this instruction with any of the 3778 // visited instructions. 3779 if (Instruction *V = CSEMap.lookup(&In)) { 3780 In.replaceAllUsesWith(V); 3781 In.eraseFromParent(); 3782 continue; 3783 } 3784 3785 CSEMap[&In] = &In; 3786 } 3787 } 3788 3789 InstructionCost 3790 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3791 bool &NeedToScalarize) const { 3792 Function *F = CI->getCalledFunction(); 3793 Type *ScalarRetTy = CI->getType(); 3794 SmallVector<Type *, 4> Tys, ScalarTys; 3795 for (auto &ArgOp : CI->args()) 3796 ScalarTys.push_back(ArgOp->getType()); 3797 3798 // Estimate cost of scalarized vector call. The source operands are assumed 3799 // to be vectors, so we need to extract individual elements from there, 3800 // execute VF scalar calls, and then gather the result into the vector return 3801 // value. 
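  // Worked example (illustrative costs, not from the original source): with
  // VF = 4, a scalar call cost of 10 and a scalarization overhead of 12, the
  // scalarized cost computed below is 4 * 10 + 12 = 52. If a vector variant
  // of the callee exists and costs, say, 20, NeedToScalarize is cleared and
  // 20 is returned instead.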
3802 InstructionCost ScalarCallCost = 3803 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3804 if (VF.isScalar()) 3805 return ScalarCallCost; 3806 3807 // Compute corresponding vector type for return value and arguments. 3808 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3809 for (Type *ScalarTy : ScalarTys) 3810 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3811 3812 // Compute costs of unpacking argument values for the scalar calls and 3813 // packing the return values to a vector. 3814 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3815 3816 InstructionCost Cost = 3817 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3818 3819 // If we can't emit a vector call for this function, then the currently found 3820 // cost is the cost we need to return. 3821 NeedToScalarize = true; 3822 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3823 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3824 3825 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3826 return Cost; 3827 3828 // If the corresponding vector cost is cheaper, return its cost. 3829 InstructionCost VectorCallCost = 3830 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3831 if (VectorCallCost < Cost) { 3832 NeedToScalarize = false; 3833 Cost = VectorCallCost; 3834 } 3835 return Cost; 3836 } 3837 3838 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3839 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3840 return Elt; 3841 return VectorType::get(Elt, VF); 3842 } 3843 3844 InstructionCost 3845 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3846 ElementCount VF) const { 3847 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3848 assert(ID && "Expected intrinsic call!"); 3849 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3850 FastMathFlags FMF; 3851 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3852 FMF = FPMO->getFastMathFlags(); 3853 3854 SmallVector<const Value *> Arguments(CI->args()); 3855 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3856 SmallVector<Type *> ParamTys; 3857 std::transform(FTy->param_begin(), FTy->param_end(), 3858 std::back_inserter(ParamTys), 3859 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3860 3861 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3862 dyn_cast<IntrinsicInst>(CI)); 3863 return TTI.getIntrinsicInstrCost(CostAttrs, 3864 TargetTransformInfo::TCK_RecipThroughput); 3865 } 3866 3867 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3868 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3869 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3870 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3871 } 3872 3873 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3874 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3875 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3876 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3877 } 3878 3879 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3880 // For every instruction `I` in MinBWs, truncate the operands, create a 3881 // truncated version of `I` and reextend its result. InstCombine runs 3882 // later and will remove any ext/trunc pairs. 
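  // Illustrative sketch (not from the original source): if MinBWs records
  // that an i32 add only needs 8 bits, then for
  //   %a = add <4 x i32> %x, %y
  // the code below emits roughly
  //   %x.tr  = trunc <4 x i32> %x to <4 x i8>
  //   %y.tr  = trunc <4 x i32> %y to <4 x i8>
  //   %a.tr  = add <4 x i8> %x.tr, %y.tr
  //   %a.ext = zext <4 x i8> %a.tr to <4 x i32>
  // and replaces all uses of %a with %a.ext.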
3883 SmallPtrSet<Value *, 4> Erased; 3884 for (const auto &KV : Cost->getMinimalBitwidths()) { 3885 // If the value wasn't vectorized, we must maintain the original scalar 3886 // type. The absence of the value from State indicates that it 3887 // wasn't vectorized. 3888 // FIXME: Should not rely on getVPValue at this point. 3889 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3890 if (!State.hasAnyVectorValue(Def)) 3891 continue; 3892 for (unsigned Part = 0; Part < UF; ++Part) { 3893 Value *I = State.get(Def, Part); 3894 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3895 continue; 3896 Type *OriginalTy = I->getType(); 3897 Type *ScalarTruncatedTy = 3898 IntegerType::get(OriginalTy->getContext(), KV.second); 3899 auto *TruncatedTy = VectorType::get( 3900 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3901 if (TruncatedTy == OriginalTy) 3902 continue; 3903 3904 IRBuilder<> B(cast<Instruction>(I)); 3905 auto ShrinkOperand = [&](Value *V) -> Value * { 3906 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3907 if (ZI->getSrcTy() == TruncatedTy) 3908 return ZI->getOperand(0); 3909 return B.CreateZExtOrTrunc(V, TruncatedTy); 3910 }; 3911 3912 // The actual instruction modification depends on the instruction type, 3913 // unfortunately. 3914 Value *NewI = nullptr; 3915 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3916 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3917 ShrinkOperand(BO->getOperand(1))); 3918 3919 // Any wrapping introduced by shrinking this operation shouldn't be 3920 // considered undefined behavior. So, we can't unconditionally copy 3921 // arithmetic wrapping flags to NewI. 3922 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3923 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3924 NewI = 3925 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3926 ShrinkOperand(CI->getOperand(1))); 3927 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3928 NewI = B.CreateSelect(SI->getCondition(), 3929 ShrinkOperand(SI->getTrueValue()), 3930 ShrinkOperand(SI->getFalseValue())); 3931 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3932 switch (CI->getOpcode()) { 3933 default: 3934 llvm_unreachable("Unhandled cast!"); 3935 case Instruction::Trunc: 3936 NewI = ShrinkOperand(CI->getOperand(0)); 3937 break; 3938 case Instruction::SExt: 3939 NewI = B.CreateSExtOrTrunc( 3940 CI->getOperand(0), 3941 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3942 break; 3943 case Instruction::ZExt: 3944 NewI = B.CreateZExtOrTrunc( 3945 CI->getOperand(0), 3946 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3947 break; 3948 } 3949 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3950 auto Elements0 = 3951 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3952 auto *O0 = B.CreateZExtOrTrunc( 3953 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3954 auto Elements1 = 3955 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3956 auto *O1 = B.CreateZExtOrTrunc( 3957 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3958 3959 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3960 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3961 // Don't do anything with the operands, just extend the result. 
3962 continue; 3963 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3964 auto Elements = 3965 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3966 auto *O0 = B.CreateZExtOrTrunc( 3967 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3968 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3969 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3970 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3971 auto Elements = 3972 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3973 auto *O0 = B.CreateZExtOrTrunc( 3974 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3975 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3976 } else { 3977 // If we don't know what to do, be conservative and don't do anything. 3978 continue; 3979 } 3980 3981 // Lastly, extend the result. 3982 NewI->takeName(cast<Instruction>(I)); 3983 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3984 I->replaceAllUsesWith(Res); 3985 cast<Instruction>(I)->eraseFromParent(); 3986 Erased.insert(I); 3987 State.reset(Def, Res, Part); 3988 } 3989 } 3990 3991 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3992 for (const auto &KV : Cost->getMinimalBitwidths()) { 3993 // If the value wasn't vectorized, we must maintain the original scalar 3994 // type. The absence of the value from State indicates that it 3995 // wasn't vectorized. 3996 // FIXME: Should not rely on getVPValue at this point. 3997 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3998 if (!State.hasAnyVectorValue(Def)) 3999 continue; 4000 for (unsigned Part = 0; Part < UF; ++Part) { 4001 Value *I = State.get(Def, Part); 4002 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 4003 if (Inst && Inst->use_empty()) { 4004 Value *NewI = Inst->getOperand(0); 4005 Inst->eraseFromParent(); 4006 State.reset(Def, NewI, Part); 4007 } 4008 } 4009 } 4010 } 4011 4012 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 4013 // Insert truncates and extends for any truncated instructions as hints to 4014 // InstCombine. 4015 if (VF.isVector()) 4016 truncateToMinimalBitwidths(State); 4017 4018 // Fix widened non-induction PHIs by setting up the PHI operands. 4019 if (OrigPHIsToFix.size()) { 4020 assert(EnableVPlanNativePath && 4021 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 4022 fixNonInductionPHIs(State); 4023 } 4024 4025 // At this point every instruction in the original loop is widened to a 4026 // vector form. Now we need to fix the recurrences in the loop. These PHI 4027 // nodes are currently empty because we did not want to introduce cycles. 4028 // This is the second stage of vectorizing recurrences. 4029 fixCrossIterationPHIs(State); 4030 4031 // Forget the original basic block. 4032 PSE.getSE()->forgetLoop(OrigLoop); 4033 4034 // If we inserted an edge from the middle block to the unique exit block, 4035 // update uses outside the loop (phis) to account for the newly inserted 4036 // edge. 4037 if (!Cost->requiresScalarEpilogue(VF)) { 4038 // Fix-up external users of the induction variables. 4039 for (auto &Entry : Legal->getInductionVars()) 4040 fixupIVUsers(Entry.first, Entry.second, 4041 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4042 IVEndValues[Entry.first], LoopMiddleBlock); 4043 4044 fixLCSSAPHIs(State); 4045 } 4046 4047 for (Instruction *PI : PredicatedInstructions) 4048 sinkScalarOperands(&*PI); 4049 4050 // Remove redundant induction instructions. 
4051 cse(LoopVectorBody); 4052 4053 // Set/update profile weights for the vector and remainder loops as original 4054 // loop iterations are now distributed among them. Note that original loop 4055 // represented by LoopScalarBody becomes remainder loop after vectorization. 4056 // 4057 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may 4058 // end up with a slightly less precise result, but that should be OK since 4059 // the profile is not inherently precise anyway. Note also that a possible bypass of 4060 // vector code caused by legality checks is ignored, assigning all the weight 4061 // to the vector loop, optimistically. 4062 // 4063 // For scalable vectorization we can't know at compile time how many iterations 4064 // of the loop are handled in one vector iteration, so instead assume a pessimistic 4065 // vscale of '1'. 4066 setProfileInfoAfterUnrolling( 4067 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 4068 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 4069 } 4070 4071 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 4072 // In order to support recurrences we need to be able to vectorize Phi nodes. 4073 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4074 // stage #2: We now need to fix the recurrences by adding incoming edges to 4075 // the currently empty PHI nodes. At this point every instruction in the 4076 // original loop is widened to a vector form so we can use them to construct 4077 // the incoming edges. 4078 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock(); 4079 for (VPRecipeBase &R : Header->phis()) { 4080 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 4081 fixReduction(ReductionPhi, State); 4082 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 4083 fixFirstOrderRecurrence(FOR, State); 4084 } 4085 } 4086 4087 void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, 4088 VPTransformState &State) { 4089 // This is the second phase of vectorizing first-order recurrences. An 4090 // overview of the transformation is described below. Suppose we have the 4091 // following loop. 4092 // 4093 // for (int i = 0; i < n; ++i) 4094 // b[i] = a[i] - a[i - 1]; 4095 // 4096 // There is a first-order recurrence on "a". For this loop, the shorthand 4097 // scalar IR looks like: 4098 // 4099 // scalar.ph: 4100 // s_init = a[-1] 4101 // br scalar.body 4102 // 4103 // scalar.body: 4104 // i = phi [0, scalar.ph], [i+1, scalar.body] 4105 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4106 // s2 = a[i] 4107 // b[i] = s2 - s1 4108 // br cond, scalar.body, ... 4109 // 4110 // In this example, s1 is a recurrence because its value depends on the 4111 // previous iteration. In the first phase of vectorization, we created a 4112 // vector phi v1 for s1. We now complete the vectorization and produce the 4113 // shorthand vector IR shown below (for VF = 4, UF = 1).
4114 // 4115 // vector.ph: 4116 // v_init = vector(..., ..., ..., a[-1]) 4117 // br vector.body 4118 // 4119 // vector.body 4120 // i = phi [0, vector.ph], [i+4, vector.body] 4121 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4122 // v2 = a[i, i+1, i+2, i+3]; 4123 // v3 = vector(v1(3), v2(0, 1, 2)) 4124 // b[i, i+1, i+2, i+3] = v2 - v3 4125 // br cond, vector.body, middle.block 4126 // 4127 // middle.block: 4128 // x = v2(3) 4129 // br scalar.ph 4130 // 4131 // scalar.ph: 4132 // s_init = phi [x, middle.block], [a[-1], otherwise] 4133 // br scalar.body 4134 // 4135 // After execution completes the vector loop, we extract the next value of 4136 // the recurrence (x) to use as the initial value in the scalar loop. 4137 4138 // Extract the last vector element in the middle block. This will be the 4139 // initial value for the recurrence when jumping to the scalar loop. 4140 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4141 Value *Incoming = State.get(PreviousDef, UF - 1); 4142 auto *ExtractForScalar = Incoming; 4143 auto *IdxTy = Builder.getInt32Ty(); 4144 if (VF.isVector()) { 4145 auto *One = ConstantInt::get(IdxTy, 1); 4146 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4147 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4148 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4149 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4150 "vector.recur.extract"); 4151 } 4152 // Extract the second last element in the middle block if the 4153 // Phi is used outside the loop. We need to extract the phi itself 4154 // and not the last element (the phi update in the current iteration). This 4155 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4156 // when the scalar loop is not run at all. 4157 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4158 if (VF.isVector()) { 4159 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4160 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4161 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4162 Incoming, Idx, "vector.recur.extract.for.phi"); 4163 } else if (UF > 1) 4164 // When loop is unrolled without vectorizing, initialize 4165 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4166 // of `Incoming`. This is analogous to the vectorized case above: extracting 4167 // the second last element when VF > 1. 4168 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4169 4170 // Fix the initial value of the original recurrence in the scalar loop. 4171 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4172 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 4173 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4174 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 4175 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4176 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4177 Start->addIncoming(Incoming, BB); 4178 } 4179 4180 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4181 Phi->setName("scalar.recur"); 4182 4183 // Finally, fix users of the recurrence outside the loop. The users will need 4184 // either the last value of the scalar recurrence or the last value of the 4185 // vector recurrence we extracted in the middle block. Since the loop is in 4186 // LCSSA form, we just need to find all the phi nodes for the original scalar 4187 // recurrence in the exit block, and then add an edge for the middle block. 
4188 // Note that LCSSA does not imply single entry when the original scalar loop 4189 // had multiple exiting edges (as we always run the last iteration in the 4190 // scalar epilogue); in that case, there is no edge from the middle block to the 4191 // exit block, and thus no phis need to be updated. 4192 if (!Cost->requiresScalarEpilogue(VF)) 4193 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4194 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) 4195 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4196 } 4197 4198 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, 4199 VPTransformState &State) { 4200 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 4201 // Get its reduction variable descriptor. 4202 assert(Legal->isReductionVariable(OrigPhi) && 4203 "Unable to find the reduction variable"); 4204 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 4205 4206 RecurKind RK = RdxDesc.getRecurrenceKind(); 4207 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4208 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4209 setDebugLocFromInst(ReductionStartValue); 4210 4211 VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); 4212 // This is the vector-clone of the value that leaves the loop. 4213 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 4214 4215 // Wrap flags are in general invalid after vectorization, clear them. 4216 clearReductionWrapFlags(RdxDesc, State); 4217 4218 // Before each round, move the insertion point right between 4219 // the PHIs and the values we are going to write. 4220 // This allows us to write both PHINodes and the extractelement 4221 // instructions. 4222 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4223 4224 setDebugLocFromInst(LoopExitInst); 4225 4226 Type *PhiTy = OrigPhi->getType(); 4227 // If tail is folded by masking, the vector value to leave the loop should be 4228 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4229 // instead of the former. For an inloop reduction the reduction will already 4230 // be predicated, and does not need to be handled here. 4231 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 4232 for (unsigned Part = 0; Part < UF; ++Part) { 4233 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4234 Value *Sel = nullptr; 4235 for (User *U : VecLoopExitInst->users()) { 4236 if (isa<SelectInst>(U)) { 4237 assert(!Sel && "Reduction exit feeding two selects"); 4238 Sel = U; 4239 } else 4240 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4241 } 4242 assert(Sel && "Reduction exit feeds no select"); 4243 State.reset(LoopExitInstDef, Sel, Part); 4244 4245 // If the target can create a predicated operator for the reduction at no 4246 // extra cost in the loop (for example a predicated vadd), it can be 4247 // cheaper for the select to remain in the loop than be sunk out of it, 4248 // and so use the select value for the phi instead of the old 4249 // LoopExitValue.
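// As a rough illustration (shorthand only, not the exact IR produced), when the
// target prefers a predicated reduction select the loop is left as:
//
//   vec.phi = phi [ <init>, vector.ph ], [ %sel, vector.body ]
//   %rdx    = add %vec.phi, %val
//   %sel    = select %mask, %rdx, %vec.phi
//
// which a cheap predicated add can implement directly, instead of sinking the
// select into the middle block.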
4250 if (PreferPredicatedReductionSelect || 4251 TTI->preferPredicatedReductionSelect( 4252 RdxDesc.getOpcode(), PhiTy, 4253 TargetTransformInfo::ReductionFlags())) { 4254 auto *VecRdxPhi = 4255 cast<PHINode>(State.get(PhiR, Part)); 4256 VecRdxPhi->setIncomingValueForBlock( 4257 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4258 } 4259 } 4260 } 4261 4262 // If the vector reduction can be performed in a smaller type, we truncate 4263 // then extend the loop exit value to enable InstCombine to evaluate the 4264 // entire expression in the smaller type. 4265 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4266 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4267 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4268 Builder.SetInsertPoint( 4269 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4270 VectorParts RdxParts(UF); 4271 for (unsigned Part = 0; Part < UF; ++Part) { 4272 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4273 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4274 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4275 : Builder.CreateZExt(Trunc, VecTy); 4276 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 4277 if (U != Trunc) { 4278 U->replaceUsesOfWith(RdxParts[Part], Extnd); 4279 RdxParts[Part] = Extnd; 4280 } 4281 } 4282 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4283 for (unsigned Part = 0; Part < UF; ++Part) { 4284 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4285 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4286 } 4287 } 4288 4289 // Reduce all of the unrolled parts into a single vector. 4290 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4291 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4292 4293 // The middle block terminator has already been assigned a DebugLoc here (the 4294 // OrigLoop's single latch terminator). We want the whole middle block to 4295 // appear to execute on this line because: (a) it is all compiler generated, 4296 // (b) these instructions are always executed after evaluating the latch 4297 // conditional branch, and (c) other passes may add new predecessors which 4298 // terminate on this line. This is the easiest way to ensure we don't 4299 // accidentally cause an extra step back into the loop while debugging. 4300 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4301 if (PhiR->isOrdered()) 4302 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4303 else { 4304 // Floating-point operations should have some FMF to enable the reduction. 4305 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4306 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4307 for (unsigned Part = 1; Part < UF; ++Part) { 4308 Value *RdxPart = State.get(LoopExitInstDef, Part); 4309 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4310 ReducedPartRdx = Builder.CreateBinOp( 4311 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4312 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4313 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4314 ReducedPartRdx, RdxPart); 4315 else 4316 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4317 } 4318 } 4319 4320 // Create the reduction after the loop. Note that inloop reductions create the 4321 // target reduction in the loop using a Reduction recipe. 
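// Illustrative shorthand (assuming an integer add reduction and a fixed VF of
// 4): the middle block then ends up with something like
//
//   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %vec.rdx)
//
// followed, when the reduction was narrowed, by a sext/zext back to the
// original phi type as done below.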
4322 if (VF.isVector() && !PhiR->isInLoop()) { 4323 ReducedPartRdx = 4324 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 4325 // If the reduction can be performed in a smaller type, we need to extend 4326 // the reduction to the wider type before we branch to the original loop. 4327 if (PhiTy != RdxDesc.getRecurrenceType()) 4328 ReducedPartRdx = RdxDesc.isSigned() 4329 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4330 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4331 } 4332 4333 // Create a phi node that merges control-flow from the backedge-taken check 4334 // block and the middle block. 4335 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4336 LoopScalarPreHeader->getTerminator()); 4337 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4338 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4339 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4340 4341 // Now, we need to fix the users of the reduction variable 4342 // inside and outside of the scalar remainder loop. 4343 4344 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4345 // in the exit blocks. See comment on analogous loop in 4346 // fixFirstOrderRecurrence for a more complete explaination of the logic. 4347 if (!Cost->requiresScalarEpilogue(VF)) 4348 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4349 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) 4350 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4351 4352 // Fix the scalar loop reduction variable with the incoming reduction sum 4353 // from the vector body and from the backedge value. 4354 int IncomingEdgeBlockIdx = 4355 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4356 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4357 // Pick the other block. 4358 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4359 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4360 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4361 } 4362 4363 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 4364 VPTransformState &State) { 4365 RecurKind RK = RdxDesc.getRecurrenceKind(); 4366 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4367 return; 4368 4369 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4370 assert(LoopExitInstr && "null loop exit instruction"); 4371 SmallVector<Instruction *, 8> Worklist; 4372 SmallPtrSet<Instruction *, 8> Visited; 4373 Worklist.push_back(LoopExitInstr); 4374 Visited.insert(LoopExitInstr); 4375 4376 while (!Worklist.empty()) { 4377 Instruction *Cur = Worklist.pop_back_val(); 4378 if (isa<OverflowingBinaryOperator>(Cur)) 4379 for (unsigned Part = 0; Part < UF; ++Part) { 4380 // FIXME: Should not rely on getVPValue at this point. 4381 Value *V = State.get(State.Plan->getVPValue(Cur, true), Part); 4382 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4383 } 4384 4385 for (User *U : Cur->users()) { 4386 Instruction *UI = cast<Instruction>(U); 4387 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4388 Visited.insert(UI).second) 4389 Worklist.push_back(UI); 4390 } 4391 } 4392 } 4393 4394 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4395 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4396 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4397 // Some phis were already hand updated by the reduction and recurrence 4398 // code above, leave them alone. 
4399 continue; 4400 4401 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4402 // Non-instruction incoming values will have only one value. 4403 4404 VPLane Lane = VPLane::getFirstLane(); 4405 if (isa<Instruction>(IncomingValue) && 4406 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4407 VF)) 4408 Lane = VPLane::getLastLaneForVF(VF); 4409 4410 // Can be a loop invariant incoming value or the last scalar value to be 4411 // extracted from the vectorized loop. 4412 // FIXME: Should not rely on getVPValue at this point. 4413 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4414 Value *lastIncomingValue = 4415 OrigLoop->isLoopInvariant(IncomingValue) 4416 ? IncomingValue 4417 : State.get(State.Plan->getVPValue(IncomingValue, true), 4418 VPIteration(UF - 1, Lane)); 4419 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4420 } 4421 } 4422 4423 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4424 // The basic block and loop containing the predicated instruction. 4425 auto *PredBB = PredInst->getParent(); 4426 auto *VectorLoop = LI->getLoopFor(PredBB); 4427 4428 // Initialize a worklist with the operands of the predicated instruction. 4429 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4430 4431 // Holds instructions that we need to analyze again. An instruction may be 4432 // reanalyzed if we don't yet know if we can sink it or not. 4433 SmallVector<Instruction *, 8> InstsToReanalyze; 4434 4435 // Returns true if a given use occurs in the predicated block. Phi nodes use 4436 // their operands in their corresponding predecessor blocks. 4437 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4438 auto *I = cast<Instruction>(U.getUser()); 4439 BasicBlock *BB = I->getParent(); 4440 if (auto *Phi = dyn_cast<PHINode>(I)) 4441 BB = Phi->getIncomingBlock( 4442 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4443 return BB == PredBB; 4444 }; 4445 4446 // Iteratively sink the scalarized operands of the predicated instruction 4447 // into the block we created for it. When an instruction is sunk, it's 4448 // operands are then added to the worklist. The algorithm ends after one pass 4449 // through the worklist doesn't sink a single instruction. 4450 bool Changed; 4451 do { 4452 // Add the instructions that need to be reanalyzed to the worklist, and 4453 // reset the changed indicator. 4454 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4455 InstsToReanalyze.clear(); 4456 Changed = false; 4457 4458 while (!Worklist.empty()) { 4459 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4460 4461 // We can't sink an instruction if it is a phi node, is not in the loop, 4462 // or may have side effects. 4463 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4464 I->mayHaveSideEffects()) 4465 continue; 4466 4467 // If the instruction is already in PredBB, check if we can sink its 4468 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4469 // sinking the scalar instruction I, hence it appears in PredBB; but it 4470 // may have failed to sink I's operands (recursively), which we try 4471 // (again) here. 4472 if (I->getParent() == PredBB) { 4473 Worklist.insert(I->op_begin(), I->op_end()); 4474 continue; 4475 } 4476 4477 // It's legal to sink the instruction if all its uses occur in the 4478 // predicated block. Otherwise, there's nothing to do yet, and we may 4479 // need to reanalyze the instruction. 
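// For illustration: a scalarized address computation such as
//   %gep = getelementptr i32, i32* %base, i64 %idx
// whose only use is a predicated store in PredBB can be moved into PredBB; if
// it also has a use outside PredBB, it is queued for reanalysis in case that
// user gets sunk on a later pass over the worklist.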
4480 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4481 InstsToReanalyze.push_back(I); 4482 continue; 4483 } 4484 4485 // Move the instruction to the beginning of the predicated block, and add 4486 // it's operands to the worklist. 4487 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4488 Worklist.insert(I->op_begin(), I->op_end()); 4489 4490 // The sinking may have enabled other instructions to be sunk, so we will 4491 // need to iterate. 4492 Changed = true; 4493 } 4494 } while (Changed); 4495 } 4496 4497 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4498 for (PHINode *OrigPhi : OrigPHIsToFix) { 4499 VPWidenPHIRecipe *VPPhi = 4500 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4501 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4502 // Make sure the builder has a valid insert point. 4503 Builder.SetInsertPoint(NewPhi); 4504 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4505 VPValue *Inc = VPPhi->getIncomingValue(i); 4506 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4507 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4508 } 4509 } 4510 } 4511 4512 bool InnerLoopVectorizer::useOrderedReductions( 4513 const RecurrenceDescriptor &RdxDesc) { 4514 return Cost->useOrderedReductions(RdxDesc); 4515 } 4516 4517 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4518 VPWidenPHIRecipe *PhiR, 4519 VPTransformState &State) { 4520 PHINode *P = cast<PHINode>(PN); 4521 if (EnableVPlanNativePath) { 4522 // Currently we enter here in the VPlan-native path for non-induction 4523 // PHIs where all control flow is uniform. We simply widen these PHIs. 4524 // Create a vector phi with no operands - the vector phi operands will be 4525 // set at the end of vector code generation. 4526 Type *VecTy = (State.VF.isScalar()) 4527 ? PN->getType() 4528 : VectorType::get(PN->getType(), State.VF); 4529 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4530 State.set(PhiR, VecPhi, 0); 4531 OrigPHIsToFix.push_back(P); 4532 4533 return; 4534 } 4535 4536 assert(PN->getParent() == OrigLoop->getHeader() && 4537 "Non-header phis should have been handled elsewhere"); 4538 4539 // In order to support recurrences we need to be able to vectorize Phi nodes. 4540 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4541 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4542 // this value when we vectorize all of the instructions that use the PHI. 4543 4544 assert(!Legal->isReductionVariable(P) && 4545 "reductions should be handled elsewhere"); 4546 4547 setDebugLocFromInst(P); 4548 4549 // This PHINode must be an induction variable. 4550 // Make sure that we know about it. 4551 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4552 4553 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4554 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4555 4556 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4557 // which can be found from the original scalar operations. 4558 switch (II.getKind()) { 4559 case InductionDescriptor::IK_NoInduction: 4560 llvm_unreachable("Unknown induction"); 4561 case InductionDescriptor::IK_IntInduction: 4562 case InductionDescriptor::IK_FpInduction: 4563 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4564 case InductionDescriptor::IK_PtrInduction: { 4565 // Handle the pointer induction variable case. 
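// Rough illustration only: for a pointer IV with step S, the scalarized path
// below emits one address per part and lane, at element offset
// (i + Part * VF + Lane) * S from the start value, while the widened path
// further down keeps a single pointer phi stepped by VF * UF * S elements per
// vector iteration and materializes per-part vector GEPs from it.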
4566 assert(P->getType()->isPointerTy() && "Unexpected type."); 4567 4568 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4569 // This is the normalized GEP that starts counting at zero. 4570 Value *PtrInd = 4571 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4572 // Determine the number of scalars we need to generate for each unroll 4573 // iteration. If the instruction is uniform, we only need to generate the 4574 // first lane. Otherwise, we generate all VF values. 4575 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4576 assert((IsUniform || !State.VF.isScalable()) && 4577 "Cannot scalarize a scalable VF"); 4578 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 4579 4580 for (unsigned Part = 0; Part < UF; ++Part) { 4581 Value *PartStart = 4582 createStepForVF(Builder, PtrInd->getType(), VF, Part); 4583 4584 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4585 Value *Idx = Builder.CreateAdd( 4586 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4587 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4588 Value *SclrGep = 4589 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4590 SclrGep->setName("next.gep"); 4591 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4592 } 4593 } 4594 return; 4595 } 4596 assert(isa<SCEVConstant>(II.getStep()) && 4597 "Induction step not a SCEV constant!"); 4598 Type *PhiType = II.getStep()->getType(); 4599 4600 // Build a pointer phi 4601 Value *ScalarStartValue = II.getStartValue(); 4602 Type *ScStValueType = ScalarStartValue->getType(); 4603 PHINode *NewPointerPhi = 4604 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4605 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4606 4607 // A pointer induction, performed by using a gep 4608 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4609 Instruction *InductionLoc = LoopLatch->getTerminator(); 4610 const SCEV *ScalarStep = II.getStep(); 4611 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4612 Value *ScalarStepValue = 4613 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4614 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4615 Value *NumUnrolledElems = 4616 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4617 Value *InductionGEP = GetElementPtrInst::Create( 4618 II.getElementType(), NewPointerPhi, 4619 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4620 InductionLoc); 4621 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4622 4623 // Create UF many actual address geps that use the pointer 4624 // phi as base and a vectorized version of the step value 4625 // (<step*0, ..., step*N>) as offset. 4626 for (unsigned Part = 0; Part < State.UF; ++Part) { 4627 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4628 Value *StartOffsetScalar = 4629 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4630 Value *StartOffset = 4631 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4632 // Create a vector of consecutive numbers from zero to VF. 
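// Illustrative example (fixed VF = 4, Part = 1): RuntimeVF is 4, so
// StartOffsetScalar is 4 and the offsets below become <4, 5, 6, 7>, which are
// then scaled by the scalar step to form the per-lane addresses.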
4633 StartOffset = 4634 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4635 4636 Value *GEP = Builder.CreateGEP( 4637 II.getElementType(), NewPointerPhi, 4638 Builder.CreateMul( 4639 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4640 "vector.gep")); 4641 State.set(PhiR, GEP, Part); 4642 } 4643 } 4644 } 4645 } 4646 4647 /// A helper function for checking whether an integer division-related 4648 /// instruction may divide by zero (in which case it must be predicated if 4649 /// executed conditionally in the scalar code). 4650 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4651 /// Non-zero divisors that are non compile-time constants will not be 4652 /// converted into multiplication, so we will still end up scalarizing 4653 /// the division, but can do so w/o predication. 4654 static bool mayDivideByZero(Instruction &I) { 4655 assert((I.getOpcode() == Instruction::UDiv || 4656 I.getOpcode() == Instruction::SDiv || 4657 I.getOpcode() == Instruction::URem || 4658 I.getOpcode() == Instruction::SRem) && 4659 "Unexpected instruction"); 4660 Value *Divisor = I.getOperand(1); 4661 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4662 return !CInt || CInt->isZero(); 4663 } 4664 4665 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4666 VPUser &ArgOperands, 4667 VPTransformState &State) { 4668 assert(!isa<DbgInfoIntrinsic>(I) && 4669 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4670 setDebugLocFromInst(&I); 4671 4672 Module *M = I.getParent()->getParent()->getParent(); 4673 auto *CI = cast<CallInst>(&I); 4674 4675 SmallVector<Type *, 4> Tys; 4676 for (Value *ArgOperand : CI->args()) 4677 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4678 4679 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4680 4681 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4682 // version of the instruction. 4683 // Is it beneficial to perform intrinsic call compared to lib call? 4684 bool NeedToScalarize = false; 4685 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4686 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4687 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4688 assert((UseVectorIntrinsic || !NeedToScalarize) && 4689 "Instruction should be scalarized elsewhere."); 4690 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4691 "Either the intrinsic cost or vector call cost must be valid"); 4692 4693 for (unsigned Part = 0; Part < UF; ++Part) { 4694 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4695 SmallVector<Value *, 4> Args; 4696 for (auto &I : enumerate(ArgOperands.operands())) { 4697 // Some intrinsics have a scalar argument - don't replace it with a 4698 // vector. 4699 Value *Arg; 4700 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4701 Arg = State.get(I.value(), Part); 4702 else { 4703 Arg = State.get(I.value(), VPIteration(0, 0)); 4704 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 4705 TysForDecl.push_back(Arg->getType()); 4706 } 4707 Args.push_back(Arg); 4708 } 4709 4710 Function *VectorF; 4711 if (UseVectorIntrinsic) { 4712 // Use vector version of the intrinsic. 
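// For example (illustrative): a scalar call to llvm.fabs.f32 is widened here
// to llvm.fabs.v4f32 for VF = 4, by overloading the declaration on the vector
// return type collected in TysForDecl.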
4713 if (VF.isVector()) 4714 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4715 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4716 assert(VectorF && "Can't retrieve vector intrinsic."); 4717 } else { 4718 // Use vector version of the function call. 4719 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4720 #ifndef NDEBUG 4721 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4722 "Can't create vector function."); 4723 #endif 4724 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4725 } 4726 SmallVector<OperandBundleDef, 1> OpBundles; 4727 CI->getOperandBundlesAsDefs(OpBundles); 4728 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4729 4730 if (isa<FPMathOperator>(V)) 4731 V->copyFastMathFlags(CI); 4732 4733 State.set(Def, V, Part); 4734 addMetadata(V, &I); 4735 } 4736 } 4737 4738 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4739 // We should not collect Scalars more than once per VF. Right now, this 4740 // function is called from collectUniformsAndScalars(), which already does 4741 // this check. Collecting Scalars for VF=1 does not make any sense. 4742 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4743 "This function should not be visited twice for the same VF"); 4744 4745 SmallSetVector<Instruction *, 8> Worklist; 4746 4747 // These sets are used to seed the analysis with pointers used by memory 4748 // accesses that will remain scalar. 4749 SmallSetVector<Instruction *, 8> ScalarPtrs; 4750 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4751 auto *Latch = TheLoop->getLoopLatch(); 4752 4753 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4754 // The pointer operands of loads and stores will be scalar as long as the 4755 // memory access is not a gather or scatter operation. The value operand of a 4756 // store will remain scalar if the store is scalarized. 4757 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4758 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4759 assert(WideningDecision != CM_Unknown && 4760 "Widening decision should be ready at this moment"); 4761 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4762 if (Ptr == Store->getValueOperand()) 4763 return WideningDecision == CM_Scalarize; 4764 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4765 "Ptr is neither a value or pointer operand"); 4766 return WideningDecision != CM_GatherScatter; 4767 }; 4768 4769 // A helper that returns true if the given value is a bitcast or 4770 // getelementptr instruction contained in the loop. 4771 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4772 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4773 isa<GetElementPtrInst>(V)) && 4774 !TheLoop->isLoopInvariant(V); 4775 }; 4776 4777 // A helper that evaluates a memory access's use of a pointer. If the use will 4778 // be a scalar use and the pointer is only used by memory accesses, we place 4779 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4780 // PossibleNonScalarPtrs. 4781 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4782 // We only care about bitcast and getelementptr instructions contained in 4783 // the loop. 4784 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4785 return; 4786 4787 // If the pointer has already been identified as scalar (e.g., if it was 4788 // also identified as uniform), there's nothing to do. 
4789 auto *I = cast<Instruction>(Ptr); 4790 if (Worklist.count(I)) 4791 return; 4792 4793 // If the use of the pointer will be a scalar use, and all users of the 4794 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4795 // place the pointer in PossibleNonScalarPtrs. 4796 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4797 return isa<LoadInst>(U) || isa<StoreInst>(U); 4798 })) 4799 ScalarPtrs.insert(I); 4800 else 4801 PossibleNonScalarPtrs.insert(I); 4802 }; 4803 4804 // We seed the scalars analysis with two classes of instructions: (1) 4805 // instructions marked uniform-after-vectorization and (2) bitcast, 4806 // getelementptr and (pointer) phi instructions used by memory accesses 4807 // requiring a scalar use. 4808 // 4809 // (1) Add to the worklist all instructions that have been identified as 4810 // uniform-after-vectorization. 4811 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4812 4813 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4814 // memory accesses requiring a scalar use. The pointer operands of loads and 4815 // stores will be scalar as long as the memory access is not a gather or 4816 // scatter operation. The value operand of a store will remain scalar if the 4817 // store is scalarized. 4818 for (auto *BB : TheLoop->blocks()) 4819 for (auto &I : *BB) { 4820 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4821 evaluatePtrUse(Load, Load->getPointerOperand()); 4822 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4823 evaluatePtrUse(Store, Store->getPointerOperand()); 4824 evaluatePtrUse(Store, Store->getValueOperand()); 4825 } 4826 } 4827 for (auto *I : ScalarPtrs) 4828 if (!PossibleNonScalarPtrs.count(I)) { 4829 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4830 Worklist.insert(I); 4831 } 4832 4833 // Insert the forced scalars. 4834 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4835 // induction variable when the PHI user is scalarized. 4836 auto ForcedScalar = ForcedScalars.find(VF); 4837 if (ForcedScalar != ForcedScalars.end()) 4838 for (auto *I : ForcedScalar->second) 4839 Worklist.insert(I); 4840 4841 // Expand the worklist by looking through any bitcasts and getelementptr 4842 // instructions we've already identified as scalar. This is similar to the 4843 // expansion step in collectLoopUniforms(); however, here we're only 4844 // expanding to include additional bitcasts and getelementptr instructions. 4845 unsigned Idx = 0; 4846 while (Idx != Worklist.size()) { 4847 Instruction *Dst = Worklist[Idx++]; 4848 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4849 continue; 4850 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4851 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4852 auto *J = cast<Instruction>(U); 4853 return !TheLoop->contains(J) || Worklist.count(J) || 4854 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4855 isScalarUse(J, Src)); 4856 })) { 4857 Worklist.insert(Src); 4858 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4859 } 4860 } 4861 4862 // An induction variable will remain scalar if all users of the induction 4863 // variable and induction variable update remain scalar. 4864 for (auto &Induction : Legal->getInductionVars()) { 4865 auto *Ind = Induction.first; 4866 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4867 4868 // If tail-folding is applied, the primary induction variable will be used 4869 // to feed a vector compare.
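// Shorthand illustration: with tail folding the loop mask is typically formed
// from the widened primary IV, e.g.
//   %mask = icmp ule <i, i+1, i+2, i+3>, <BTC, BTC, BTC, BTC>
// so the primary IV is kept vector and is deliberately not added to the
// scalar worklist here.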
4870 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4871 continue; 4872 4873 // Returns true if \p Indvar is a pointer induction that is used directly by 4874 // load/store instruction \p I. 4875 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4876 Instruction *I) { 4877 return Induction.second.getKind() == 4878 InductionDescriptor::IK_PtrInduction && 4879 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4880 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4881 }; 4882 4883 // Determine if all users of the induction variable are scalar after 4884 // vectorization. 4885 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4886 auto *I = cast<Instruction>(U); 4887 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4888 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4889 }); 4890 if (!ScalarInd) 4891 continue; 4892 4893 // Determine if all users of the induction variable update instruction are 4894 // scalar after vectorization. 4895 auto ScalarIndUpdate = 4896 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4897 auto *I = cast<Instruction>(U); 4898 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4899 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4900 }); 4901 if (!ScalarIndUpdate) 4902 continue; 4903 4904 // The induction variable and its update instruction will remain scalar. 4905 Worklist.insert(Ind); 4906 Worklist.insert(IndUpdate); 4907 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4908 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4909 << "\n"); 4910 } 4911 4912 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4913 } 4914 4915 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { 4916 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4917 return false; 4918 switch(I->getOpcode()) { 4919 default: 4920 break; 4921 case Instruction::Load: 4922 case Instruction::Store: { 4923 if (!Legal->isMaskRequired(I)) 4924 return false; 4925 auto *Ptr = getLoadStorePointerOperand(I); 4926 auto *Ty = getLoadStoreType(I); 4927 const Align Alignment = getLoadStoreAlignment(I); 4928 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4929 TTI.isLegalMaskedGather(Ty, Alignment)) 4930 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4931 TTI.isLegalMaskedScatter(Ty, Alignment)); 4932 } 4933 case Instruction::UDiv: 4934 case Instruction::SDiv: 4935 case Instruction::SRem: 4936 case Instruction::URem: 4937 return mayDivideByZero(*I); 4938 } 4939 return false; 4940 } 4941 4942 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4943 Instruction *I, ElementCount VF) { 4944 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4945 assert(getWideningDecision(I, VF) == CM_Unknown && 4946 "Decision should not be set yet."); 4947 auto *Group = getInterleavedAccessGroup(I); 4948 assert(Group && "Must have a group."); 4949 4950 // If the instruction's allocated size doesn't equal it's type size, it 4951 // requires padding and will be scalarized. 4952 auto &DL = I->getModule()->getDataLayout(); 4953 auto *ScalarTy = getLoadStoreType(I); 4954 if (hasIrregularType(ScalarTy, DL)) 4955 return false; 4956 4957 // Check if masking is required. 
4958 // A Group may need masking for one of two reasons: it resides in a block that 4959 // needs predication, or it was decided to use masking to deal with gaps 4960 // (either a gap at the end of a load-access that may result in a speculative 4961 // load, or any gaps in a store-access). 4962 bool PredicatedAccessRequiresMasking = 4963 blockNeedsPredicationForAnyReason(I->getParent()) && 4964 Legal->isMaskRequired(I); 4965 bool LoadAccessWithGapsRequiresEpilogMasking = 4966 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4967 !isScalarEpilogueAllowed(); 4968 bool StoreAccessWithGapsRequiresMasking = 4969 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4970 if (!PredicatedAccessRequiresMasking && 4971 !LoadAccessWithGapsRequiresEpilogMasking && 4972 !StoreAccessWithGapsRequiresMasking) 4973 return true; 4974 4975 // If masked interleaving is required, we expect that the user/target had 4976 // enabled it, because otherwise it either wouldn't have been created or 4977 // it should have been invalidated by the CostModel. 4978 assert(useMaskedInterleavedAccesses(TTI) && 4979 "Masked interleave-groups for predicated accesses are not enabled."); 4980 4981 if (Group->isReverse()) 4982 return false; 4983 4984 auto *Ty = getLoadStoreType(I); 4985 const Align Alignment = getLoadStoreAlignment(I); 4986 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4987 : TTI.isLegalMaskedStore(Ty, Alignment); 4988 } 4989 4990 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4991 Instruction *I, ElementCount VF) { 4992 // Get and ensure we have a valid memory instruction. 4993 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4994 4995 auto *Ptr = getLoadStorePointerOperand(I); 4996 auto *ScalarTy = getLoadStoreType(I); 4997 4998 // In order to be widened, the pointer should be consecutive, first of all. 4999 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 5000 return false; 5001 5002 // If the instruction is a store located in a predicated block, it will be 5003 // scalarized. 5004 if (isScalarWithPredication(I)) 5005 return false; 5006 5007 // If the instruction's allocated size doesn't equal it's type size, it 5008 // requires padding and will be scalarized. 5009 auto &DL = I->getModule()->getDataLayout(); 5010 if (hasIrregularType(ScalarTy, DL)) 5011 return false; 5012 5013 return true; 5014 } 5015 5016 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5017 // We should not collect Uniforms more than once per VF. Right now, 5018 // this function is called from collectUniformsAndScalars(), which 5019 // already does this check. Collecting Uniforms for VF=1 does not make any 5020 // sense. 5021 5022 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5023 "This function should not be visited twice for the same VF"); 5024 5025 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5026 // not analyze again. Uniforms.count(VF) will return 1. 5027 Uniforms[VF].clear(); 5028 5029 // We now know that the loop is vectorizable! 5030 // Collect instructions inside the loop that will remain uniform after 5031 // vectorization. 5032 5033 // Global values, params and instructions outside of current loop are out of 5034 // scope. 5035 auto isOutOfScope = [&](Value *V) -> bool { 5036 Instruction *I = dyn_cast<Instruction>(V); 5037 return (!I || !TheLoop->contains(I)); 5038 }; 5039 5040 // Worklist containing uniform instructions demanding lane 0. 
5041 SetVector<Instruction *> Worklist; 5042 BasicBlock *Latch = TheLoop->getLoopLatch(); 5043 5044 // Add uniform instructions demanding lane 0 to the worklist. Instructions 5045 // that are scalar with predication must not be considered uniform after 5046 // vectorization, because that would create an erroneous replicating region 5047 // where only a single instance out of VF should be formed. 5048 // TODO: optimize such seldom cases if found important, see PR40816. 5049 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5050 if (isOutOfScope(I)) { 5051 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5052 << *I << "\n"); 5053 return; 5054 } 5055 if (isScalarWithPredication(I)) { 5056 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5057 << *I << "\n"); 5058 return; 5059 } 5060 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5061 Worklist.insert(I); 5062 }; 5063 5064 // Start with the conditional branch. If the branch condition is an 5065 // instruction contained in the loop that is only used by the branch, it is 5066 // uniform. 5067 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5068 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5069 addToWorklistIfAllowed(Cmp); 5070 5071 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5072 InstWidening WideningDecision = getWideningDecision(I, VF); 5073 assert(WideningDecision != CM_Unknown && 5074 "Widening decision should be ready at this moment"); 5075 5076 // A uniform memory op is itself uniform. We exclude uniform stores 5077 // here as they demand the last lane, not the first one. 5078 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5079 assert(WideningDecision == CM_Scalarize); 5080 return true; 5081 } 5082 5083 return (WideningDecision == CM_Widen || 5084 WideningDecision == CM_Widen_Reverse || 5085 WideningDecision == CM_Interleave); 5086 }; 5087 5088 5089 // Returns true if Ptr is the pointer operand of a memory access instruction 5090 // I, and I is known to not require scalarization. 5091 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5092 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5093 }; 5094 5095 // Holds a list of values which are known to have at least one uniform use. 5096 // Note that there may be other uses which aren't uniform. A "uniform use" 5097 // here is something which only demands lane 0 of the unrolled iterations; 5098 // it does not imply that all lanes produce the same value (e.g. this is not 5099 // the usual meaning of uniform) 5100 SetVector<Value *> HasUniformUse; 5101 5102 // Scan the loop for instructions which are either a) known to have only 5103 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5104 for (auto *BB : TheLoop->blocks()) 5105 for (auto &I : *BB) { 5106 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5107 switch (II->getIntrinsicID()) { 5108 case Intrinsic::sideeffect: 5109 case Intrinsic::experimental_noalias_scope_decl: 5110 case Intrinsic::assume: 5111 case Intrinsic::lifetime_start: 5112 case Intrinsic::lifetime_end: 5113 if (TheLoop->hasLoopInvariantOperands(&I)) 5114 addToWorklistIfAllowed(&I); 5115 break; 5116 default: 5117 break; 5118 } 5119 } 5120 5121 // ExtractValue instructions must be uniform, because the operands are 5122 // known to be loop-invariant. 
5123 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5124 assert(isOutOfScope(EVI->getAggregateOperand()) && 5125 "Expected aggregate value to be loop invariant"); 5126 addToWorklistIfAllowed(EVI); 5127 continue; 5128 } 5129 5130 // If there's no pointer operand, there's nothing to do. 5131 auto *Ptr = getLoadStorePointerOperand(&I); 5132 if (!Ptr) 5133 continue; 5134 5135 // A uniform memory op is itself uniform. We exclude uniform stores 5136 // here as they demand the last lane, not the first one. 5137 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5138 addToWorklistIfAllowed(&I); 5139 5140 if (isUniformDecision(&I, VF)) { 5141 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5142 HasUniformUse.insert(Ptr); 5143 } 5144 } 5145 5146 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5147 // demanding) users. Since loops are assumed to be in LCSSA form, this 5148 // disallows uses outside the loop as well. 5149 for (auto *V : HasUniformUse) { 5150 if (isOutOfScope(V)) 5151 continue; 5152 auto *I = cast<Instruction>(V); 5153 auto UsersAreMemAccesses = 5154 llvm::all_of(I->users(), [&](User *U) -> bool { 5155 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5156 }); 5157 if (UsersAreMemAccesses) 5158 addToWorklistIfAllowed(I); 5159 } 5160 5161 // Expand Worklist in topological order: whenever a new instruction 5162 // is added , its users should be already inside Worklist. It ensures 5163 // a uniform instruction will only be used by uniform instructions. 5164 unsigned idx = 0; 5165 while (idx != Worklist.size()) { 5166 Instruction *I = Worklist[idx++]; 5167 5168 for (auto OV : I->operand_values()) { 5169 // isOutOfScope operands cannot be uniform instructions. 5170 if (isOutOfScope(OV)) 5171 continue; 5172 // First order recurrence Phi's should typically be considered 5173 // non-uniform. 5174 auto *OP = dyn_cast<PHINode>(OV); 5175 if (OP && Legal->isFirstOrderRecurrence(OP)) 5176 continue; 5177 // If all the users of the operand are uniform, then add the 5178 // operand into the uniform worklist. 5179 auto *OI = cast<Instruction>(OV); 5180 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5181 auto *J = cast<Instruction>(U); 5182 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5183 })) 5184 addToWorklistIfAllowed(OI); 5185 } 5186 } 5187 5188 // For an instruction to be added into Worklist above, all its users inside 5189 // the loop should also be in Worklist. However, this condition cannot be 5190 // true for phi nodes that form a cyclic dependence. We must process phi 5191 // nodes separately. An induction variable will remain uniform if all users 5192 // of the induction variable and induction variable update remain uniform. 5193 // The code below handles both pointer and non-pointer induction variables. 5194 for (auto &Induction : Legal->getInductionVars()) { 5195 auto *Ind = Induction.first; 5196 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5197 5198 // Determine if all users of the induction variable are uniform after 5199 // vectorization. 5200 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5201 auto *I = cast<Instruction>(U); 5202 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5203 isVectorizedMemAccessUse(I, Ind); 5204 }); 5205 if (!UniformInd) 5206 continue; 5207 5208 // Determine if all users of the induction variable update instruction are 5209 // uniform after vectorization. 
5210 auto UniformIndUpdate = 5211 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5212 auto *I = cast<Instruction>(U); 5213 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5214 isVectorizedMemAccessUse(I, IndUpdate); 5215 }); 5216 if (!UniformIndUpdate) 5217 continue; 5218 5219 // The induction variable and its update instruction will remain uniform. 5220 addToWorklistIfAllowed(Ind); 5221 addToWorklistIfAllowed(IndUpdate); 5222 } 5223 5224 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5225 } 5226 5227 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5228 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5229 5230 if (Legal->getRuntimePointerChecking()->Need) { 5231 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5232 "runtime pointer checks needed. Enable vectorization of this " 5233 "loop with '#pragma clang loop vectorize(enable)' when " 5234 "compiling with -Os/-Oz", 5235 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5236 return true; 5237 } 5238 5239 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5240 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5241 "runtime SCEV checks needed. Enable vectorization of this " 5242 "loop with '#pragma clang loop vectorize(enable)' when " 5243 "compiling with -Os/-Oz", 5244 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5245 return true; 5246 } 5247 5248 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5249 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5250 reportVectorizationFailure("Runtime stride check for small trip count", 5251 "runtime stride == 1 checks needed. Enable vectorization of " 5252 "this loop without such check by compiling with -Os/-Oz", 5253 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5254 return true; 5255 } 5256 5257 return false; 5258 } 5259 5260 ElementCount 5261 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5262 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 5263 return ElementCount::getScalable(0); 5264 5265 if (Hints->isScalableVectorizationDisabled()) { 5266 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5267 "ScalableVectorizationDisabled", ORE, TheLoop); 5268 return ElementCount::getScalable(0); 5269 } 5270 5271 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 5272 5273 auto MaxScalableVF = ElementCount::getScalable( 5274 std::numeric_limits<ElementCount::ScalarTy>::max()); 5275 5276 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5277 // FIXME: While for scalable vectors this is currently sufficient, this should 5278 // be replaced by a more detailed mechanism that filters out specific VFs, 5279 // instead of invalidating vectorization for a whole set of VFs based on the 5280 // MaxVF. 5281 5282 // Disable scalable vectorization if the loop contains unsupported reductions. 5283 if (!canVectorizeReductions(MaxScalableVF)) { 5284 reportVectorizationInfo( 5285 "Scalable vectorization not supported for the reduction " 5286 "operations found in this loop.", 5287 "ScalableVFUnfeasible", ORE, TheLoop); 5288 return ElementCount::getScalable(0); 5289 } 5290 5291 // Disable scalable vectorization if the loop contains any instructions 5292 // with element types not supported for scalable vectors. 
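// For instance (illustrative): a loop that loads and stores fp128 values
// would be rejected here on a target whose scalable vector registers only
// support element types of up to 64 bits.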
5293 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5294 return !Ty->isVoidTy() && 5295 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5296 })) { 5297 reportVectorizationInfo("Scalable vectorization is not supported " 5298 "for all element types found in this loop.", 5299 "ScalableVFUnfeasible", ORE, TheLoop); 5300 return ElementCount::getScalable(0); 5301 } 5302 5303 if (Legal->isSafeForAnyVectorWidth()) 5304 return MaxScalableVF; 5305 5306 // Limit MaxScalableVF by the maximum safe dependence distance. 5307 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5308 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 5309 MaxVScale = 5310 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 5311 MaxScalableVF = ElementCount::getScalable( 5312 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5313 if (!MaxScalableVF) 5314 reportVectorizationInfo( 5315 "Max legal vector width too small, scalable vectorization " 5316 "unfeasible.", 5317 "ScalableVFUnfeasible", ORE, TheLoop); 5318 5319 return MaxScalableVF; 5320 } 5321 5322 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 5323 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 5324 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5325 unsigned SmallestType, WidestType; 5326 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5327 5328 // Get the maximum safe dependence distance in bits computed by LAA. 5329 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5330 // the memory accesses that is most restrictive (involved in the smallest 5331 // dependence distance). 5332 unsigned MaxSafeElements = 5333 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5334 5335 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5336 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5337 5338 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5339 << ".\n"); 5340 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5341 << ".\n"); 5342 5343 // First analyze the UserVF, fall back if the UserVF should be ignored. 5344 if (UserVF) { 5345 auto MaxSafeUserVF = 5346 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5347 5348 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5349 // If `VF=vscale x N` is safe, then so is `VF=N` 5350 if (UserVF.isScalable()) 5351 return FixedScalableVFPair( 5352 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5353 else 5354 return UserVF; 5355 } 5356 5357 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5358 5359 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5360 // is better to ignore the hint and let the compiler choose a suitable VF. 
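// For example (illustrative): with a maximum safe fixed VF of 8, a hint of
// VF=16 is clamped to VF=8 below, whereas a hint of VF=vscale x 16 is
// dropped entirely so the cost model can pick a suitable factor instead.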
5361 if (!UserVF.isScalable()) { 5362 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5363 << " is unsafe, clamping to max safe VF=" 5364 << MaxSafeFixedVF << ".\n"); 5365 ORE->emit([&]() { 5366 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5367 TheLoop->getStartLoc(), 5368 TheLoop->getHeader()) 5369 << "User-specified vectorization factor " 5370 << ore::NV("UserVectorizationFactor", UserVF) 5371 << " is unsafe, clamping to maximum safe vectorization factor " 5372 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5373 }); 5374 return MaxSafeFixedVF; 5375 } 5376 5377 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5378 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5379 << " is ignored because scalable vectors are not " 5380 "available.\n"); 5381 ORE->emit([&]() { 5382 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5383 TheLoop->getStartLoc(), 5384 TheLoop->getHeader()) 5385 << "User-specified vectorization factor " 5386 << ore::NV("UserVectorizationFactor", UserVF) 5387 << " is ignored because the target does not support scalable " 5388 "vectors. The compiler will pick a more suitable value."; 5389 }); 5390 } else { 5391 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5392 << " is unsafe. Ignoring scalable UserVF.\n"); 5393 ORE->emit([&]() { 5394 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5395 TheLoop->getStartLoc(), 5396 TheLoop->getHeader()) 5397 << "User-specified vectorization factor " 5398 << ore::NV("UserVectorizationFactor", UserVF) 5399 << " is unsafe. Ignoring the hint to let the compiler pick a " 5400 "more suitable value."; 5401 }); 5402 } 5403 } 5404 5405 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5406 << " / " << WidestType << " bits.\n"); 5407 5408 FixedScalableVFPair Result(ElementCount::getFixed(1), 5409 ElementCount::getScalable(0)); 5410 if (auto MaxVF = 5411 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5412 MaxSafeFixedVF, FoldTailByMasking)) 5413 Result.FixedVF = MaxVF; 5414 5415 if (auto MaxVF = 5416 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5417 MaxSafeScalableVF, FoldTailByMasking)) 5418 if (MaxVF.isScalable()) { 5419 Result.ScalableVF = MaxVF; 5420 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5421 << "\n"); 5422 } 5423 5424 return Result; 5425 } 5426 5427 FixedScalableVFPair 5428 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5429 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5430 // TODO: It may by useful to do since it's still likely to be dynamically 5431 // uniform if the target can skip. 5432 reportVectorizationFailure( 5433 "Not inserting runtime ptr check for divergent target", 5434 "runtime pointer checks needed. 
Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(TC, UserVF, false);
  case CM_ScalarEpilogueNotAllowedUsePredicate:
    LLVM_FALLTHROUGH;
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return FixedScalableVFPair::getNone();

    break;
  }

  // The only loops we can vectorize without a scalar epilogue are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration. This will
  // require a lane mask which varies through the vector loop body. (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fall back to a vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF, false);
    }
    return FixedScalableVFPair::getNone();
  }

  // Now try the tail folding.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we chose.
  // FIXME: The condition below pessimises the case for fixed-width vectors,
  // when scalable VFs are also candidates for vectorization.
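  // Worked example (hypothetical numbers): with a known trip count of 64, a
  // maximum fixed VF of 8 and a user interleave count of 2, the check below
  // tests whether 64 urem (8 * 2) == 0. It does, so no scalar tail remains
  // for any VF that may be chosen and tail folding can be skipped.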
5510 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5511 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5512 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5513 "MaxFixedVF must be a power of 2"); 5514 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5515 : MaxFixedVF.getFixedValue(); 5516 ScalarEvolution *SE = PSE.getSE(); 5517 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5518 const SCEV *ExitCount = SE->getAddExpr( 5519 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5520 const SCEV *Rem = SE->getURemExpr( 5521 SE->applyLoopGuards(ExitCount, TheLoop), 5522 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5523 if (Rem->isZero()) { 5524 // Accept MaxFixedVF if we do not have a tail. 5525 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5526 return MaxFactors; 5527 } 5528 } 5529 5530 // For scalable vectors, don't use tail folding as this is currently not yet 5531 // supported. The code is likely to have ended up here if the tripcount is 5532 // low, in which case it makes sense not to use scalable vectors. 5533 if (MaxFactors.ScalableVF.isVector()) 5534 MaxFactors.ScalableVF = ElementCount::getScalable(0); 5535 5536 // If we don't know the precise trip count, or if the trip count that we 5537 // found modulo the vectorization factor is not zero, try to fold the tail 5538 // by masking. 5539 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5540 if (Legal->prepareToFoldTailByMasking()) { 5541 FoldTailByMasking = true; 5542 return MaxFactors; 5543 } 5544 5545 // If there was a tail-folding hint/switch, but we can't fold the tail by 5546 // masking, fallback to a vectorization with a scalar epilogue. 5547 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5548 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5549 "scalar epilogue instead.\n"); 5550 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5551 return MaxFactors; 5552 } 5553 5554 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5555 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5556 return FixedScalableVFPair::getNone(); 5557 } 5558 5559 if (TC == 0) { 5560 reportVectorizationFailure( 5561 "Unable to calculate the loop count due to complex control flow", 5562 "unable to calculate the loop count due to complex control flow", 5563 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5564 return FixedScalableVFPair::getNone(); 5565 } 5566 5567 reportVectorizationFailure( 5568 "Cannot optimize for size and vectorize at the same time.", 5569 "cannot optimize for size and vectorize at the same time. " 5570 "Enable vectorization of this loop with '#pragma clang loop " 5571 "vectorize(enable)' when compiling with -Os/-Oz", 5572 "NoTailLoopWithOptForSize", ORE, TheLoop); 5573 return FixedScalableVFPair::getNone(); 5574 } 5575 5576 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5577 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5578 const ElementCount &MaxSafeVF, bool FoldTailByMasking) { 5579 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5580 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5581 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5582 : TargetTransformInfo::RGK_FixedWidthVector); 5583 5584 // Convenience function to return the minimum of two ElementCounts. 
  auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
    assert((LHS.isScalable() == RHS.isScalable()) &&
           "Scalable flags must match");
    return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
  };

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
  auto MaxVectorElementCount = ElementCount::get(
      PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
      ComputeScalableMaxVF);
  MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << (MaxVectorElementCount * WidestType) << " bits.\n");

  if (!MaxVectorElementCount) {
    LLVM_DEBUG(dbgs() << "LV: The target has no "
                      << (ComputeScalableMaxVF ? "scalable" : "fixed")
                      << " vector registers.\n");
    return ElementCount::getFixed(1);
  }

  const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
  if (ConstTripCount &&
      ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
      (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
    // If loop trip count (TC) is known at compile time there is no point in
    // choosing VF greater than TC (as done in the loop below). Select maximum
    // power of two which doesn't exceed TC.
    // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
    // when the TC is less than or equal to the known number of lanes.
    auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
                         "exceeding the constant trip count: "
                      << ClampedConstTripCount << "\n");
    return ElementCount::getFixed(ClampedConstTripCount);
  }

  ElementCount MaxVF = MaxVectorElementCount;
  if (TTI.shouldMaximizeVectorBandwidth() ||
      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
    auto MaxVectorElementCountMaxBW = ElementCount::get(
        PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
        ComputeScalableMaxVF);
    MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);

    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorElementCount).
    SmallVector<ElementCount, 8> VFs;
    for (ElementCount VS = MaxVectorElementCount * 2;
         ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
      VFs.push_back(VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
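    // For example (hypothetical register budgets): if the candidates are
    // VF=16 and VF=32 and the usage estimate says VF=32 needs more registers
    // of some class than the target provides while VF=16 fits, the loop below
    // walks the candidates from the largest down and settles on VF=16.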
5643 for (int i = RUs.size() - 1; i >= 0; --i) { 5644 bool Selected = true; 5645 for (auto &pair : RUs[i].MaxLocalUsers) { 5646 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5647 if (pair.second > TargetNumRegisters) 5648 Selected = false; 5649 } 5650 if (Selected) { 5651 MaxVF = VFs[i]; 5652 break; 5653 } 5654 } 5655 if (ElementCount MinVF = 5656 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5657 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5658 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5659 << ") with target's minimum: " << MinVF << '\n'); 5660 MaxVF = MinVF; 5661 } 5662 } 5663 } 5664 return MaxVF; 5665 } 5666 5667 bool LoopVectorizationCostModel::isMoreProfitable( 5668 const VectorizationFactor &A, const VectorizationFactor &B) const { 5669 InstructionCost CostA = A.Cost; 5670 InstructionCost CostB = B.Cost; 5671 5672 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5673 5674 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5675 MaxTripCount) { 5676 // If we are folding the tail and the trip count is a known (possibly small) 5677 // constant, the trip count will be rounded up to an integer number of 5678 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5679 // which we compare directly. When not folding the tail, the total cost will 5680 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5681 // approximated with the per-lane cost below instead of using the tripcount 5682 // as here. 5683 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5684 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5685 return RTCostA < RTCostB; 5686 } 5687 5688 // Improve estimate for the vector width if it is scalable. 5689 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5690 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5691 if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) { 5692 if (A.Width.isScalable()) 5693 EstimatedWidthA *= VScale.getValue(); 5694 if (B.Width.isScalable()) 5695 EstimatedWidthB *= VScale.getValue(); 5696 } 5697 5698 // When set to preferred, for now assume vscale may be larger than 1 (or the 5699 // one being tuned for), so that scalable vectorization is slightly favorable 5700 // over fixed-width vectorization. 
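  // Worked example (hypothetical costs): comparing A = {vscale x 4, cost 6}
  // against B = {fixed 8, cost 10} with a tuning vscale of 2 gives estimated
  // widths of 8 for both; the integer cross-multiplications below compare
  // 6 * 8 against 10 * 8, so A wins on per-lane cost without any FP division.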
5701 if (Hints->isScalableVectorizationPreferred()) 5702 if (A.Width.isScalable() && !B.Width.isScalable()) 5703 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5704 5705 // To avoid the need for FP division: 5706 // (CostA / A.Width) < (CostB / B.Width) 5707 // <=> (CostA * B.Width) < (CostB * A.Width) 5708 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5709 } 5710 5711 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5712 const ElementCountSet &VFCandidates) { 5713 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5714 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5715 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5716 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5717 "Expected Scalar VF to be a candidate"); 5718 5719 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 5720 VectorizationFactor ChosenFactor = ScalarCost; 5721 5722 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5723 if (ForceVectorization && VFCandidates.size() > 1) { 5724 // Ignore scalar width, because the user explicitly wants vectorization. 5725 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5726 // evaluation. 5727 ChosenFactor.Cost = InstructionCost::getMax(); 5728 } 5729 5730 SmallVector<InstructionVFPair> InvalidCosts; 5731 for (const auto &i : VFCandidates) { 5732 // The cost for scalar VF=1 is already calculated, so ignore it. 5733 if (i.isScalar()) 5734 continue; 5735 5736 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5737 VectorizationFactor Candidate(i, C.first); 5738 5739 #ifndef NDEBUG 5740 unsigned AssumedMinimumVscale = 1; 5741 if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) 5742 AssumedMinimumVscale = VScale.getValue(); 5743 unsigned Width = 5744 Candidate.Width.isScalable() 5745 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5746 : Candidate.Width.getFixedValue(); 5747 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5748 << " costs: " << (Candidate.Cost / Width)); 5749 if (i.isScalable()) 5750 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5751 << AssumedMinimumVscale << ")"); 5752 LLVM_DEBUG(dbgs() << ".\n"); 5753 #endif 5754 5755 if (!C.second && !ForceVectorization) { 5756 LLVM_DEBUG( 5757 dbgs() << "LV: Not considering vector loop of width " << i 5758 << " because it will not generate any vector instructions.\n"); 5759 continue; 5760 } 5761 5762 // If profitable add it to ProfitableVF list. 5763 if (isMoreProfitable(Candidate, ScalarCost)) 5764 ProfitableVFs.push_back(Candidate); 5765 5766 if (isMoreProfitable(Candidate, ChosenFactor)) 5767 ChosenFactor = Candidate; 5768 } 5769 5770 // Emit a report of VFs with invalid costs in the loop. 5771 if (!InvalidCosts.empty()) { 5772 // Group the remarks per instruction, keeping the instruction order from 5773 // InvalidCosts. 5774 std::map<Instruction *, unsigned> Numbering; 5775 unsigned I = 0; 5776 for (auto &Pair : InvalidCosts) 5777 if (!Numbering.count(Pair.first)) 5778 Numbering[Pair.first] = I++; 5779 5780 // Sort the list, first on instruction(number) then on VF. 
5781 llvm::sort(InvalidCosts, 5782 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5783 if (Numbering[A.first] != Numbering[B.first]) 5784 return Numbering[A.first] < Numbering[B.first]; 5785 ElementCountComparator ECC; 5786 return ECC(A.second, B.second); 5787 }); 5788 5789 // For a list of ordered instruction-vf pairs: 5790 // [(load, vf1), (load, vf2), (store, vf1)] 5791 // Group the instructions together to emit separate remarks for: 5792 // load (vf1, vf2) 5793 // store (vf1) 5794 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5795 auto Subset = ArrayRef<InstructionVFPair>(); 5796 do { 5797 if (Subset.empty()) 5798 Subset = Tail.take_front(1); 5799 5800 Instruction *I = Subset.front().first; 5801 5802 // If the next instruction is different, or if there are no other pairs, 5803 // emit a remark for the collated subset. e.g. 5804 // [(load, vf1), (load, vf2))] 5805 // to emit: 5806 // remark: invalid costs for 'load' at VF=(vf, vf2) 5807 if (Subset == Tail || Tail[Subset.size()].first != I) { 5808 std::string OutString; 5809 raw_string_ostream OS(OutString); 5810 assert(!Subset.empty() && "Unexpected empty range"); 5811 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5812 for (auto &Pair : Subset) 5813 OS << (Pair.second == Subset.front().second ? "" : ", ") 5814 << Pair.second; 5815 OS << "):"; 5816 if (auto *CI = dyn_cast<CallInst>(I)) 5817 OS << " call to " << CI->getCalledFunction()->getName(); 5818 else 5819 OS << " " << I->getOpcodeName(); 5820 OS.flush(); 5821 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5822 Tail = Tail.drop_front(Subset.size()); 5823 Subset = {}; 5824 } else 5825 // Grow the subset by one element 5826 Subset = Tail.take_front(Subset.size() + 1); 5827 } while (!Tail.empty()); 5828 } 5829 5830 if (!EnableCondStoresVectorization && NumPredStores) { 5831 reportVectorizationFailure("There are conditional stores.", 5832 "store that is conditionally executed prevents vectorization", 5833 "ConditionalStore", ORE, TheLoop); 5834 ChosenFactor = ScalarCost; 5835 } 5836 5837 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5838 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 5839 << "LV: Vectorization seems to be not beneficial, " 5840 << "but was forced by a user.\n"); 5841 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5842 return ChosenFactor; 5843 } 5844 5845 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5846 const Loop &L, ElementCount VF) const { 5847 // Cross iteration phis such as reductions need special handling and are 5848 // currently unsupported. 5849 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5850 return Legal->isFirstOrderRecurrence(&Phi) || 5851 Legal->isReductionVariable(&Phi); 5852 })) 5853 return false; 5854 5855 // Phis with uses outside of the loop require special handling and are 5856 // currently unsupported. 5857 for (auto &Entry : Legal->getInductionVars()) { 5858 // Look for uses of the value of the induction at the last iteration. 5859 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5860 for (User *U : PostInc->users()) 5861 if (!L.contains(cast<Instruction>(U))) 5862 return false; 5863 // Look for uses of penultimate value of the induction. 5864 for (User *U : Entry.first->users()) 5865 if (!L.contains(cast<Instruction>(U))) 5866 return false; 5867 } 5868 5869 // Induction variables that are widened require special handling that is 5870 // currently not supported. 
  if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
        return !(this->isScalarAfterVectorization(Entry.first, VF) ||
                 this->isProfitableToScalarize(Entry.first, VF));
      }))
    return false;

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
  if (L.getExitingBlock() != L.getLoopLatch())
    return false;

  return true;
}

bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    const ElementCount VF) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.
  // We also consider epilogue vectorization unprofitable for targets that
  // don't consider interleaving beneficial (e.g. MVE).
  if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
    return false;
  if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
    return true;
  return false;
}

VectorizationFactor
LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
    return Result;
  }

  if (!isScalarEpilogueAllowed()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
                  "allowed.\n";);
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
                  "not a supported candidate.\n";);
    return Result;
  }

  if (EpilogueVectorizationForceVF > 1) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
    ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
    if (LVP.hasPlanWithVF(ForcedEC))
      return {ForcedEC, 0};
    else {
      LLVM_DEBUG(
          dbgs()
          << "LEV: Epilogue vectorization forced factor is not viable.\n";);
      return Result;
    }
  }

  if (TheLoop->getHeader()->getParent()->hasOptSize() ||
      TheLoop->getHeader()->getParent()->hasMinSize()) {
    LLVM_DEBUG(
        dbgs()
        << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
    return Result;
  }

  auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
  if (MainLoopVF.isScalable())
    LLVM_DEBUG(
        dbgs() << "LEV: Epilogue vectorization using scalable vectors not "
                  "yet supported. 
Converting to fixed-width (VF=" 5952 << FixedMainLoopVF << ") instead\n"); 5953 5954 if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) { 5955 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5956 "this loop\n"); 5957 return Result; 5958 } 5959 5960 for (auto &NextVF : ProfitableVFs) 5961 if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) && 5962 (Result.Width.getFixedValue() == 1 || 5963 isMoreProfitable(NextVF, Result)) && 5964 LVP.hasPlanWithVF(NextVF.Width)) 5965 Result = NextVF; 5966 5967 if (Result != VectorizationFactor::Disabled()) 5968 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5969 << Result.Width.getFixedValue() << "\n";); 5970 return Result; 5971 } 5972 5973 std::pair<unsigned, unsigned> 5974 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5975 unsigned MinWidth = -1U; 5976 unsigned MaxWidth = 8; 5977 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5978 for (Type *T : ElementTypesInLoop) { 5979 MinWidth = std::min<unsigned>( 5980 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5981 MaxWidth = std::max<unsigned>( 5982 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5983 } 5984 return {MinWidth, MaxWidth}; 5985 } 5986 5987 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5988 ElementTypesInLoop.clear(); 5989 // For each block. 5990 for (BasicBlock *BB : TheLoop->blocks()) { 5991 // For each instruction in the loop. 5992 for (Instruction &I : BB->instructionsWithoutDebug()) { 5993 Type *T = I.getType(); 5994 5995 // Skip ignored values. 5996 if (ValuesToIgnore.count(&I)) 5997 continue; 5998 5999 // Only examine Loads, Stores and PHINodes. 6000 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6001 continue; 6002 6003 // Examine PHI nodes that are reduction variables. Update the type to 6004 // account for the recurrence type. 6005 if (auto *PN = dyn_cast<PHINode>(&I)) { 6006 if (!Legal->isReductionVariable(PN)) 6007 continue; 6008 const RecurrenceDescriptor &RdxDesc = 6009 Legal->getReductionVars().find(PN)->second; 6010 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 6011 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6012 RdxDesc.getRecurrenceType(), 6013 TargetTransformInfo::ReductionFlags())) 6014 continue; 6015 T = RdxDesc.getRecurrenceType(); 6016 } 6017 6018 // Examine the stored values. 6019 if (auto *ST = dyn_cast<StoreInst>(&I)) 6020 T = ST->getValueOperand()->getType(); 6021 6022 // Ignore loaded pointer types and stored pointer types that are not 6023 // vectorizable. 6024 // 6025 // FIXME: The check here attempts to predict whether a load or store will 6026 // be vectorized. We only know this for certain after a VF has 6027 // been selected. Here, we assume that if an access can be 6028 // vectorized, it will be. We should also look at extending this 6029 // optimization to non-pointer types. 6030 // 6031 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6032 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6033 continue; 6034 6035 ElementTypesInLoop.insert(T); 6036 } 6037 } 6038 } 6039 6040 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6041 unsigned LoopCost) { 6042 // -- The interleave heuristics -- 6043 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6044 // There are many micro-architectural considerations that we can't predict 6045 // at this level. 
For example, frontend pressure (on decode or fetch) due to 6046 // code size, or the number and capabilities of the execution ports. 6047 // 6048 // We use the following heuristics to select the interleave count: 6049 // 1. If the code has reductions, then we interleave to break the cross 6050 // iteration dependency. 6051 // 2. If the loop is really small, then we interleave to reduce the loop 6052 // overhead. 6053 // 3. We don't interleave if we think that we will spill registers to memory 6054 // due to the increased register pressure. 6055 6056 if (!isScalarEpilogueAllowed()) 6057 return 1; 6058 6059 // We used the distance for the interleave count. 6060 if (Legal->getMaxSafeDepDistBytes() != -1U) 6061 return 1; 6062 6063 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6064 const bool HasReductions = !Legal->getReductionVars().empty(); 6065 // Do not interleave loops with a relatively small known or estimated trip 6066 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6067 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6068 // because with the above conditions interleaving can expose ILP and break 6069 // cross iteration dependences for reductions. 6070 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6071 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6072 return 1; 6073 6074 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6075 // We divide by these constants so assume that we have at least one 6076 // instruction that uses at least one register. 6077 for (auto& pair : R.MaxLocalUsers) { 6078 pair.second = std::max(pair.second, 1U); 6079 } 6080 6081 // We calculate the interleave count using the following formula. 6082 // Subtract the number of loop invariants from the number of available 6083 // registers. These registers are used by all of the interleaved instances. 6084 // Next, divide the remaining registers by the number of registers that is 6085 // required by the loop, in order to estimate how many parallel instances 6086 // fit without causing spills. All of this is rounded down if necessary to be 6087 // a power of two. We want power of two interleave count to simplify any 6088 // addressing operations or alignment considerations. 6089 // We also want power of two interleave counts to ensure that the induction 6090 // variable of the vector loop wraps to zero, when tail is folded by masking; 6091 // this currently happens when OptForSize, in which case IC is set to 1 above. 6092 unsigned IC = UINT_MAX; 6093 6094 for (auto& pair : R.MaxLocalUsers) { 6095 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6096 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6097 << " registers of " 6098 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6099 if (VF.isScalar()) { 6100 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6101 TargetNumRegisters = ForceTargetNumScalarRegs; 6102 } else { 6103 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6104 TargetNumRegisters = ForceTargetNumVectorRegs; 6105 } 6106 unsigned MaxLocalUsers = pair.second; 6107 unsigned LoopInvariantRegs = 0; 6108 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6109 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6110 6111 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6112 // Don't count the induction variable as interleaved. 
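    // Worked example (hypothetical numbers, one register class): with 32
    // registers, 2 loop-invariant values and a maximum local usage of 6,
    // TmpIC = PowerOf2Floor((32 - 2) / 6) = PowerOf2Floor(5) = 4. With the
    // induction-variable heuristic below, the IV is discounted from both
    // terms: PowerOf2Floor((32 - 2 - 1) / max(1, 6 - 1)) = 4 as well.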
6113 if (EnableIndVarRegisterHeur) { 6114 TmpIC = 6115 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6116 std::max(1U, (MaxLocalUsers - 1))); 6117 } 6118 6119 IC = std::min(IC, TmpIC); 6120 } 6121 6122 // Clamp the interleave ranges to reasonable counts. 6123 unsigned MaxInterleaveCount = 6124 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6125 6126 // Check if the user has overridden the max. 6127 if (VF.isScalar()) { 6128 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6129 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6130 } else { 6131 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6132 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6133 } 6134 6135 // If trip count is known or estimated compile time constant, limit the 6136 // interleave count to be less than the trip count divided by VF, provided it 6137 // is at least 1. 6138 // 6139 // For scalable vectors we can't know if interleaving is beneficial. It may 6140 // not be beneficial for small loops if none of the lanes in the second vector 6141 // iterations is enabled. However, for larger loops, there is likely to be a 6142 // similar benefit as for fixed-width vectors. For now, we choose to leave 6143 // the InterleaveCount as if vscale is '1', although if some information about 6144 // the vector is known (e.g. min vector size), we can make a better decision. 6145 if (BestKnownTC) { 6146 MaxInterleaveCount = 6147 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6148 // Make sure MaxInterleaveCount is greater than 0. 6149 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6150 } 6151 6152 assert(MaxInterleaveCount > 0 && 6153 "Maximum interleave count must be greater than 0"); 6154 6155 // Clamp the calculated IC to be between the 1 and the max interleave count 6156 // that the target and trip count allows. 6157 if (IC > MaxInterleaveCount) 6158 IC = MaxInterleaveCount; 6159 else 6160 // Make sure IC is greater than 0. 6161 IC = std::max(1u, IC); 6162 6163 assert(IC > 0 && "Interleave count must be greater than 0."); 6164 6165 // If we did not calculate the cost for VF (because the user selected the VF) 6166 // then we calculate the cost of VF here. 6167 if (LoopCost == 0) { 6168 InstructionCost C = expectedCost(VF).first; 6169 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 6170 LoopCost = *C.getValue(); 6171 } 6172 6173 assert(LoopCost && "Non-zero loop cost expected"); 6174 6175 // Interleave if we vectorized this loop and there is a reduction that could 6176 // benefit from interleaving. 6177 if (VF.isVector() && HasReductions) { 6178 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6179 return IC; 6180 } 6181 6182 // Note that if we've already vectorized the loop we will have done the 6183 // runtime check and so interleaving won't require further checks. 6184 bool InterleavingRequiresRuntimePointerCheck = 6185 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6186 6187 // We want to interleave small loops in order to reduce the loop overhead and 6188 // potentially expose ILP opportunities. 
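  // Rough illustration (hypothetical costs; SmallLoopCost is a command-line
  // knob): if the loop body costs 5 and the small-loop threshold were 20,
  // SmallIC = min(IC, PowerOf2Floor(20 / 5)) = min(IC, 4), i.e. small loops
  // are interleaved only enough to amortise their per-iteration overhead.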
6189 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6190 << "LV: IC is " << IC << '\n' 6191 << "LV: VF is " << VF << '\n'); 6192 const bool AggressivelyInterleaveReductions = 6193 TTI.enableAggressiveInterleaving(HasReductions); 6194 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6195 // We assume that the cost overhead is 1 and we use the cost model 6196 // to estimate the cost of the loop and interleave until the cost of the 6197 // loop overhead is about 5% of the cost of the loop. 6198 unsigned SmallIC = 6199 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6200 6201 // Interleave until store/load ports (estimated by max interleave count) are 6202 // saturated. 6203 unsigned NumStores = Legal->getNumStores(); 6204 unsigned NumLoads = Legal->getNumLoads(); 6205 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6206 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6207 6208 // There is little point in interleaving for reductions containing selects 6209 // and compares when VF=1 since it may just create more overhead than it's 6210 // worth for loops with small trip counts. This is because we still have to 6211 // do the final reduction after the loop. 6212 bool HasSelectCmpReductions = 6213 HasReductions && 6214 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6215 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6216 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 6217 RdxDesc.getRecurrenceKind()); 6218 }); 6219 if (HasSelectCmpReductions) { 6220 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 6221 return 1; 6222 } 6223 6224 // If we have a scalar reduction (vector reductions are already dealt with 6225 // by this point), we can increase the critical path length if the loop 6226 // we're interleaving is inside another loop. For tree-wise reductions 6227 // set the limit to 2, and for ordered reductions it's best to disable 6228 // interleaving entirely. 6229 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6230 bool HasOrderedReductions = 6231 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6232 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6233 return RdxDesc.isOrdered(); 6234 }); 6235 if (HasOrderedReductions) { 6236 LLVM_DEBUG( 6237 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 6238 return 1; 6239 } 6240 6241 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6242 SmallIC = std::min(SmallIC, F); 6243 StoresIC = std::min(StoresIC, F); 6244 LoadsIC = std::min(LoadsIC, F); 6245 } 6246 6247 if (EnableLoadStoreRuntimeInterleave && 6248 std::max(StoresIC, LoadsIC) > SmallIC) { 6249 LLVM_DEBUG( 6250 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6251 return std::max(StoresIC, LoadsIC); 6252 } 6253 6254 // If there are scalar reductions and TTI has enabled aggressive 6255 // interleaving for reductions, we will interleave to expose ILP. 6256 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6257 AggressivelyInterleaveReductions) { 6258 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6259 // Interleave no less than SmallIC but not as aggressive as the normal IC 6260 // to satisfy the rare situation when resources are too limited. 
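      // E.g. (hypothetical counts) IC = 8 and SmallIC = 2 give
      // max(8 / 2, 2) = 4: half the resource-based count, but never less
      // than the small-loop count.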
6261 return std::max(IC / 2, SmallIC); 6262 } else { 6263 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6264 return SmallIC; 6265 } 6266 } 6267 6268 // Interleave if this is a large loop (small loops are already dealt with by 6269 // this point) that could benefit from interleaving. 6270 if (AggressivelyInterleaveReductions) { 6271 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6272 return IC; 6273 } 6274 6275 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6276 return 1; 6277 } 6278 6279 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6280 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6281 // This function calculates the register usage by measuring the highest number 6282 // of values that are alive at a single location. Obviously, this is a very 6283 // rough estimation. We scan the loop in a topological order in order and 6284 // assign a number to each instruction. We use RPO to ensure that defs are 6285 // met before their users. We assume that each instruction that has in-loop 6286 // users starts an interval. We record every time that an in-loop value is 6287 // used, so we have a list of the first and last occurrences of each 6288 // instruction. Next, we transpose this data structure into a multi map that 6289 // holds the list of intervals that *end* at a specific location. This multi 6290 // map allows us to perform a linear search. We scan the instructions linearly 6291 // and record each time that a new interval starts, by placing it in a set. 6292 // If we find this value in the multi-map then we remove it from the set. 6293 // The max register usage is the maximum size of the set. 6294 // We also search for instructions that are defined outside the loop, but are 6295 // used inside the loop. We need this number separately from the max-interval 6296 // usage number because when we unroll, loop-invariant values do not take 6297 // more register. 6298 LoopBlocksDFS DFS(TheLoop); 6299 DFS.perform(LI); 6300 6301 RegisterUsage RU; 6302 6303 // Each 'key' in the map opens a new interval. The values 6304 // of the map are the index of the 'last seen' usage of the 6305 // instruction that is the key. 6306 using IntervalMap = DenseMap<Instruction *, unsigned>; 6307 6308 // Maps instruction to its index. 6309 SmallVector<Instruction *, 64> IdxToInstr; 6310 // Marks the end of each interval. 6311 IntervalMap EndPoint; 6312 // Saves the list of instruction indices that are used in the loop. 6313 SmallPtrSet<Instruction *, 8> Ends; 6314 // Saves the list of values that are used in the loop but are 6315 // defined outside the loop, such as arguments and constants. 6316 SmallPtrSet<Value *, 8> LoopInvariants; 6317 6318 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6319 for (Instruction &I : BB->instructionsWithoutDebug()) { 6320 IdxToInstr.push_back(&I); 6321 6322 // Save the end location of each USE. 6323 for (Value *U : I.operands()) { 6324 auto *Instr = dyn_cast<Instruction>(U); 6325 6326 // Ignore non-instruction values such as arguments, constants, etc. 6327 if (!Instr) 6328 continue; 6329 6330 // If this instruction is outside the loop then record it and continue. 6331 if (!TheLoop->contains(Instr)) { 6332 LoopInvariants.insert(Instr); 6333 continue; 6334 } 6335 6336 // Overwrite previous end points. 6337 EndPoint[Instr] = IdxToInstr.size(); 6338 Ends.insert(Instr); 6339 } 6340 } 6341 } 6342 6343 // Saves the list of intervals that end with the index in 'key'. 
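  // Small example of the interval bookkeeping below (hypothetical IR): if %a
  // is defined at index 0 and last used at index 2, and %b is defined at
  // index 1 and last used at index 3, both intervals are open while index 2
  // is scanned, so the pressure estimate there is two values, each weighted
  // by GetRegUsage for its register class once a vector VF is considered.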
6344 using InstrList = SmallVector<Instruction *, 2>; 6345 DenseMap<unsigned, InstrList> TransposeEnds; 6346 6347 // Transpose the EndPoints to a list of values that end at each index. 6348 for (auto &Interval : EndPoint) 6349 TransposeEnds[Interval.second].push_back(Interval.first); 6350 6351 SmallPtrSet<Instruction *, 8> OpenIntervals; 6352 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6353 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6354 6355 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6356 6357 // A lambda that gets the register usage for the given type and VF. 6358 const auto &TTICapture = TTI; 6359 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6360 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6361 return 0; 6362 InstructionCost::CostType RegUsage = 6363 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6364 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6365 "Nonsensical values for register usage."); 6366 return RegUsage; 6367 }; 6368 6369 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6370 Instruction *I = IdxToInstr[i]; 6371 6372 // Remove all of the instructions that end at this location. 6373 InstrList &List = TransposeEnds[i]; 6374 for (Instruction *ToRemove : List) 6375 OpenIntervals.erase(ToRemove); 6376 6377 // Ignore instructions that are never used within the loop. 6378 if (!Ends.count(I)) 6379 continue; 6380 6381 // Skip ignored values. 6382 if (ValuesToIgnore.count(I)) 6383 continue; 6384 6385 // For each VF find the maximum usage of registers. 6386 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6387 // Count the number of live intervals. 6388 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6389 6390 if (VFs[j].isScalar()) { 6391 for (auto Inst : OpenIntervals) { 6392 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6393 if (RegUsage.find(ClassID) == RegUsage.end()) 6394 RegUsage[ClassID] = 1; 6395 else 6396 RegUsage[ClassID] += 1; 6397 } 6398 } else { 6399 collectUniformsAndScalars(VFs[j]); 6400 for (auto Inst : OpenIntervals) { 6401 // Skip ignored values for VF > 1. 6402 if (VecValuesToIgnore.count(Inst)) 6403 continue; 6404 if (isScalarAfterVectorization(Inst, VFs[j])) { 6405 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6406 if (RegUsage.find(ClassID) == RegUsage.end()) 6407 RegUsage[ClassID] = 1; 6408 else 6409 RegUsage[ClassID] += 1; 6410 } else { 6411 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6412 if (RegUsage.find(ClassID) == RegUsage.end()) 6413 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6414 else 6415 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6416 } 6417 } 6418 } 6419 6420 for (auto& pair : RegUsage) { 6421 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6422 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6423 else 6424 MaxUsages[j][pair.first] = pair.second; 6425 } 6426 } 6427 6428 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6429 << OpenIntervals.size() << '\n'); 6430 6431 // Add the current instruction to the list of open intervals. 6432 OpenIntervals.insert(I); 6433 } 6434 6435 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6436 SmallMapVector<unsigned, unsigned, 4> Invariant; 6437 6438 for (auto Inst : LoopInvariants) { 6439 unsigned Usage = 6440 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6441 unsigned ClassID = 6442 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6443 if (Invariant.find(ClassID) == Invariant.end()) 6444 Invariant[ClassID] = Usage; 6445 else 6446 Invariant[ClassID] += Usage; 6447 } 6448 6449 LLVM_DEBUG({ 6450 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6451 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6452 << " item\n"; 6453 for (const auto &pair : MaxUsages[i]) { 6454 dbgs() << "LV(REG): RegisterClass: " 6455 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6456 << " registers\n"; 6457 } 6458 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6459 << " item\n"; 6460 for (const auto &pair : Invariant) { 6461 dbgs() << "LV(REG): RegisterClass: " 6462 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6463 << " registers\n"; 6464 } 6465 }); 6466 6467 RU.LoopInvariantRegs = Invariant; 6468 RU.MaxLocalUsers = MaxUsages[i]; 6469 RUs[i] = RU; 6470 } 6471 6472 return RUs; 6473 } 6474 6475 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6476 // TODO: Cost model for emulated masked load/store is completely 6477 // broken. This hack guides the cost model to use an artificially 6478 // high enough value to practically disable vectorization with such 6479 // operations, except where previously deployed legality hack allowed 6480 // using very low cost values. This is to avoid regressions coming simply 6481 // from moving "masked load/store" check from legality to cost model. 6482 // Masked Load/Gather emulation was previously never allowed. 6483 // Limited number of Masked Store/Scatter emulation was allowed. 6484 assert(isPredicatedInst(I) && 6485 "Expecting a scalar emulated instruction"); 6486 return isa<LoadInst>(I) || 6487 (isa<StoreInst>(I) && 6488 NumPredStores > NumberOfStoresToPredicate); 6489 } 6490 6491 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6492 // If we aren't vectorizing the loop, or if we've already collected the 6493 // instructions to scalarize, there's nothing to do. Collection may already 6494 // have occurred if we have a user-selected VF and are now computing the 6495 // expected cost for interleaving. 6496 if (VF.isScalar() || VF.isZero() || 6497 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6498 return; 6499 6500 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6501 // not profitable to scalarize any instructions, the presence of VF in the 6502 // map will indicate that we've analyzed it already. 6503 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6504 6505 // Find all the instructions that are scalar with predication in the loop and 6506 // determine if it would be better to not if-convert the blocks they are in. 6507 // If so, we also record the instructions to scalarize. 6508 for (BasicBlock *BB : TheLoop->blocks()) { 6509 if (!blockNeedsPredicationForAnyReason(BB)) 6510 continue; 6511 for (Instruction &I : *BB) 6512 if (isScalarWithPredication(&I)) { 6513 ScalarCostsTy ScalarCosts; 6514 // Do not apply discount if scalable, because that would lead to 6515 // invalid scalarization costs. 6516 // Do not apply discount logic if hacked cost is needed 6517 // for emulated masked memrefs. 6518 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) && 6519 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6520 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6521 // Remember that BB will remain after vectorization. 
6522 PredicatedBBsAfterVectorization.insert(BB); 6523 } 6524 } 6525 } 6526 6527 int LoopVectorizationCostModel::computePredInstDiscount( 6528 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6529 assert(!isUniformAfterVectorization(PredInst, VF) && 6530 "Instruction marked uniform-after-vectorization will be predicated"); 6531 6532 // Initialize the discount to zero, meaning that the scalar version and the 6533 // vector version cost the same. 6534 InstructionCost Discount = 0; 6535 6536 // Holds instructions to analyze. The instructions we visit are mapped in 6537 // ScalarCosts. Those instructions are the ones that would be scalarized if 6538 // we find that the scalar version costs less. 6539 SmallVector<Instruction *, 8> Worklist; 6540 6541 // Returns true if the given instruction can be scalarized. 6542 auto canBeScalarized = [&](Instruction *I) -> bool { 6543 // We only attempt to scalarize instructions forming a single-use chain 6544 // from the original predicated block that would otherwise be vectorized. 6545 // Although not strictly necessary, we give up on instructions we know will 6546 // already be scalar to avoid traversing chains that are unlikely to be 6547 // beneficial. 6548 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6549 isScalarAfterVectorization(I, VF)) 6550 return false; 6551 6552 // If the instruction is scalar with predication, it will be analyzed 6553 // separately. We ignore it within the context of PredInst. 6554 if (isScalarWithPredication(I)) 6555 return false; 6556 6557 // If any of the instruction's operands are uniform after vectorization, 6558 // the instruction cannot be scalarized. This prevents, for example, a 6559 // masked load from being scalarized. 6560 // 6561 // We assume we will only emit a value for lane zero of an instruction 6562 // marked uniform after vectorization, rather than VF identical values. 6563 // Thus, if we scalarize an instruction that uses a uniform, we would 6564 // create uses of values corresponding to the lanes we aren't emitting code 6565 // for. This behavior can be changed by allowing getScalarValue to clone 6566 // the lane zero values for uniforms rather than asserting. 6567 for (Use &U : I->operands()) 6568 if (auto *J = dyn_cast<Instruction>(U.get())) 6569 if (isUniformAfterVectorization(J, VF)) 6570 return false; 6571 6572 // Otherwise, we can scalarize the instruction. 6573 return true; 6574 }; 6575 6576 // Compute the expected cost discount from scalarizing the entire expression 6577 // feeding the predicated instruction. We currently only consider expressions 6578 // that are single-use instruction chains. 6579 Worklist.push_back(PredInst); 6580 while (!Worklist.empty()) { 6581 Instruction *I = Worklist.pop_back_val(); 6582 6583 // If we've already analyzed the instruction, there's nothing to do. 6584 if (ScalarCosts.find(I) != ScalarCosts.end()) 6585 continue; 6586 6587 // Compute the cost of the vector instruction. Note that this cost already 6588 // includes the scalarization overhead of the predicated instruction. 6589 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6590 6591 // Compute the cost of the scalarized instruction. This cost is the cost of 6592 // the instruction as if it wasn't if-converted and instead remained in the 6593 // predicated block. We will scale this cost by block probability after 6594 // computing the scalarization overhead. 
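    // Sketch with hypothetical costs: for VF=4, a per-lane scalar cost of 2
    // gives 4 * 2 = 8; say insert/extract overhead adds another 4. Scaling by
    // the predicated block's execution probability roughly halves this to 6,
    // so against a vector cost of 10 the discount is 10 - 6 = 4 >= 0 and
    // scalarizing this chain is considered worthwhile.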
6595 InstructionCost ScalarCost = 6596 VF.getFixedValue() * 6597 getInstructionCost(I, ElementCount::getFixed(1)).first; 6598 6599 // Compute the scalarization overhead of needed insertelement instructions 6600 // and phi nodes. 6601 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6602 ScalarCost += TTI.getScalarizationOverhead( 6603 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6604 APInt::getAllOnes(VF.getFixedValue()), true, false); 6605 ScalarCost += 6606 VF.getFixedValue() * 6607 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6608 } 6609 6610 // Compute the scalarization overhead of needed extractelement 6611 // instructions. For each of the instruction's operands, if the operand can 6612 // be scalarized, add it to the worklist; otherwise, account for the 6613 // overhead. 6614 for (Use &U : I->operands()) 6615 if (auto *J = dyn_cast<Instruction>(U.get())) { 6616 assert(VectorType::isValidElementType(J->getType()) && 6617 "Instruction has non-scalar type"); 6618 if (canBeScalarized(J)) 6619 Worklist.push_back(J); 6620 else if (needsExtract(J, VF)) { 6621 ScalarCost += TTI.getScalarizationOverhead( 6622 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6623 APInt::getAllOnes(VF.getFixedValue()), false, true); 6624 } 6625 } 6626 6627 // Scale the total scalar cost by block probability. 6628 ScalarCost /= getReciprocalPredBlockProb(); 6629 6630 // Compute the discount. A non-negative discount means the vector version 6631 // of the instruction costs more, and scalarizing would be beneficial. 6632 Discount += VectorCost - ScalarCost; 6633 ScalarCosts[I] = ScalarCost; 6634 } 6635 6636 return *Discount.getValue(); 6637 } 6638 6639 LoopVectorizationCostModel::VectorizationCostTy 6640 LoopVectorizationCostModel::expectedCost( 6641 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6642 VectorizationCostTy Cost; 6643 6644 // For each block. 6645 for (BasicBlock *BB : TheLoop->blocks()) { 6646 VectorizationCostTy BlockCost; 6647 6648 // For each instruction in the old loop. 6649 for (Instruction &I : BB->instructionsWithoutDebug()) { 6650 // Skip ignored values. 6651 if (ValuesToIgnore.count(&I) || 6652 (VF.isVector() && VecValuesToIgnore.count(&I))) 6653 continue; 6654 6655 VectorizationCostTy C = getInstructionCost(&I, VF); 6656 6657 // Check if we should override the cost. 6658 if (C.first.isValid() && 6659 ForceTargetInstructionCost.getNumOccurrences() > 0) 6660 C.first = InstructionCost(ForceTargetInstructionCost); 6661 6662 // Keep a list of instructions with invalid costs. 6663 if (Invalid && !C.first.isValid()) 6664 Invalid->emplace_back(&I, VF); 6665 6666 BlockCost.first += C.first; 6667 BlockCost.second |= C.second; 6668 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6669 << " for VF " << VF << " For instruction: " << I 6670 << '\n'); 6671 } 6672 6673 // If we are vectorizing a predicated block, it will have been 6674 // if-converted. This means that the block's instructions (aside from 6675 // stores and instructions that may divide by zero) will now be 6676 // unconditionally executed. For the scalar case, we may not always execute 6677 // the predicated block, if it is an if-else block. Thus, scale the block's 6678 // cost by the probability of executing it. blockNeedsPredication from 6679 // Legal is used so as to not include all blocks in tail folded loops. 
6680 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6681 BlockCost.first /= getReciprocalPredBlockProb(); 6682 6683 Cost.first += BlockCost.first; 6684 Cost.second |= BlockCost.second; 6685 } 6686 6687 return Cost; 6688 } 6689 6690 /// Gets Address Access SCEV after verifying that the access pattern 6691 /// is loop invariant except the induction variable dependence. 6692 /// 6693 /// This SCEV can be sent to the Target in order to estimate the address 6694 /// calculation cost. 6695 static const SCEV *getAddressAccessSCEV( 6696 Value *Ptr, 6697 LoopVectorizationLegality *Legal, 6698 PredicatedScalarEvolution &PSE, 6699 const Loop *TheLoop) { 6700 6701 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6702 if (!Gep) 6703 return nullptr; 6704 6705 // We are looking for a gep with all loop invariant indices except for one 6706 // which should be an induction variable. 6707 auto SE = PSE.getSE(); 6708 unsigned NumOperands = Gep->getNumOperands(); 6709 for (unsigned i = 1; i < NumOperands; ++i) { 6710 Value *Opd = Gep->getOperand(i); 6711 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6712 !Legal->isInductionVariable(Opd)) 6713 return nullptr; 6714 } 6715 6716 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6717 return PSE.getSCEV(Ptr); 6718 } 6719 6720 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6721 return Legal->hasStride(I->getOperand(0)) || 6722 Legal->hasStride(I->getOperand(1)); 6723 } 6724 6725 InstructionCost 6726 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6727 ElementCount VF) { 6728 assert(VF.isVector() && 6729 "Scalarization cost of instruction implies vectorization."); 6730 if (VF.isScalable()) 6731 return InstructionCost::getInvalid(); 6732 6733 Type *ValTy = getLoadStoreType(I); 6734 auto SE = PSE.getSE(); 6735 6736 unsigned AS = getLoadStoreAddressSpace(I); 6737 Value *Ptr = getLoadStorePointerOperand(I); 6738 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6739 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6740 // that it is being called from this specific place. 6741 6742 // Figure out whether the access is strided and get the stride value 6743 // if it's known in compile time 6744 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6745 6746 // Get the cost of the scalar memory instruction and address computation. 6747 InstructionCost Cost = 6748 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6749 6750 // Don't pass *I here, since it is scalar but will actually be part of a 6751 // vectorized loop where the user of it is a vectorized instruction. 6752 const Align Alignment = getLoadStoreAlignment(I); 6753 Cost += VF.getKnownMinValue() * 6754 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6755 AS, TTI::TCK_RecipThroughput); 6756 6757 // Get the overhead of the extractelement and insertelement instructions 6758 // we might create due to scalarization. 6759 Cost += getScalarizationOverhead(I, VF); 6760 6761 // If we have a predicated load/store, it will need extra i1 extracts and 6762 // conditional branches, but may not be executed for each vector lane. Scale 6763 // the cost by the probability of executing the predicated block. 
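  // Hypothetical illustration: for VF=4 with a per-element address
  // computation cost of 1 and a scalar memory-op cost of 2, the estimate
  // above is 4 * 1 + 4 * 2 = 12 plus extract/insert overhead; the predicated
  // path below then scales that by the block's execution probability and
  // adds the per-lane i1 extract and branch costs.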
6764 if (isPredicatedInst(I)) { 6765 Cost /= getReciprocalPredBlockProb(); 6766 6767 // Add the cost of an i1 extract and a branch 6768 auto *Vec_i1Ty = 6769 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6770 Cost += TTI.getScalarizationOverhead( 6771 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6772 /*Insert=*/false, /*Extract=*/true); 6773 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6774 6775 if (useEmulatedMaskMemRefHack(I)) 6776 // Artificially setting to a high enough value to practically disable 6777 // vectorization with such operations. 6778 Cost = 3000000; 6779 } 6780 6781 return Cost; 6782 } 6783 6784 InstructionCost 6785 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6786 ElementCount VF) { 6787 Type *ValTy = getLoadStoreType(I); 6788 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6789 Value *Ptr = getLoadStorePointerOperand(I); 6790 unsigned AS = getLoadStoreAddressSpace(I); 6791 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6792 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6793 6794 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6795 "Stride should be 1 or -1 for consecutive memory access"); 6796 const Align Alignment = getLoadStoreAlignment(I); 6797 InstructionCost Cost = 0; 6798 if (Legal->isMaskRequired(I)) 6799 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6800 CostKind); 6801 else 6802 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6803 CostKind, I); 6804 6805 bool Reverse = ConsecutiveStride < 0; 6806 if (Reverse) 6807 Cost += 6808 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6809 return Cost; 6810 } 6811 6812 InstructionCost 6813 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6814 ElementCount VF) { 6815 assert(Legal->isUniformMemOp(*I)); 6816 6817 Type *ValTy = getLoadStoreType(I); 6818 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6819 const Align Alignment = getLoadStoreAlignment(I); 6820 unsigned AS = getLoadStoreAddressSpace(I); 6821 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6822 if (isa<LoadInst>(I)) { 6823 return TTI.getAddressComputationCost(ValTy) + 6824 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6825 CostKind) + 6826 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6827 } 6828 StoreInst *SI = cast<StoreInst>(I); 6829 6830 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6831 return TTI.getAddressComputationCost(ValTy) + 6832 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6833 CostKind) + 6834 (isLoopInvariantStoreValue 6835 ? 
0 6836 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6837 VF.getKnownMinValue() - 1));
6838 }
6839
6840 InstructionCost
6841 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6842 ElementCount VF) {
6843 Type *ValTy = getLoadStoreType(I);
6844 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6845 const Align Alignment = getLoadStoreAlignment(I);
6846 const Value *Ptr = getLoadStorePointerOperand(I);
6847
6848 return TTI.getAddressComputationCost(VectorTy) +
6849 TTI.getGatherScatterOpCost(
6850 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6851 TargetTransformInfo::TCK_RecipThroughput, I);
6852 }
6853
6854 InstructionCost
6855 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6856 ElementCount VF) {
6857 // TODO: Once we have support for interleaving with scalable vectors
6858 // we can calculate the cost properly here.
6859 if (VF.isScalable())
6860 return InstructionCost::getInvalid();
6861
6862 Type *ValTy = getLoadStoreType(I);
6863 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6864 unsigned AS = getLoadStoreAddressSpace(I);
6865
6866 auto Group = getInterleavedAccessGroup(I);
6867 assert(Group && "Fail to get an interleaved access group.");
6868
6869 unsigned InterleaveFactor = Group->getFactor();
6870 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6871
6872 // Holds the indices of existing members in the interleaved group.
6873 SmallVector<unsigned, 4> Indices;
6874 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6875 if (Group->getMember(IF))
6876 Indices.push_back(IF);
6877
6878 // Calculate the cost of the whole interleaved group.
6879 bool UseMaskForGaps =
6880 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6881 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6882 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6883 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6884 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6885
6886 if (Group->isReverse()) {
6887 // TODO: Add support for reversed masked interleaved access.
6888 assert(!Legal->isMaskRequired(I) &&
6889 "Reverse masked interleaved access not supported.");
6890 Cost +=
6891 Group->getNumMembers() *
6892 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6893 }
6894 return Cost;
6895 }
6896
6897 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
6898 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6899 using namespace llvm::PatternMatch;
6900 // Early exit for no inloop reductions
6901 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6902 return None;
6903 auto *VectorTy = cast<VectorType>(Ty);
6904
6905 // We are looking for a pattern of, and finding the minimal acceptable cost:
6906 // reduce(mul(ext(A), ext(B))) or
6907 // reduce(mul(A, B)) or
6908 // reduce(ext(A)) or
6909 // reduce(A).
6910 // The basic idea is that we walk down the tree to do that, finding the root
6911 // reduction instruction in InLoopReductionImmediateChains. From there we find
6912 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6913 // of the components. If the reduction cost is lower, then we return it for the
6914 // reduction instruction and 0 for the other instructions in the pattern. If
6915 // it is not, we return an invalid cost specifying that the original cost method
6916 // should be used.
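// Illustrative example (names and types are invented): for an in-loop
// integer multiply-accumulate reduction such as
//   %a.ext = sext i8 %a to i32
//   %b.ext = sext i8 %b to i32
//   %mul   = mul i32 %a.ext, %b.ext
//   %red   = add i32 %red.phi, %mul
// the walk below starts at the ext or mul it was queried on, follows the
// single-user chain to the add, and, if that add is recorded in
// InLoopReductionImmediateChains, compares the target's extended-add
// reduction cost against the sum of the separate ext/mul costs plus the
// plain reduction cost.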
6917 Instruction *RetI = I; 6918 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 6919 if (!RetI->hasOneUser()) 6920 return None; 6921 RetI = RetI->user_back(); 6922 } 6923 if (match(RetI, m_Mul(m_Value(), m_Value())) && 6924 RetI->user_back()->getOpcode() == Instruction::Add) { 6925 if (!RetI->hasOneUser()) 6926 return None; 6927 RetI = RetI->user_back(); 6928 } 6929 6930 // Test if the found instruction is a reduction, and if not return an invalid 6931 // cost specifying the parent to use the original cost modelling. 6932 if (!InLoopReductionImmediateChains.count(RetI)) 6933 return None; 6934 6935 // Find the reduction this chain is a part of and calculate the basic cost of 6936 // the reduction on its own. 6937 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6938 Instruction *ReductionPhi = LastChain; 6939 while (!isa<PHINode>(ReductionPhi)) 6940 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6941 6942 const RecurrenceDescriptor &RdxDesc = 6943 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6944 6945 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6946 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6947 6948 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6949 // normal fmul instruction to the cost of the fadd reduction. 6950 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6951 BaseCost += 6952 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6953 6954 // If we're using ordered reductions then we can just return the base cost 6955 // here, since getArithmeticReductionCost calculates the full ordered 6956 // reduction cost when FP reassociation is not allowed. 6957 if (useOrderedReductions(RdxDesc)) 6958 return BaseCost; 6959 6960 // Get the operand that was not the reduction chain and match it to one of the 6961 // patterns, returning the better cost if it is found. 6962 Instruction *RedOp = RetI->getOperand(1) == LastChain 6963 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6964 : dyn_cast<Instruction>(RetI->getOperand(1)); 6965 6966 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6967 6968 Instruction *Op0, *Op1; 6969 if (RedOp && 6970 match(RedOp, 6971 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6972 match(Op0, m_ZExtOrSExt(m_Value())) && 6973 Op0->getOpcode() == Op1->getOpcode() && 6974 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6975 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6976 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6977 6978 // Matched reduce(ext(mul(ext(A), ext(B))) 6979 // Note that the extend opcodes need to all match, or if A==B they will have 6980 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6981 // which is equally fine. 
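// For instance (an invented example): with A and B loaded as i8, the inner
// extends producing an i16 mul, and the outer extend widening it to the i32
// recurrence type,
//   reduce.add(sext (mul (sext A), (sext B)) to i32)
// RedCost below prices the whole multiply-accumulate reduction in one go,
// while ExtCost/MulCost/Ext2Cost price its pieces individually.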
6982 bool IsUnsigned = isa<ZExtInst>(Op0); 6983 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6984 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6985 6986 InstructionCost ExtCost = 6987 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6988 TTI::CastContextHint::None, CostKind, Op0); 6989 InstructionCost MulCost = 6990 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6991 InstructionCost Ext2Cost = 6992 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6993 TTI::CastContextHint::None, CostKind, RedOp); 6994 6995 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6996 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6997 CostKind); 6998 6999 if (RedCost.isValid() && 7000 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 7001 return I == RetI ? RedCost : 0; 7002 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 7003 !TheLoop->isLoopInvariant(RedOp)) { 7004 // Matched reduce(ext(A)) 7005 bool IsUnsigned = isa<ZExtInst>(RedOp); 7006 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7007 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7008 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7009 CostKind); 7010 7011 InstructionCost ExtCost = 7012 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7013 TTI::CastContextHint::None, CostKind, RedOp); 7014 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7015 return I == RetI ? RedCost : 0; 7016 } else if (RedOp && 7017 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 7018 if (match(Op0, m_ZExtOrSExt(m_Value())) && 7019 Op0->getOpcode() == Op1->getOpcode() && 7020 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7021 bool IsUnsigned = isa<ZExtInst>(Op0); 7022 Type *Op0Ty = Op0->getOperand(0)->getType(); 7023 Type *Op1Ty = Op1->getOperand(0)->getType(); 7024 Type *LargestOpTy = 7025 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 7026 : Op0Ty; 7027 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 7028 7029 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 7030 // different sizes. We take the largest type as the ext to reduce, and add 7031 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 7032 InstructionCost ExtCost0 = TTI.getCastInstrCost( 7033 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 7034 TTI::CastContextHint::None, CostKind, Op0); 7035 InstructionCost ExtCost1 = TTI.getCastInstrCost( 7036 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 7037 TTI::CastContextHint::None, CostKind, Op1); 7038 InstructionCost MulCost = 7039 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7040 7041 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7042 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7043 CostKind); 7044 InstructionCost ExtraExtCost = 0; 7045 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 7046 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 7047 ExtraExtCost = TTI.getCastInstrCost( 7048 ExtraExtOp->getOpcode(), ExtType, 7049 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 7050 TTI::CastContextHint::None, CostKind, ExtraExtOp); 7051 } 7052 7053 if (RedCost.isValid() && 7054 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 7055 return I == RetI ? 
RedCost : 0; 7056 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 7057 // Matched reduce(mul()) 7058 InstructionCost MulCost = 7059 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7060 7061 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7062 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7063 CostKind); 7064 7065 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7066 return I == RetI ? RedCost : 0; 7067 } 7068 } 7069 7070 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 7071 } 7072 7073 InstructionCost 7074 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7075 ElementCount VF) { 7076 // Calculate scalar cost only. Vectorization cost should be ready at this 7077 // moment. 7078 if (VF.isScalar()) { 7079 Type *ValTy = getLoadStoreType(I); 7080 const Align Alignment = getLoadStoreAlignment(I); 7081 unsigned AS = getLoadStoreAddressSpace(I); 7082 7083 return TTI.getAddressComputationCost(ValTy) + 7084 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7085 TTI::TCK_RecipThroughput, I); 7086 } 7087 return getWideningCost(I, VF); 7088 } 7089 7090 LoopVectorizationCostModel::VectorizationCostTy 7091 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7092 ElementCount VF) { 7093 // If we know that this instruction will remain uniform, check the cost of 7094 // the scalar version. 7095 if (isUniformAfterVectorization(I, VF)) 7096 VF = ElementCount::getFixed(1); 7097 7098 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7099 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7100 7101 // Forced scalars do not have any scalarization overhead. 7102 auto ForcedScalar = ForcedScalars.find(VF); 7103 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7104 auto InstSet = ForcedScalar->second; 7105 if (InstSet.count(I)) 7106 return VectorizationCostTy( 7107 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7108 VF.getKnownMinValue()), 7109 false); 7110 } 7111 7112 Type *VectorTy; 7113 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7114 7115 bool TypeNotScalarized = false; 7116 if (VF.isVector() && VectorTy->isVectorTy()) { 7117 unsigned NumParts = TTI.getNumberOfParts(VectorTy); 7118 if (NumParts) 7119 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 7120 else 7121 C = InstructionCost::getInvalid(); 7122 } 7123 return VectorizationCostTy(C, TypeNotScalarized); 7124 } 7125 7126 InstructionCost 7127 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7128 ElementCount VF) const { 7129 7130 // There is no mechanism yet to create a scalable scalarization loop, 7131 // so this is currently Invalid. 7132 if (VF.isScalable()) 7133 return InstructionCost::getInvalid(); 7134 7135 if (VF.isScalar()) 7136 return 0; 7137 7138 InstructionCost Cost = 0; 7139 Type *RetTy = ToVectorTy(I->getType(), VF); 7140 if (!RetTy->isVoidTy() && 7141 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7142 Cost += TTI.getScalarizationOverhead( 7143 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 7144 false); 7145 7146 // Some targets keep addresses scalar. 7147 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7148 return Cost; 7149 7150 // Some targets support efficient element stores. 7151 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7152 return Cost; 7153 7154 // Collect operands to consider. 7155 CallInst *CI = dyn_cast<CallInst>(I); 7156 Instruction::op_range Ops = CI ? 
CI->args() : I->operands(); 7157 7158 // Skip operands that do not require extraction/scalarization and do not incur 7159 // any overhead. 7160 SmallVector<Type *> Tys; 7161 for (auto *V : filterExtractingOperands(Ops, VF)) 7162 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7163 return Cost + TTI.getOperandsScalarizationOverhead( 7164 filterExtractingOperands(Ops, VF), Tys); 7165 } 7166 7167 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7168 if (VF.isScalar()) 7169 return; 7170 NumPredStores = 0; 7171 for (BasicBlock *BB : TheLoop->blocks()) { 7172 // For each instruction in the old loop. 7173 for (Instruction &I : *BB) { 7174 Value *Ptr = getLoadStorePointerOperand(&I); 7175 if (!Ptr) 7176 continue; 7177 7178 // TODO: We should generate better code and update the cost model for 7179 // predicated uniform stores. Today they are treated as any other 7180 // predicated store (see added test cases in 7181 // invariant-store-vectorization.ll). 7182 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7183 NumPredStores++; 7184 7185 if (Legal->isUniformMemOp(I)) { 7186 // TODO: Avoid replicating loads and stores instead of 7187 // relying on instcombine to remove them. 7188 // Load: Scalar load + broadcast 7189 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7190 InstructionCost Cost; 7191 if (isa<StoreInst>(&I) && VF.isScalable() && 7192 isLegalGatherOrScatter(&I)) { 7193 Cost = getGatherScatterCost(&I, VF); 7194 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 7195 } else { 7196 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 7197 "Cannot yet scalarize uniform stores"); 7198 Cost = getUniformMemOpCost(&I, VF); 7199 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7200 } 7201 continue; 7202 } 7203 7204 // We assume that widening is the best solution when possible. 7205 if (memoryInstructionCanBeWidened(&I, VF)) { 7206 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7207 int ConsecutiveStride = Legal->isConsecutivePtr( 7208 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 7209 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7210 "Expected consecutive stride."); 7211 InstWidening Decision = 7212 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7213 setWideningDecision(&I, VF, Decision, Cost); 7214 continue; 7215 } 7216 7217 // Choose between Interleaving, Gather/Scatter or Scalarization. 7218 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7219 unsigned NumAccesses = 1; 7220 if (isAccessInterleaved(&I)) { 7221 auto Group = getInterleavedAccessGroup(&I); 7222 assert(Group && "Fail to get an interleaved access group."); 7223 7224 // Make one decision for the whole group. 7225 if (getWideningDecision(&I, VF) != CM_Unknown) 7226 continue; 7227 7228 NumAccesses = Group->getNumMembers(); 7229 if (interleavedAccessCanBeWidened(&I, VF)) 7230 InterleaveCost = getInterleaveGroupCost(&I, VF); 7231 } 7232 7233 InstructionCost GatherScatterCost = 7234 isLegalGatherOrScatter(&I) 7235 ? getGatherScatterCost(&I, VF) * NumAccesses 7236 : InstructionCost::getInvalid(); 7237 7238 InstructionCost ScalarizationCost = 7239 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7240 7241 // Choose better solution for the current VF, 7242 // write down this decision and use it during vectorization. 
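// Illustrative tie-breaking (costs invented): with InterleaveCost == 6,
// GatherScatterCost == 6 and ScalarizationCost == 10, interleaving wins
// because the comparison against gather/scatter below uses <= while the one
// against scalarization is strict. If gather/scatter and scalarization tie,
// scalarization is picked, since that comparison is strict as well.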
7243 InstructionCost Cost; 7244 InstWidening Decision; 7245 if (InterleaveCost <= GatherScatterCost && 7246 InterleaveCost < ScalarizationCost) { 7247 Decision = CM_Interleave; 7248 Cost = InterleaveCost; 7249 } else if (GatherScatterCost < ScalarizationCost) { 7250 Decision = CM_GatherScatter; 7251 Cost = GatherScatterCost; 7252 } else { 7253 Decision = CM_Scalarize; 7254 Cost = ScalarizationCost; 7255 } 7256 // If the instructions belongs to an interleave group, the whole group 7257 // receives the same decision. The whole group receives the cost, but 7258 // the cost will actually be assigned to one instruction. 7259 if (auto Group = getInterleavedAccessGroup(&I)) 7260 setWideningDecision(Group, VF, Decision, Cost); 7261 else 7262 setWideningDecision(&I, VF, Decision, Cost); 7263 } 7264 } 7265 7266 // Make sure that any load of address and any other address computation 7267 // remains scalar unless there is gather/scatter support. This avoids 7268 // inevitable extracts into address registers, and also has the benefit of 7269 // activating LSR more, since that pass can't optimize vectorized 7270 // addresses. 7271 if (TTI.prefersVectorizedAddressing()) 7272 return; 7273 7274 // Start with all scalar pointer uses. 7275 SmallPtrSet<Instruction *, 8> AddrDefs; 7276 for (BasicBlock *BB : TheLoop->blocks()) 7277 for (Instruction &I : *BB) { 7278 Instruction *PtrDef = 7279 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7280 if (PtrDef && TheLoop->contains(PtrDef) && 7281 getWideningDecision(&I, VF) != CM_GatherScatter) 7282 AddrDefs.insert(PtrDef); 7283 } 7284 7285 // Add all instructions used to generate the addresses. 7286 SmallVector<Instruction *, 4> Worklist; 7287 append_range(Worklist, AddrDefs); 7288 while (!Worklist.empty()) { 7289 Instruction *I = Worklist.pop_back_val(); 7290 for (auto &Op : I->operands()) 7291 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7292 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7293 AddrDefs.insert(InstOp).second) 7294 Worklist.push_back(InstOp); 7295 } 7296 7297 for (auto *I : AddrDefs) { 7298 if (isa<LoadInst>(I)) { 7299 // Setting the desired widening decision should ideally be handled in 7300 // by cost functions, but since this involves the task of finding out 7301 // if the loaded register is involved in an address computation, it is 7302 // instead changed here when we know this is the case. 7303 InstWidening Decision = getWideningDecision(I, VF); 7304 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7305 // Scalarize a widened load of address. 7306 setWideningDecision( 7307 I, VF, CM_Scalarize, 7308 (VF.getKnownMinValue() * 7309 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7310 else if (auto Group = getInterleavedAccessGroup(I)) { 7311 // Scalarize an interleave group of address loads. 7312 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7313 if (Instruction *Member = Group->getMember(I)) 7314 setWideningDecision( 7315 Member, VF, CM_Scalarize, 7316 (VF.getKnownMinValue() * 7317 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7318 } 7319 } 7320 } else 7321 // Make sure I gets scalarized and a cost estimate without 7322 // scalarization overhead. 
7323 ForcedScalars[VF].insert(I);
7324 }
7325 }
7326
7327 InstructionCost
7328 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7329 Type *&VectorTy) {
7330 Type *RetTy = I->getType();
7331 if (canTruncateToMinimalBitwidth(I, VF))
7332 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7333 auto SE = PSE.getSE();
7334 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7335
7336 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7337 ElementCount VF) -> bool {
7338 if (VF.isScalar())
7339 return true;
7340
7341 auto Scalarized = InstsToScalarize.find(VF);
7342 assert(Scalarized != InstsToScalarize.end() &&
7343 "VF not yet analyzed for scalarization profitability");
7344 return !Scalarized->second.count(I) &&
7345 llvm::all_of(I->users(), [&](User *U) {
7346 auto *UI = cast<Instruction>(U);
7347 return !Scalarized->second.count(UI);
7348 });
7349 };
7350 (void) hasSingleCopyAfterVectorization;
7351
7352 if (isScalarAfterVectorization(I, VF)) {
7353 // With the exception of GEPs and PHIs, after scalarization there should
7354 // only be one copy of the instruction generated in the loop. This is
7355 // because the VF is either 1, or any instructions that need scalarizing
7356 // have already been dealt with by the time we get here. As a result,
7357 // we don't have to multiply the instruction cost by VF.
7358 assert(I->getOpcode() == Instruction::GetElementPtr ||
7359 I->getOpcode() == Instruction::PHI ||
7360 (I->getOpcode() == Instruction::BitCast &&
7361 I->getType()->isPointerTy()) ||
7362 hasSingleCopyAfterVectorization(I, VF));
7363 VectorTy = RetTy;
7364 } else
7365 VectorTy = ToVectorTy(RetTy, VF);
7366
7367 // TODO: We need to estimate the cost of intrinsic calls.
7368 switch (I->getOpcode()) {
7369 case Instruction::GetElementPtr:
7370 // We mark this instruction as zero-cost because the cost of GEPs in
7371 // vectorized code depends on whether the corresponding memory instruction
7372 // is scalarized or not. Therefore, we handle GEPs with the memory
7373 // instruction cost.
7374 return 0;
7375 case Instruction::Br: {
7376 // In cases of scalarized and predicated instructions, there will be VF
7377 // predicated blocks in the vectorized loop. Each branch around these
7378 // blocks also requires an extract of its vector compare i1 element.
7379 bool ScalarPredicatedBB = false;
7380 BranchInst *BI = cast<BranchInst>(I);
7381 if (VF.isVector() && BI->isConditional() &&
7382 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7383 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7384 ScalarPredicatedBB = true;
7385
7386 if (ScalarPredicatedBB) {
7387 // It is not possible to scalarize a scalable vector with predicated instructions.
7388 if (VF.isScalable())
7389 return InstructionCost::getInvalid();
7390 // Return cost for branches around scalarized and predicated blocks.
7391 auto *Vec_i1Ty =
7392 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7393 return (
7394 TTI.getScalarizationOverhead(
7395 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
7396 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7397 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7398 // The back-edge branch will remain, as will all scalar branches.
7399 return TTI.getCFInstrCost(Instruction::Br, CostKind);
7400 else
7401 // This branch will be eliminated by if-conversion.
7402 return 0; 7403 // Note: We currently assume zero cost for an unconditional branch inside 7404 // a predicated block since it will become a fall-through, although we 7405 // may decide in the future to call TTI for all branches. 7406 } 7407 case Instruction::PHI: { 7408 auto *Phi = cast<PHINode>(I); 7409 7410 // First-order recurrences are replaced by vector shuffles inside the loop. 7411 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7412 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7413 return TTI.getShuffleCost( 7414 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7415 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7416 7417 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7418 // converted into select instructions. We require N - 1 selects per phi 7419 // node, where N is the number of incoming values. 7420 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7421 return (Phi->getNumIncomingValues() - 1) * 7422 TTI.getCmpSelInstrCost( 7423 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7424 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7425 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7426 7427 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7428 } 7429 case Instruction::UDiv: 7430 case Instruction::SDiv: 7431 case Instruction::URem: 7432 case Instruction::SRem: 7433 // If we have a predicated instruction, it may not be executed for each 7434 // vector lane. Get the scalarization cost and scale this amount by the 7435 // probability of executing the predicated block. If the instruction is not 7436 // predicated, we fall through to the next case. 7437 if (VF.isVector() && isScalarWithPredication(I)) { 7438 InstructionCost Cost = 0; 7439 7440 // These instructions have a non-void type, so account for the phi nodes 7441 // that we will create. This cost is likely to be zero. The phi node 7442 // cost, if any, should be scaled by the block probability because it 7443 // models a copy at the end of each predicated block. 7444 Cost += VF.getKnownMinValue() * 7445 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7446 7447 // The cost of the non-predicated instruction. 7448 Cost += VF.getKnownMinValue() * 7449 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7450 7451 // The cost of insertelement and extractelement instructions needed for 7452 // scalarization. 7453 Cost += getScalarizationOverhead(I, VF); 7454 7455 // Scale the cost by the probability of executing the predicated blocks. 7456 // This assumes the predicated block for each vector lane is equally 7457 // likely. 7458 return Cost / getReciprocalPredBlockProb(); 7459 } 7460 LLVM_FALLTHROUGH; 7461 case Instruction::Add: 7462 case Instruction::FAdd: 7463 case Instruction::Sub: 7464 case Instruction::FSub: 7465 case Instruction::Mul: 7466 case Instruction::FMul: 7467 case Instruction::FDiv: 7468 case Instruction::FRem: 7469 case Instruction::Shl: 7470 case Instruction::LShr: 7471 case Instruction::AShr: 7472 case Instruction::And: 7473 case Instruction::Or: 7474 case Instruction::Xor: { 7475 // Since we will replace the stride by 1 the multiplication should go away. 
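    // (Illustrative sketch of the versioned-stride case this refers to:
    //   %offset = mul i64 %i, %stride
    // where the loop has been versioned under the assumption %stride == 1,
    // so the multiply folds away in the vectorized copy and is costed at 0.)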
7476 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7477 return 0; 7478 7479 // Detect reduction patterns 7480 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7481 return *RedCost; 7482 7483 // Certain instructions can be cheaper to vectorize if they have a constant 7484 // second vector operand. One example of this are shifts on x86. 7485 Value *Op2 = I->getOperand(1); 7486 TargetTransformInfo::OperandValueProperties Op2VP; 7487 TargetTransformInfo::OperandValueKind Op2VK = 7488 TTI.getOperandInfo(Op2, Op2VP); 7489 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7490 Op2VK = TargetTransformInfo::OK_UniformValue; 7491 7492 SmallVector<const Value *, 4> Operands(I->operand_values()); 7493 return TTI.getArithmeticInstrCost( 7494 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7495 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7496 } 7497 case Instruction::FNeg: { 7498 return TTI.getArithmeticInstrCost( 7499 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7500 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7501 TargetTransformInfo::OP_None, I->getOperand(0), I); 7502 } 7503 case Instruction::Select: { 7504 SelectInst *SI = cast<SelectInst>(I); 7505 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7506 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7507 7508 const Value *Op0, *Op1; 7509 using namespace llvm::PatternMatch; 7510 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7511 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7512 // select x, y, false --> x & y 7513 // select x, true, y --> x | y 7514 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7515 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7516 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7517 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7518 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7519 Op1->getType()->getScalarSizeInBits() == 1); 7520 7521 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7522 return TTI.getArithmeticInstrCost( 7523 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7524 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7525 } 7526 7527 Type *CondTy = SI->getCondition()->getType(); 7528 if (!ScalarCond) 7529 CondTy = VectorType::get(CondTy, VF); 7530 7531 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7532 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7533 Pred = Cmp->getPredicate(); 7534 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7535 CostKind, I); 7536 } 7537 case Instruction::ICmp: 7538 case Instruction::FCmp: { 7539 Type *ValTy = I->getOperand(0)->getType(); 7540 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7541 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7542 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7543 VectorTy = ToVectorTy(ValTy, VF); 7544 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7545 cast<CmpInst>(I)->getPredicate(), CostKind, 7546 I); 7547 } 7548 case Instruction::Store: 7549 case Instruction::Load: { 7550 ElementCount Width = VF; 7551 if (Width.isVector()) { 7552 InstWidening Decision = getWideningDecision(I, Width); 7553 assert(Decision != CM_Unknown && 7554 "CM decision should be taken at this point"); 7555 if (Decision == CM_Scalarize) 7556 Width = ElementCount::getFixed(1); 7557 } 7558 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7559 return getMemoryInstructionCost(I, VF); 7560 } 7561 case Instruction::BitCast: 7562 if (I->getType()->isPointerTy()) 7563 return 0; 7564 LLVM_FALLTHROUGH; 7565 case Instruction::ZExt: 7566 case Instruction::SExt: 7567 case Instruction::FPToUI: 7568 case Instruction::FPToSI: 7569 case Instruction::FPExt: 7570 case Instruction::PtrToInt: 7571 case Instruction::IntToPtr: 7572 case Instruction::SIToFP: 7573 case Instruction::UIToFP: 7574 case Instruction::Trunc: 7575 case Instruction::FPTrunc: { 7576 // Computes the CastContextHint from a Load/Store instruction. 7577 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7578 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7579 "Expected a load or a store!"); 7580 7581 if (VF.isScalar() || !TheLoop->contains(I)) 7582 return TTI::CastContextHint::Normal; 7583 7584 switch (getWideningDecision(I, VF)) { 7585 case LoopVectorizationCostModel::CM_GatherScatter: 7586 return TTI::CastContextHint::GatherScatter; 7587 case LoopVectorizationCostModel::CM_Interleave: 7588 return TTI::CastContextHint::Interleave; 7589 case LoopVectorizationCostModel::CM_Scalarize: 7590 case LoopVectorizationCostModel::CM_Widen: 7591 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7592 : TTI::CastContextHint::Normal; 7593 case LoopVectorizationCostModel::CM_Widen_Reverse: 7594 return TTI::CastContextHint::Reversed; 7595 case LoopVectorizationCostModel::CM_Unknown: 7596 llvm_unreachable("Instr did not go through cost modelling?"); 7597 } 7598 7599 llvm_unreachable("Unhandled case!"); 7600 }; 7601 7602 unsigned Opcode = I->getOpcode(); 7603 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7604 // For Trunc, the context is the only user, which must be a StoreInst. 7605 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7606 if (I->hasOneUse()) 7607 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7608 CCH = ComputeCCH(Store); 7609 } 7610 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7611 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7612 Opcode == Instruction::FPExt) { 7613 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7614 CCH = ComputeCCH(Load); 7615 } 7616 7617 // We optimize the truncation of induction variables having constant 7618 // integer steps. The cost of these truncations is the same as the scalar 7619 // operation. 7620 if (isOptimizableIVTruncate(I, VF)) { 7621 auto *Trunc = cast<TruncInst>(I); 7622 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7623 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7624 } 7625 7626 // Detect reduction patterns 7627 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7628 return *RedCost; 7629 7630 Type *SrcScalarTy = I->getOperand(0)->getType(); 7631 Type *SrcVecTy = 7632 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7633 if (canTruncateToMinimalBitwidth(I, VF)) { 7634 // This cast is going to be shrunk. This may remove the cast or it might 7635 // turn it into slightly different cast. For example, if MinBW == 16, 7636 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7637 // 7638 // Calculate the modified src and dest types. 7639 Type *MinVecTy = VectorTy; 7640 if (Opcode == Instruction::Trunc) { 7641 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7642 VectorTy = 7643 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7644 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7645 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7646 VectorTy = 7647 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7648 } 7649 } 7650 7651 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7652 } 7653 case Instruction::Call: { 7654 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7655 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7656 return *RedCost; 7657 bool NeedToScalarize; 7658 CallInst *CI = cast<CallInst>(I); 7659 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7660 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7661 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7662 return std::min(CallCost, IntrinsicCost); 7663 } 7664 return CallCost; 7665 } 7666 case Instruction::ExtractValue: 7667 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7668 case Instruction::Alloca: 7669 // We cannot easily widen alloca to a scalable alloca, as 7670 // the result would need to be a vector of pointers. 7671 if (VF.isScalable()) 7672 return InstructionCost::getInvalid(); 7673 LLVM_FALLTHROUGH; 7674 default: 7675 // This opcode is unknown. Assume that it is the same as 'mul'. 7676 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7677 } // end of switch. 
7678 } 7679 7680 char LoopVectorize::ID = 0; 7681 7682 static const char lv_name[] = "Loop Vectorization"; 7683 7684 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7685 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7686 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7687 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7688 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7689 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7690 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7691 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7692 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7693 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7694 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7695 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7696 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7697 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7698 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7699 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7700 7701 namespace llvm { 7702 7703 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7704 7705 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7706 bool VectorizeOnlyWhenForced) { 7707 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7708 } 7709 7710 } // end namespace llvm 7711 7712 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7713 // Check if the pointer operand of a load or store instruction is 7714 // consecutive. 7715 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7716 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7717 return false; 7718 } 7719 7720 void LoopVectorizationCostModel::collectValuesToIgnore() { 7721 // Ignore ephemeral values. 7722 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7723 7724 // Ignore type-promoting instructions we identified during reduction 7725 // detection. 7726 for (auto &Reduction : Legal->getReductionVars()) { 7727 const RecurrenceDescriptor &RedDes = Reduction.second; 7728 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7729 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7730 } 7731 // Ignore type-casting instructions we identified during induction 7732 // detection. 7733 for (auto &Induction : Legal->getInductionVars()) { 7734 const InductionDescriptor &IndDes = Induction.second; 7735 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7736 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7737 } 7738 } 7739 7740 void LoopVectorizationCostModel::collectInLoopReductions() { 7741 for (auto &Reduction : Legal->getReductionVars()) { 7742 PHINode *Phi = Reduction.first; 7743 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7744 7745 // We don't collect reductions that are type promoted (yet). 7746 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7747 continue; 7748 7749 // If the target would prefer this reduction to happen "in-loop", then we 7750 // want to record it as such. 7751 unsigned Opcode = RdxDesc.getOpcode(); 7752 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7753 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7754 TargetTransformInfo::ReductionFlags())) 7755 continue; 7756 7757 // Check that we can correctly put the reductions into the loop, by 7758 // finding the chain of operations that leads from the phi to the loop 7759 // exit value. 
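    // Illustrative chain (names invented): for
    //   %sum.phi  = phi i32 [ 0, %preheader ], [ %sum.next, %latch ]
    //   %sum.next = add i32 %sum.phi, %val
    // the returned chain would be { %sum.next }, and the map built below
    // records %sum.phi as the immediate predecessor of %sum.next so the cost
    // model can later walk from any chain member back to the phi.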
7760 SmallVector<Instruction *, 4> ReductionOperations = 7761 RdxDesc.getReductionOpChain(Phi, TheLoop); 7762 bool InLoop = !ReductionOperations.empty(); 7763 if (InLoop) { 7764 InLoopReductionChains[Phi] = ReductionOperations; 7765 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7766 Instruction *LastChain = Phi; 7767 for (auto *I : ReductionOperations) { 7768 InLoopReductionImmediateChains[I] = LastChain; 7769 LastChain = I; 7770 } 7771 } 7772 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7773 << " reduction for phi: " << *Phi << "\n"); 7774 } 7775 } 7776 7777 // TODO: we could return a pair of values that specify the max VF and 7778 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7779 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7780 // doesn't have a cost model that can choose which plan to execute if 7781 // more than one is generated. 7782 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7783 LoopVectorizationCostModel &CM) { 7784 unsigned WidestType; 7785 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7786 return WidestVectorRegBits / WidestType; 7787 } 7788 7789 VectorizationFactor 7790 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7791 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7792 ElementCount VF = UserVF; 7793 // Outer loop handling: They may require CFG and instruction level 7794 // transformations before even evaluating whether vectorization is profitable. 7795 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7796 // the vectorization pipeline. 7797 if (!OrigLoop->isInnermost()) { 7798 // If the user doesn't provide a vectorization factor, determine a 7799 // reasonable one. 7800 if (UserVF.isZero()) { 7801 VF = ElementCount::getFixed(determineVPlanVF( 7802 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7803 .getFixedSize(), 7804 CM)); 7805 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7806 7807 // Make sure we have a VF > 1 for stress testing. 7808 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7809 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7810 << "overriding computed VF.\n"); 7811 VF = ElementCount::getFixed(4); 7812 } 7813 } 7814 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7815 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7816 "VF needs to be a power of two"); 7817 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7818 << "VF " << VF << " to build VPlans.\n"); 7819 buildVPlans(VF, VF); 7820 7821 // For VPlan build stress testing, we bail out after VPlan construction. 7822 if (VPlanBuildStressTest) 7823 return VectorizationFactor::Disabled(); 7824 7825 return {VF, 0 /*Cost*/}; 7826 } 7827 7828 LLVM_DEBUG( 7829 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7830 "VPlan-native path.\n"); 7831 return VectorizationFactor::Disabled(); 7832 } 7833 7834 Optional<VectorizationFactor> 7835 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7836 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7837 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7838 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7839 return None; 7840 7841 // Invalidate interleave groups if all blocks of loop will be predicated. 
7842 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7843 !useMaskedInterleavedAccesses(*TTI)) { 7844 LLVM_DEBUG( 7845 dbgs() 7846 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7847 "which requires masked-interleaved support.\n"); 7848 if (CM.InterleaveInfo.invalidateGroups()) 7849 // Invalidating interleave groups also requires invalidating all decisions 7850 // based on them, which includes widening decisions and uniform and scalar 7851 // values. 7852 CM.invalidateCostModelingDecisions(); 7853 } 7854 7855 ElementCount MaxUserVF = 7856 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7857 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7858 if (!UserVF.isZero() && UserVFIsLegal) { 7859 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7860 "VF needs to be a power of two"); 7861 // Collect the instructions (and their associated costs) that will be more 7862 // profitable to scalarize. 7863 if (CM.selectUserVectorizationFactor(UserVF)) { 7864 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7865 CM.collectInLoopReductions(); 7866 buildVPlansWithVPRecipes(UserVF, UserVF); 7867 LLVM_DEBUG(printPlans(dbgs())); 7868 return {{UserVF, 0}}; 7869 } else 7870 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7871 "InvalidCost", ORE, OrigLoop); 7872 } 7873 7874 // Populate the set of Vectorization Factor Candidates. 7875 ElementCountSet VFCandidates; 7876 for (auto VF = ElementCount::getFixed(1); 7877 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7878 VFCandidates.insert(VF); 7879 for (auto VF = ElementCount::getScalable(1); 7880 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7881 VFCandidates.insert(VF); 7882 7883 for (const auto &VF : VFCandidates) { 7884 // Collect Uniform and Scalar instructions after vectorization with VF. 7885 CM.collectUniformsAndScalars(VF); 7886 7887 // Collect the instructions (and their associated costs) that will be more 7888 // profitable to scalarize. 7889 if (VF.isVector()) 7890 CM.collectInstsToScalarize(VF); 7891 } 7892 7893 CM.collectInLoopReductions(); 7894 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7895 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7896 7897 LLVM_DEBUG(printPlans(dbgs())); 7898 if (!MaxFactors.hasVector()) 7899 return VectorizationFactor::Disabled(); 7900 7901 // Select the optimal vectorization factor. 7902 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 7903 7904 // Check if it is profitable to vectorize with runtime checks. 
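// (Illustrative: if vectorizing would require, say, dozens of runtime
// pointer-overlap checks and the applicable threshold is lower, the code
// below emits a missed-optimization remark and bails out rather than pay
// the runtime-check overhead on every entry to the loop.)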
7905 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7906 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 7907 bool PragmaThresholdReached = 7908 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7909 bool ThresholdReached = 7910 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 7911 if ((ThresholdReached && !Hints.allowReordering()) || 7912 PragmaThresholdReached) { 7913 ORE->emit([&]() { 7914 return OptimizationRemarkAnalysisAliasing( 7915 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 7916 OrigLoop->getHeader()) 7917 << "loop not vectorized: cannot prove it is safe to reorder " 7918 "memory operations"; 7919 }); 7920 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 7921 Hints.emitRemarkWithHints(); 7922 return VectorizationFactor::Disabled(); 7923 } 7924 } 7925 return SelectedVF; 7926 } 7927 7928 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7929 assert(count_if(VPlans, 7930 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7931 1 && 7932 "Best VF has not a single VPlan."); 7933 7934 for (const VPlanPtr &Plan : VPlans) { 7935 if (Plan->hasVF(VF)) 7936 return *Plan.get(); 7937 } 7938 llvm_unreachable("No plan found!"); 7939 } 7940 7941 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7942 VPlan &BestVPlan, 7943 InnerLoopVectorizer &ILV, 7944 DominatorTree *DT) { 7945 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7946 << '\n'); 7947 7948 // Perform the actual loop transformation. 7949 7950 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7951 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7952 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7953 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7954 State.CanonicalIV = ILV.Induction; 7955 ILV.collectPoisonGeneratingRecipes(State); 7956 7957 ILV.printDebugTracesAtStart(); 7958 7959 //===------------------------------------------------===// 7960 // 7961 // Notice: any optimization or new instruction that go 7962 // into the code below should also be implemented in 7963 // the cost-model. 7964 // 7965 //===------------------------------------------------===// 7966 7967 // 2. Copy and widen instructions from the old loop into the new loop. 7968 BestVPlan.execute(&State); 7969 7970 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7971 // predication, updating analyses. 
7972 ILV.fixVectorizedLoop(State);
7973
7974 ILV.printDebugTracesAtEnd();
7975 }
7976
7977 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7978 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7979 for (const auto &Plan : VPlans)
7980 if (PrintVPlansInDotFormat)
7981 Plan->printDOT(O);
7982 else
7983 Plan->print(O);
7984 }
7985 #endif
7986
7987 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7988 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7989
7990 // We create new control-flow for the vectorized loop, so the original exit
7991 // conditions will be dead after vectorization if they are only used by the
7992 // terminator.
7993 SmallVector<BasicBlock*> ExitingBlocks;
7994 OrigLoop->getExitingBlocks(ExitingBlocks);
7995 for (auto *BB : ExitingBlocks) {
7996 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7997 if (!Cmp || !Cmp->hasOneUse())
7998 continue;
7999
8000 // TODO: we should introduce a getUniqueExitingBlocks on Loop
8001 if (!DeadInstructions.insert(Cmp).second)
8002 continue;
8003
8004 // The operands of the icmp are often dead truncs, used by IndUpdate.
8005 // TODO: can recurse through operands in general
8006 for (Value *Op : Cmp->operands()) {
8007 if (isa<TruncInst>(Op) && Op->hasOneUse())
8008 DeadInstructions.insert(cast<Instruction>(Op));
8009 }
8010 }
8011
8012 // We create new "steps" for induction variable updates to which the original
8013 // induction variables map. An original update instruction will be dead if
8014 // all its users except the induction variable are dead.
8015 auto *Latch = OrigLoop->getLoopLatch();
8016 for (auto &Induction : Legal->getInductionVars()) {
8017 PHINode *Ind = Induction.first;
8018 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
8019
8020 // If the tail is to be folded by masking, the primary induction variable,
8021 // if it exists, isn't dead: it will be used for masking. Don't kill it.
8022 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
8023 continue;
8024
8025 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
8026 return U == Ind || DeadInstructions.count(cast<Instruction>(U));
8027 }))
8028 DeadInstructions.insert(IndUpdate);
8029 }
8030 }
8031
8032 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
8033
8034 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
8035
8036 Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx,
8037 Value *Step,
8038 Instruction::BinaryOps BinOp) {
8039 // When unrolling and the VF is 1, we only need to add a simple scalar.
8040 Type *Ty = Val->getType();
8041 assert(!Ty->isVectorTy() && "Val must be a scalar");
8042
8043 if (Ty->isFloatingPointTy()) {
8044 // Floating-point operations inherit FMF via the builder's flags.
8045 Value *MulOp = Builder.CreateFMul(StartIdx, Step);
8046 return Builder.CreateBinOp(BinOp, Val, MulOp);
8047 }
8048 return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction");
8049 }
8050
8051 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
8052 SmallVector<Metadata *, 4> MDs;
8053 // Reserve first location for self reference to the LoopID metadata node.
8054 MDs.push_back(nullptr);
8055 bool IsUnrollMetadata = false;
8056 MDNode *LoopID = L->getLoopID();
8057 if (LoopID) {
8058 // First find existing loop unrolling disable metadata.
8059 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8060 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8061 if (MD) { 8062 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8063 IsUnrollMetadata = 8064 S && S->getString().startswith("llvm.loop.unroll.disable"); 8065 } 8066 MDs.push_back(LoopID->getOperand(i)); 8067 } 8068 } 8069 8070 if (!IsUnrollMetadata) { 8071 // Add runtime unroll disable metadata. 8072 LLVMContext &Context = L->getHeader()->getContext(); 8073 SmallVector<Metadata *, 1> DisableOperands; 8074 DisableOperands.push_back( 8075 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8076 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8077 MDs.push_back(DisableNode); 8078 MDNode *NewLoopID = MDNode::get(Context, MDs); 8079 // Set operand 0 to refer to the loop id itself. 8080 NewLoopID->replaceOperandWith(0, NewLoopID); 8081 L->setLoopID(NewLoopID); 8082 } 8083 } 8084 8085 //===--------------------------------------------------------------------===// 8086 // EpilogueVectorizerMainLoop 8087 //===--------------------------------------------------------------------===// 8088 8089 /// This function is partially responsible for generating the control flow 8090 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8091 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8092 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8093 Loop *Lp = createVectorLoopSkeleton(""); 8094 8095 // Generate the code to check the minimum iteration count of the vector 8096 // epilogue (see below). 8097 EPI.EpilogueIterationCountCheck = 8098 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8099 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8100 8101 // Generate the code to check any assumptions that we've made for SCEV 8102 // expressions. 8103 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8104 8105 // Generate the code that checks at runtime if arrays overlap. We put the 8106 // checks into a separate block to make the more common case of few elements 8107 // faster. 8108 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8109 8110 // Generate the iteration count check for the main loop, *after* the check 8111 // for the epilogue loop, so that the path-length is shorter for the case 8112 // that goes directly through the vector epilogue. The longer-path length for 8113 // the main loop is compensated for, by the gain from vectorizing the larger 8114 // trip count. Note: the branch will get updated later on when we vectorize 8115 // the epilogue. 8116 EPI.MainLoopIterationCountCheck = 8117 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8118 8119 // Generate the induction variable. 8120 OldInduction = Legal->getPrimaryInduction(); 8121 Type *IdxTy = Legal->getWidestInductionType(); 8122 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8123 8124 IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt()); 8125 Value *Step = getRuntimeVF(B, IdxTy, VF * UF); 8126 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8127 EPI.VectorTripCount = CountRoundDown; 8128 Induction = 8129 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8130 getDebugLocFromInstOrOperands(OldInduction)); 8131 8132 // Skip induction resume value creation here because they will be created in 8133 // the second pass. 
If we created them here, they wouldn't be used anyway, 8134 // because the vplan in the second pass still contains the inductions from the 8135 // original loop. 8136 8137 return completeLoopSkeleton(Lp, OrigLoopID); 8138 } 8139 8140 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8141 LLVM_DEBUG({ 8142 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8143 << "Main Loop VF:" << EPI.MainLoopVF 8144 << ", Main Loop UF:" << EPI.MainLoopUF 8145 << ", Epilogue Loop VF:" << EPI.EpilogueVF 8146 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8147 }); 8148 } 8149 8150 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8151 DEBUG_WITH_TYPE(VerboseDebug, { 8152 dbgs() << "intermediate fn:\n" 8153 << *OrigLoop->getHeader()->getParent() << "\n"; 8154 }); 8155 } 8156 8157 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8158 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8159 assert(L && "Expected valid Loop."); 8160 assert(Bypass && "Expected valid bypass basic block."); 8161 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 8162 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8163 Value *Count = getOrCreateTripCount(L); 8164 // Reuse existing vector loop preheader for TC checks. 8165 // Note that new preheader block is generated for vector loop. 8166 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8167 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8168 8169 // Generate code to check if the loop's trip count is less than VF * UF of the 8170 // main vector loop. 8171 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8172 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8173 8174 Value *CheckMinIters = Builder.CreateICmp( 8175 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 8176 "min.iters.check"); 8177 8178 if (!ForEpilogue) 8179 TCCheckBlock->setName("vector.main.loop.iter.check"); 8180 8181 // Create new preheader for vector loop. 8182 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8183 DT, LI, nullptr, "vector.ph"); 8184 8185 if (ForEpilogue) { 8186 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8187 DT->getNode(Bypass)->getIDom()) && 8188 "TC check is expected to dominate Bypass"); 8189 8190 // Update dominator for Bypass & LoopExit. 8191 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8192 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8193 // For loops with multiple exits, there's no edge from the middle block 8194 // to exit blocks (as the epilogue must run) and thus no need to update 8195 // the immediate dominator of the exit blocks. 8196 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8197 8198 LoopBypassBlocks.push_back(TCCheckBlock); 8199 8200 // Save the trip count so we don't have to regenerate it in the 8201 // vec.epilog.iter.check. This is safe to do because the trip count 8202 // generated here dominates the vector epilog iter check. 
8203 EPI.TripCount = Count; 8204 } 8205 8206 ReplaceInstWithInst( 8207 TCCheckBlock->getTerminator(), 8208 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8209 8210 return TCCheckBlock; 8211 } 8212 8213 //===--------------------------------------------------------------------===// 8214 // EpilogueVectorizerEpilogueLoop 8215 //===--------------------------------------------------------------------===// 8216 8217 /// This function is partially responsible for generating the control flow 8218 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8219 BasicBlock * 8220 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8221 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8222 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8223 8224 // Now, compare the remaining count and if there aren't enough iterations to 8225 // execute the vectorized epilogue skip to the scalar part. 8226 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8227 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8228 LoopVectorPreHeader = 8229 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8230 LI, nullptr, "vec.epilog.ph"); 8231 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8232 VecEpilogueIterationCountCheck); 8233 8234 // Adjust the control flow taking the state info from the main loop 8235 // vectorization into account. 8236 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8237 "expected this to be saved from the previous pass."); 8238 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8239 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8240 8241 DT->changeImmediateDominator(LoopVectorPreHeader, 8242 EPI.MainLoopIterationCountCheck); 8243 8244 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8245 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8246 8247 if (EPI.SCEVSafetyCheck) 8248 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8249 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8250 if (EPI.MemSafetyCheck) 8251 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8252 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8253 8254 DT->changeImmediateDominator( 8255 VecEpilogueIterationCountCheck, 8256 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8257 8258 DT->changeImmediateDominator(LoopScalarPreHeader, 8259 EPI.EpilogueIterationCountCheck); 8260 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8261 // If there is an epilogue which must run, there's no edge from the 8262 // middle block to exit blocks and thus no need to update the immediate 8263 // dominator of the exit blocks. 8264 DT->changeImmediateDominator(LoopExitBlock, 8265 EPI.EpilogueIterationCountCheck); 8266 8267 // Keep track of bypass blocks, as they feed start values to the induction 8268 // phis in the scalar loop preheader. 
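// The candidate bypass blocks at this point are the SCEV safety check, the
// memory runtime check and the epilogue iteration count check, all created by
// the first pass; each of them may branch directly to the scalar preheader.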
8269 if (EPI.SCEVSafetyCheck)
8270 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8271 if (EPI.MemSafetyCheck)
8272 LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8273 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8274
8275 // Generate a resume induction for the vector epilogue and put it in the
8276 // vector epilogue preheader.
8277 Type *IdxTy = Legal->getWidestInductionType();
8278 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8279 LoopVectorPreHeader->getFirstNonPHI());
8280 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8281 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8282 EPI.MainLoopIterationCountCheck);
8283
8284 // Generate the induction variable.
8285 OldInduction = Legal->getPrimaryInduction();
8286 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8287 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8288 Value *StartIdx = EPResumeVal;
8289 Induction =
8290 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8291 getDebugLocFromInstOrOperands(OldInduction));
8292
8293 // Generate induction resume values. These variables save the new starting
8294 // indexes for the scalar loop. They are used to test if there are any tail
8295 // iterations left once the vector loop has completed.
8296 // Note that when the vectorized epilogue is skipped due to iteration count
8297 // check, then the resume value for the induction variable comes from
8298 // the trip count of the main vector loop, hence passing the AdditionalBypass
8299 // argument.
8300 createInductionResumeValues(Lp, CountRoundDown,
8301 {VecEpilogueIterationCountCheck,
8302 EPI.VectorTripCount} /* AdditionalBypass */);
8303
8304 AddRuntimeUnrollDisableMetaData(Lp);
8305 return completeLoopSkeleton(Lp, OrigLoopID);
8306 }
8307
8308 BasicBlock *
8309 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8310 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8311
8312 assert(EPI.TripCount &&
8313 "Expected trip count to have been saved in the first pass.");
8314 assert(
8315 (!isa<Instruction>(EPI.TripCount) ||
8316 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8317 "saved trip count does not dominate insertion point.");
8318 Value *TC = EPI.TripCount;
8319 IRBuilder<> Builder(Insert->getTerminator());
8320 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8321
8322 // Generate code to check if the loop's trip count is less than VF * UF of the
8323 // vector epilogue loop.
8324 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8325 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8326 8327 Value *CheckMinIters = 8328 Builder.CreateICmp(P, Count, 8329 createStepForVF(Builder, Count->getType(), 8330 EPI.EpilogueVF, EPI.EpilogueUF), 8331 "min.epilog.iters.check"); 8332 8333 ReplaceInstWithInst( 8334 Insert->getTerminator(), 8335 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8336 8337 LoopBypassBlocks.push_back(Insert); 8338 return Insert; 8339 } 8340 8341 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8342 LLVM_DEBUG({ 8343 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8344 << "Epilogue Loop VF:" << EPI.EpilogueVF 8345 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8346 }); 8347 } 8348 8349 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8350 DEBUG_WITH_TYPE(VerboseDebug, { 8351 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8352 }); 8353 } 8354 8355 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8356 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8357 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8358 bool PredicateAtRangeStart = Predicate(Range.Start); 8359 8360 for (ElementCount TmpVF = Range.Start * 2; 8361 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8362 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8363 Range.End = TmpVF; 8364 break; 8365 } 8366 8367 return PredicateAtRangeStart; 8368 } 8369 8370 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8371 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8372 /// of VF's starting at a given VF and extending it as much as possible. Each 8373 /// vectorization decision can potentially shorten this sub-range during 8374 /// buildVPlan(). 8375 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8376 ElementCount MaxVF) { 8377 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8378 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8379 VFRange SubRange = {VF, MaxVFPlusOne}; 8380 VPlans.push_back(buildVPlan(SubRange)); 8381 VF = SubRange.End; 8382 } 8383 } 8384 8385 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8386 VPlanPtr &Plan) { 8387 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8388 8389 // Look for cached value. 8390 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8391 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8392 if (ECEntryIt != EdgeMaskCache.end()) 8393 return ECEntryIt->second; 8394 8395 VPValue *SrcMask = createBlockInMask(Src, Plan); 8396 8397 // The terminator has to be a branch inst! 8398 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8399 assert(BI && "Unexpected terminator found"); 8400 8401 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8402 return EdgeMaskCache[Edge] = SrcMask; 8403 8404 // If source is an exiting block, we know the exit edge is dynamically dead 8405 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8406 // adding uses of an otherwise potentially dead instruction. 8407 if (OrigLoop->isLoopExiting(Src)) 8408 return EdgeMaskCache[Edge] = SrcMask; 8409 8410 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8411 assert(EdgeMask && "No Edge Mask found for condition"); 8412 8413 if (BI->getSuccessor(0) != Dst) 8414 EdgeMask = Builder.createNot(EdgeMask); 8415 8416 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 
8417 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8418 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8419 // The select version does not introduce new UB if SrcMask is false and 8420 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8421 VPValue *False = Plan->getOrAddVPValue( 8422 ConstantInt::getFalse(BI->getCondition()->getType())); 8423 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8424 } 8425 8426 return EdgeMaskCache[Edge] = EdgeMask; 8427 } 8428 8429 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8430 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8431 8432 // Look for cached value. 8433 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8434 if (BCEntryIt != BlockMaskCache.end()) 8435 return BCEntryIt->second; 8436 8437 // All-one mask is modelled as no-mask following the convention for masked 8438 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8439 VPValue *BlockMask = nullptr; 8440 8441 if (OrigLoop->getHeader() == BB) { 8442 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8443 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8444 8445 // Create the block in mask as the first non-phi instruction in the block. 8446 VPBuilder::InsertPointGuard Guard(Builder); 8447 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8448 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8449 8450 // Introduce the early-exit compare IV <= BTC to form header block mask. 8451 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8452 // Start by constructing the desired canonical IV. 8453 VPValue *IV = nullptr; 8454 if (Legal->getPrimaryInduction()) 8455 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8456 else { 8457 auto *IVRecipe = new VPWidenCanonicalIVRecipe(); 8458 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8459 IV = IVRecipe; 8460 } 8461 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8462 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8463 8464 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8465 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8466 // as a second argument, we only pass the IV here and extract the 8467 // tripcount from the transform state where codegen of the VP instructions 8468 // happen. 8469 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8470 } else { 8471 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8472 } 8473 return BlockMaskCache[BB] = BlockMask; 8474 } 8475 8476 // This is the block mask. We OR all incoming edges. 8477 for (auto *Predecessor : predecessors(BB)) { 8478 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8479 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8480 return BlockMaskCache[BB] = EdgeMask; 8481 8482 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8483 BlockMask = EdgeMask; 8484 continue; 8485 } 8486 8487 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8488 } 8489 8490 return BlockMaskCache[BB] = BlockMask; 8491 } 8492 8493 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8494 ArrayRef<VPValue *> Operands, 8495 VFRange &Range, 8496 VPlanPtr &Plan) { 8497 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8498 "Must be called with either a load or store"); 8499 8500 auto willWiden = [&](ElementCount VF) -> bool { 8501 if (VF.isScalar()) 8502 return false; 8503 LoopVectorizationCostModel::InstWidening Decision = 8504 CM.getWideningDecision(I, VF); 8505 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8506 "CM decision should be taken at this point."); 8507 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8508 return true; 8509 if (CM.isScalarAfterVectorization(I, VF) || 8510 CM.isProfitableToScalarize(I, VF)) 8511 return false; 8512 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8513 }; 8514 8515 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8516 return nullptr; 8517 8518 VPValue *Mask = nullptr; 8519 if (Legal->isMaskRequired(I)) 8520 Mask = createBlockInMask(I->getParent(), Plan); 8521 8522 // Determine if the pointer operand of the access is either consecutive or 8523 // reverse consecutive. 8524 LoopVectorizationCostModel::InstWidening Decision = 8525 CM.getWideningDecision(I, Range.Start); 8526 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8527 bool Consecutive = 8528 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8529 8530 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8531 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8532 Consecutive, Reverse); 8533 8534 StoreInst *Store = cast<StoreInst>(I); 8535 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8536 Mask, Consecutive, Reverse); 8537 } 8538 8539 VPWidenIntOrFpInductionRecipe * 8540 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8541 ArrayRef<VPValue *> Operands) const { 8542 // Check if this is an integer or fp induction. If so, build the recipe that 8543 // produces its scalar and vector values. 8544 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) { 8545 assert(II->getStartValue() == 8546 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8547 return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II); 8548 } 8549 8550 return nullptr; 8551 } 8552 8553 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8554 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8555 VPlan &Plan) const { 8556 // Optimize the special case where the source is a constant integer 8557 // induction variable. Notice that we can only optimize the 'trunc' case 8558 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8559 // (c) other casts depend on pointer size. 8560 8561 // Determine whether \p K is a truncation based on an induction variable that 8562 // can be optimized. 
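// For example (illustrative only): for a 'trunc i64 %iv to i32' of an integer
// induction, the induction can be produced directly in the narrower i32 type,
// instead of widening the i64 induction and truncating every vector lane.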
8563 auto isOptimizableIVTruncate = 8564 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8565 return [=](ElementCount VF) -> bool { 8566 return CM.isOptimizableIVTruncate(K, VF); 8567 }; 8568 }; 8569 8570 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8571 isOptimizableIVTruncate(I), Range)) { 8572 8573 auto *Phi = cast<PHINode>(I->getOperand(0)); 8574 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8575 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8576 return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I); 8577 } 8578 return nullptr; 8579 } 8580 8581 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8582 ArrayRef<VPValue *> Operands, 8583 VPlanPtr &Plan) { 8584 // If all incoming values are equal, the incoming VPValue can be used directly 8585 // instead of creating a new VPBlendRecipe. 8586 VPValue *FirstIncoming = Operands[0]; 8587 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8588 return FirstIncoming == Inc; 8589 })) { 8590 return Operands[0]; 8591 } 8592 8593 // We know that all PHIs in non-header blocks are converted into selects, so 8594 // we don't have to worry about the insertion order and we can just use the 8595 // builder. At this point we generate the predication tree. There may be 8596 // duplications since this is a simple recursive scan, but future 8597 // optimizations will clean it up. 8598 SmallVector<VPValue *, 2> OperandsWithMask; 8599 unsigned NumIncoming = Phi->getNumIncomingValues(); 8600 8601 for (unsigned In = 0; In < NumIncoming; In++) { 8602 VPValue *EdgeMask = 8603 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8604 assert((EdgeMask || NumIncoming == 1) && 8605 "Multiple predecessors with one having a full mask"); 8606 OperandsWithMask.push_back(Operands[In]); 8607 if (EdgeMask) 8608 OperandsWithMask.push_back(EdgeMask); 8609 } 8610 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8611 } 8612 8613 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8614 ArrayRef<VPValue *> Operands, 8615 VFRange &Range) const { 8616 8617 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8618 [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, 8619 Range); 8620 8621 if (IsPredicated) 8622 return nullptr; 8623 8624 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8625 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8626 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8627 ID == Intrinsic::pseudoprobe || 8628 ID == Intrinsic::experimental_noalias_scope_decl)) 8629 return nullptr; 8630 8631 auto willWiden = [&](ElementCount VF) -> bool { 8632 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8633 // The following case may be scalarized depending on the VF. 8634 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8635 // version of the instruction. 8636 // Is it beneficial to perform intrinsic call compared to lib call? 8637 bool NeedToScalarize = false; 8638 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8639 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8640 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8641 return UseVectorIntrinsic || !NeedToScalarize; 8642 }; 8643 8644 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8645 return nullptr; 8646 8647 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8648 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8649 } 8650 8651 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8652 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8653 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8654 // Instruction should be widened, unless it is scalar after vectorization, 8655 // scalarization is profitable or it is predicated. 8656 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8657 return CM.isScalarAfterVectorization(I, VF) || 8658 CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); 8659 }; 8660 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8661 Range); 8662 } 8663 8664 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8665 ArrayRef<VPValue *> Operands) const { 8666 auto IsVectorizableOpcode = [](unsigned Opcode) { 8667 switch (Opcode) { 8668 case Instruction::Add: 8669 case Instruction::And: 8670 case Instruction::AShr: 8671 case Instruction::BitCast: 8672 case Instruction::FAdd: 8673 case Instruction::FCmp: 8674 case Instruction::FDiv: 8675 case Instruction::FMul: 8676 case Instruction::FNeg: 8677 case Instruction::FPExt: 8678 case Instruction::FPToSI: 8679 case Instruction::FPToUI: 8680 case Instruction::FPTrunc: 8681 case Instruction::FRem: 8682 case Instruction::FSub: 8683 case Instruction::ICmp: 8684 case Instruction::IntToPtr: 8685 case Instruction::LShr: 8686 case Instruction::Mul: 8687 case Instruction::Or: 8688 case Instruction::PtrToInt: 8689 case Instruction::SDiv: 8690 case Instruction::Select: 8691 case Instruction::SExt: 8692 case Instruction::Shl: 8693 case Instruction::SIToFP: 8694 case Instruction::SRem: 8695 case Instruction::Sub: 8696 case Instruction::Trunc: 8697 case Instruction::UDiv: 8698 case Instruction::UIToFP: 8699 case Instruction::URem: 8700 case Instruction::Xor: 8701 case Instruction::ZExt: 8702 return true; 8703 } 8704 return false; 8705 }; 8706 8707 if (!IsVectorizableOpcode(I->getOpcode())) 8708 return nullptr; 8709 8710 // Success: widen this instruction. 
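// (The per-part widening for these opcodes is performed later by
// VPWidenRecipe::execute.)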
8711 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8712 }
8713
8714 void VPRecipeBuilder::fixHeaderPhis() {
8715 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8716 for (VPWidenPHIRecipe *R : PhisToFix) {
8717 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8718 VPRecipeBase *IncR =
8719 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8720 R->addOperand(IncR->getVPSingleValue());
8721 }
8722 }
8723
8724 VPBasicBlock *VPRecipeBuilder::handleReplication(
8725 Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8726 VPlanPtr &Plan) {
8727 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8728 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8729 Range);
8730
8731 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8732 [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); },
8733 Range);
8734
8735 // Even if the instruction is not marked as uniform, there are certain
8736 // intrinsic calls that can be effectively treated as such, so we check for
8737 // them here. Conservatively, we only do this for scalable vectors, since
8738 // for fixed-width VFs we can always fall back on full scalarization.
8739 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8740 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8741 case Intrinsic::assume:
8742 case Intrinsic::lifetime_start:
8743 case Intrinsic::lifetime_end:
8744 // For scalable vectors if one of the operands is variant then we still
8745 // want to mark as uniform, which will generate one instruction for just
8746 // the first lane of the vector. We can't scalarize the call in the same
8747 // way as for fixed-width vectors because we don't know how many lanes
8748 // there are.
8749 //
8750 // The reasons for doing it this way for scalable vectors are:
8751 // 1. For the assume intrinsic, generating the instruction for the first
8752 // lane is still better than not generating any at all. For
8753 // example, the input may be a splat across all lanes.
8754 // 2. For the lifetime start/end intrinsics the pointer operand only
8755 // does anything useful when the input comes from a stack object,
8756 // which suggests it should always be uniform. For non-stack objects
8757 // the effect is to poison the object, which still allows us to
8758 // remove the call.
8759 IsUniform = true;
8760 break;
8761 default:
8762 break;
8763 }
8764 }
8765
8766 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8767 IsUniform, IsPredicated);
8768 setRecipe(I, Recipe);
8769 Plan->addVPValue(I, Recipe);
8770
8771 // Find if I uses a predicated instruction. If so, it will use its scalar
8772 // value. Avoid hoisting the insert-element which packs the scalar value into
8773 // a vector value, as that happens iff all users use the vector value.
8774 for (VPValue *Op : Recipe->operands()) {
8775 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8776 if (!PredR)
8777 continue;
8778 auto *RepR =
8779 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8780 assert(RepR->isPredicated() &&
8781 "expected Replicate recipe to be predicated");
8782 RepR->setAlsoPack(false);
8783 }
8784
8785 // Finalize the recipe for Instr, first if it is not predicated.
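// In the predicated case below, the recipe is wrapped in a triangular
// replicate region (pred.*.entry / pred.*.if / pred.*.continue, see
// createReplicateRegion), and replication continues in a fresh successor
// VPBasicBlock that is returned to the caller.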
8786 if (!IsPredicated) { 8787 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8788 VPBB->appendRecipe(Recipe); 8789 return VPBB; 8790 } 8791 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8792 assert(VPBB->getSuccessors().empty() && 8793 "VPBB has successors when handling predicated replication."); 8794 // Record predicated instructions for above packing optimizations. 8795 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8796 VPBlockUtils::insertBlockAfter(Region, VPBB); 8797 auto *RegSucc = new VPBasicBlock(); 8798 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8799 return RegSucc; 8800 } 8801 8802 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8803 VPRecipeBase *PredRecipe, 8804 VPlanPtr &Plan) { 8805 // Instructions marked for predication are replicated and placed under an 8806 // if-then construct to prevent side-effects. 8807 8808 // Generate recipes to compute the block mask for this region. 8809 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8810 8811 // Build the triangular if-then region. 8812 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8813 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8814 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8815 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8816 auto *PHIRecipe = Instr->getType()->isVoidTy() 8817 ? nullptr 8818 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8819 if (PHIRecipe) { 8820 Plan->removeVPValueFor(Instr); 8821 Plan->addVPValue(Instr, PHIRecipe); 8822 } 8823 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8824 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8825 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8826 8827 // Note: first set Entry as region entry and then connect successors starting 8828 // from it in order, to propagate the "parent" of each VPBasicBlock. 8829 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8830 VPBlockUtils::connectBlocks(Pred, Exit); 8831 8832 return Region; 8833 } 8834 8835 VPRecipeOrVPValueTy 8836 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8837 ArrayRef<VPValue *> Operands, 8838 VFRange &Range, VPlanPtr &Plan) { 8839 // First, check for specific widening recipes that deal with calls, memory 8840 // operations, inductions and Phi nodes. 
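// If none of the specific recipes below apply, the instruction is either
// widened generically (shouldWiden/tryToWiden, with dedicated recipes for GEPs
// and selects), or nullptr is returned and the caller falls back to
// handleReplication.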
8841 if (auto *CI = dyn_cast<CallInst>(Instr))
8842 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8843
8844 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8845 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8846
8847 VPRecipeBase *Recipe;
8848 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8849 if (Phi->getParent() != OrigLoop->getHeader())
8850 return tryToBlend(Phi, Operands, Plan);
8851 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
8852 return toVPRecipeResult(Recipe);
8853
8854 VPWidenPHIRecipe *PhiRecipe = nullptr;
8855 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
8856 VPValue *StartV = Operands[0];
8857 if (Legal->isReductionVariable(Phi)) {
8858 const RecurrenceDescriptor &RdxDesc =
8859 Legal->getReductionVars().find(Phi)->second;
8860 assert(RdxDesc.getRecurrenceStartValue() ==
8861 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8862 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8863 CM.isInLoopReduction(Phi),
8864 CM.useOrderedReductions(RdxDesc));
8865 } else {
8866 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8867 }
8868
8869 // Record the incoming value from the backedge, so we can add the incoming
8870 // value from the backedge after all recipes have been created.
8871 recordRecipeOf(cast<Instruction>(
8872 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
8873 PhisToFix.push_back(PhiRecipe);
8874 } else {
8875 // TODO: record start and backedge value for remaining pointer induction
8876 // phis.
8877 assert(Phi->getType()->isPointerTy() &&
8878 "only pointer phis should be handled here");
8879 PhiRecipe = new VPWidenPHIRecipe(Phi);
8880 }
8881
8882 return toVPRecipeResult(PhiRecipe);
8883 }
8884
8885 if (isa<TruncInst>(Instr) &&
8886 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8887 Range, *Plan)))
8888 return toVPRecipeResult(Recipe);
8889
8890 if (!shouldWiden(Instr, Range))
8891 return nullptr;
8892
8893 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8894 return toVPRecipeResult(new VPWidenGEPRecipe(
8895 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8896
8897 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8898 bool InvariantCond =
8899 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8900 return toVPRecipeResult(new VPWidenSelectRecipe(
8901 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8902 }
8903
8904 return toVPRecipeResult(tryToWiden(Instr, Operands));
8905 }
8906
8907 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8908 ElementCount MaxVF) {
8909 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8910
8911 // Collect instructions from the original loop that will become trivially dead
8912 // in the vectorized loop. We don't need to vectorize these instructions. For
8913 // example, original induction update instructions can become dead because we
8914 // separately emit induction "steps" when generating code for the new loop.
8915 // Similarly, we create a new latch condition when setting up the structure
8916 // of the new loop, so the old one can become dead.
8917 SmallPtrSet<Instruction *, 4> DeadInstructions;
8918 collectTriviallyDeadInstructions(DeadInstructions);
8919
8920 // Add assume instructions we need to drop to DeadInstructions, to prevent
8921 // them from being added to the VPlan.
8922 // TODO: We only need to drop assumes in blocks that get flattened. If the
8923 // control flow is preserved, we should keep them.
8924 auto &ConditionalAssumes = Legal->getConditionalAssumes();
8925 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8926
8927 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8928 // Dead instructions do not need sinking. Remove them from SinkAfter.
8929 for (Instruction *I : DeadInstructions)
8930 SinkAfter.erase(I);
8931
8932 // Cannot sink instructions after dead instructions (there won't be any
8933 // recipes for them). Instead, find the first non-dead previous instruction.
8934 for (auto &P : Legal->getSinkAfter()) {
8935 Instruction *SinkTarget = P.second;
8936 Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8937 (void)FirstInst;
8938 while (DeadInstructions.contains(SinkTarget)) {
8939 assert(
8940 SinkTarget != FirstInst &&
8941 "Must find a live instruction (at least the one feeding the "
8942 "first-order recurrence PHI) before reaching beginning of the block");
8943 SinkTarget = SinkTarget->getPrevNode();
8944 assert(SinkTarget != P.first &&
8945 "sink source equals target, no sinking required");
8946 }
8947 P.second = SinkTarget;
8948 }
8949
8950 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8951 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8952 VFRange SubRange = {VF, MaxVFPlusOne};
8953 VPlans.push_back(
8954 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8955 VF = SubRange.End;
8956 }
8957 }
8958
8959 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8960 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8961 const MapVector<Instruction *, Instruction *> &SinkAfter) {
8962
8963 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8964
8965 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8966
8967 // ---------------------------------------------------------------------------
8968 // Pre-construction: record ingredients whose recipes we'll need to further
8969 // process after constructing the initial VPlan.
8970 // ---------------------------------------------------------------------------
8971
8972 // Mark instructions we'll need to sink later and their targets as
8973 // ingredients whose recipe we'll need to record.
8974 for (auto &Entry : SinkAfter) {
8975 RecipeBuilder.recordRecipeOf(Entry.first);
8976 RecipeBuilder.recordRecipeOf(Entry.second);
8977 }
8978 for (auto &Reduction : CM.getInLoopReductionChains()) {
8979 PHINode *Phi = Reduction.first;
8980 RecurKind Kind =
8981 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8982 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8983
8984 RecipeBuilder.recordRecipeOf(Phi);
8985 for (auto &R : ReductionOperations) {
8986 RecipeBuilder.recordRecipeOf(R);
8987 // For min/max reductions, where we have a pair of icmp/select, we also
8988 // need to record the ICmp recipe, so it can be removed later.
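// (adjustRecipesForReductions erases that compare recipe once the min/max
// select has been rewritten into a VPReductionRecipe.)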
8989 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
8990 "Only min/max recurrences allowed for inloop reductions");
8991 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8992 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8993 }
8994 }
8995
8996 // For each interleave group which is relevant for this (possibly trimmed)
8997 // Range, add it to the set of groups to be later applied to the VPlan and add
8998 // placeholders for its members' Recipes which we'll be replacing with a
8999 // single VPInterleaveRecipe.
9000 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9001 auto applyIG = [IG, this](ElementCount VF) -> bool {
9002 return (VF.isVector() && // Query is illegal for VF == 1
9003 CM.getWideningDecision(IG->getInsertPos(), VF) ==
9004 LoopVectorizationCostModel::CM_Interleave);
9005 };
9006 if (!getDecisionAndClampRange(applyIG, Range))
9007 continue;
9008 InterleaveGroups.insert(IG);
9009 for (unsigned i = 0; i < IG->getFactor(); i++)
9010 if (Instruction *Member = IG->getMember(i))
9011 RecipeBuilder.recordRecipeOf(Member);
9012 }
9013
9014 // ---------------------------------------------------------------------------
9015 // Build initial VPlan: Scan the body of the loop in a topological order to
9016 // visit each basic block after having visited its predecessor basic blocks.
9017 // ---------------------------------------------------------------------------
9018
9019 auto Plan = std::make_unique<VPlan>();
9020
9021 // Scan the body of the loop in a topological order to visit each basic block
9022 // after having visited its predecessor basic blocks.
9023 LoopBlocksDFS DFS(OrigLoop);
9024 DFS.perform(LI);
9025
9026 VPBasicBlock *VPBB = nullptr;
9027 VPBasicBlock *HeaderVPBB = nullptr;
9028 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
9029 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9030 // Relevant instructions from basic block BB will be grouped into VPRecipe
9031 // ingredients and fill a new VPBasicBlock.
9032 unsigned VPBBsForBB = 0;
9033 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
9034 if (VPBB)
9035 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
9036 else {
9037 auto *TopRegion = new VPRegionBlock("vector loop");
9038 TopRegion->setEntry(FirstVPBBForBB);
9039 Plan->setEntry(TopRegion);
9040 HeaderVPBB = FirstVPBBForBB;
9041 }
9042 VPBB = FirstVPBBForBB;
9043 Builder.setInsertPoint(VPBB);
9044
9045 // Introduce each ingredient into VPlan.
9046 // TODO: Model and preserve debug intrinsics in VPlan.
9047 for (Instruction &I : BB->instructionsWithoutDebug()) {
9048 Instruction *Instr = &I;
9049
9050 // First filter out irrelevant instructions, to ensure no recipes are
9051 // built for them.
9052 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
9053 continue;
9054
9055 SmallVector<VPValue *, 4> Operands;
9056 auto *Phi = dyn_cast<PHINode>(Instr);
9057 if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
9058 Operands.push_back(Plan->getOrAddVPValue(
9059 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9060 } else {
9061 auto OpRange = Plan->mapToVPValues(Instr->operands());
9062 Operands = {OpRange.begin(), OpRange.end()};
9063 }
9064 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
9065 Instr, Operands, Range, Plan)) {
9066 // If Instr can be simplified to an existing VPValue, use it.
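// (For instance, tryToBlend returns the single incoming VPValue directly when
// all incoming values of a non-header phi are equal.)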
9067 if (RecipeOrValue.is<VPValue *>()) { 9068 auto *VPV = RecipeOrValue.get<VPValue *>(); 9069 Plan->addVPValue(Instr, VPV); 9070 // If the re-used value is a recipe, register the recipe for the 9071 // instruction, in case the recipe for Instr needs to be recorded. 9072 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9073 RecipeBuilder.setRecipe(Instr, R); 9074 continue; 9075 } 9076 // Otherwise, add the new recipe. 9077 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9078 for (auto *Def : Recipe->definedValues()) { 9079 auto *UV = Def->getUnderlyingValue(); 9080 Plan->addVPValue(UV, Def); 9081 } 9082 9083 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 9084 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 9085 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 9086 // of the header block. That can happen for truncates of induction 9087 // variables. Those recipes are moved to the phi section of the header 9088 // block after applying SinkAfter, which relies on the original 9089 // position of the trunc. 9090 assert(isa<TruncInst>(Instr)); 9091 InductionsToMove.push_back( 9092 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 9093 } 9094 RecipeBuilder.setRecipe(Instr, Recipe); 9095 VPBB->appendRecipe(Recipe); 9096 continue; 9097 } 9098 9099 // Otherwise, if all widening options failed, Instruction is to be 9100 // replicated. This may create a successor for VPBB. 9101 VPBasicBlock *NextVPBB = 9102 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9103 if (NextVPBB != VPBB) { 9104 VPBB = NextVPBB; 9105 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9106 : ""); 9107 } 9108 } 9109 } 9110 9111 assert(isa<VPRegionBlock>(Plan->getEntry()) && 9112 !Plan->getEntry()->getEntryBasicBlock()->empty() && 9113 "entry block must be set to a VPRegionBlock having a non-empty entry " 9114 "VPBasicBlock"); 9115 RecipeBuilder.fixHeaderPhis(); 9116 9117 // --------------------------------------------------------------------------- 9118 // Transform initial VPlan: Apply previously taken decisions, in order, to 9119 // bring the VPlan to its final state. 9120 // --------------------------------------------------------------------------- 9121 9122 // Apply Sink-After legal constraints. 9123 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9124 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9125 if (Region && Region->isReplicator()) { 9126 assert(Region->getNumSuccessors() == 1 && 9127 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9128 assert(R->getParent()->size() == 1 && 9129 "A recipe in an original replicator region must be the only " 9130 "recipe in its block"); 9131 return Region; 9132 } 9133 return nullptr; 9134 }; 9135 for (auto &Entry : SinkAfter) { 9136 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9137 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9138 9139 auto *TargetRegion = GetReplicateRegion(Target); 9140 auto *SinkRegion = GetReplicateRegion(Sink); 9141 if (!SinkRegion) { 9142 // If the sink source is not a replicate region, sink the recipe directly. 9143 if (TargetRegion) { 9144 // The target is in a replication region, make sure to move Sink to 9145 // the block after it, not into the replication region itself. 
9146 VPBasicBlock *NextBlock = 9147 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9148 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9149 } else 9150 Sink->moveAfter(Target); 9151 continue; 9152 } 9153 9154 // The sink source is in a replicate region. Unhook the region from the CFG. 9155 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9156 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9157 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9158 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9159 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9160 9161 if (TargetRegion) { 9162 // The target recipe is also in a replicate region, move the sink region 9163 // after the target region. 9164 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9165 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9166 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9167 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9168 } else { 9169 // The sink source is in a replicate region, we need to move the whole 9170 // replicate region, which should only contain a single recipe in the 9171 // main block. 9172 auto *SplitBlock = 9173 Target->getParent()->splitAt(std::next(Target->getIterator())); 9174 9175 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9176 9177 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9178 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9179 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9180 if (VPBB == SplitPred) 9181 VPBB = SplitBlock; 9182 } 9183 } 9184 9185 cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB); 9186 9187 VPlanTransforms::removeRedundantInductionCasts(*Plan); 9188 9189 // Now that sink-after is done, move induction recipes for optimized truncates 9190 // to the phi section of the header block. 9191 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 9192 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9193 9194 // Adjust the recipes for any inloop reductions. 9195 adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start); 9196 9197 // Introduce a recipe to combine the incoming and previous values of a 9198 // first-order recurrence. 9199 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9200 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9201 if (!RecurPhi) 9202 continue; 9203 9204 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9205 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 9206 auto *Region = GetReplicateRegion(PrevRecipe); 9207 if (Region) 9208 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9209 if (Region || PrevRecipe->isPhi()) 9210 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 9211 else 9212 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9213 9214 auto *RecurSplice = cast<VPInstruction>( 9215 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9216 {RecurPhi, RecurPhi->getBackedgeValue()})); 9217 9218 RecurPhi->replaceAllUsesWith(RecurSplice); 9219 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9220 // all users. 9221 RecurSplice->setOperand(0, RecurPhi); 9222 } 9223 9224 // Interleave memory: for each Interleave Group we marked earlier as relevant 9225 // for this VPlan, replace the Recipes widening its memory instructions with a 9226 // single VPInterleaveRecipe at its insertion point. 
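// For example (illustrative only): a factor-2 group formed from the loads of
// A[2*i] and A[2*i+1] gets a single VPInterleaveRecipe at the group's insert
// position; the loop below rewires the members' users to the interleave
// recipe's results and erases the original member recipes.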
9227 for (auto IG : InterleaveGroups) { 9228 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9229 RecipeBuilder.getRecipe(IG->getInsertPos())); 9230 SmallVector<VPValue *, 4> StoredValues; 9231 for (unsigned i = 0; i < IG->getFactor(); ++i) 9232 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9233 auto *StoreR = 9234 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9235 StoredValues.push_back(StoreR->getStoredValue()); 9236 } 9237 9238 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9239 Recipe->getMask()); 9240 VPIG->insertBefore(Recipe); 9241 unsigned J = 0; 9242 for (unsigned i = 0; i < IG->getFactor(); ++i) 9243 if (Instruction *Member = IG->getMember(i)) { 9244 if (!Member->getType()->isVoidTy()) { 9245 VPValue *OriginalV = Plan->getVPValue(Member); 9246 Plan->removeVPValueFor(Member); 9247 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9248 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9249 J++; 9250 } 9251 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9252 } 9253 } 9254 9255 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9256 // in ways that accessing values using original IR values is incorrect. 9257 Plan->disableValue2VPValue(); 9258 9259 VPlanTransforms::sinkScalarOperands(*Plan); 9260 VPlanTransforms::mergeReplicateRegions(*Plan); 9261 9262 std::string PlanName; 9263 raw_string_ostream RSO(PlanName); 9264 ElementCount VF = Range.Start; 9265 Plan->addVF(VF); 9266 RSO << "Initial VPlan for VF={" << VF; 9267 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9268 Plan->addVF(VF); 9269 RSO << "," << VF; 9270 } 9271 RSO << "},UF>=1"; 9272 RSO.flush(); 9273 Plan->setName(PlanName); 9274 9275 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 9276 return Plan; 9277 } 9278 9279 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9280 // Outer loop handling: They may require CFG and instruction level 9281 // transformations before even evaluating whether vectorization is profitable. 9282 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9283 // the vectorization pipeline. 9284 assert(!OrigLoop->isInnermost()); 9285 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9286 9287 // Create new empty VPlan 9288 auto Plan = std::make_unique<VPlan>(); 9289 9290 // Build hierarchical CFG 9291 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9292 HCFGBuilder.buildHierarchicalCFG(); 9293 9294 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9295 VF *= 2) 9296 Plan->addVF(VF); 9297 9298 if (EnableVPlanPredication) { 9299 VPlanPredicator VPP(*Plan); 9300 VPP.predicate(); 9301 9302 // Avoid running transformation to recipes until masked code generation in 9303 // VPlan-native path is in place. 9304 return Plan; 9305 } 9306 9307 SmallPtrSet<Instruction *, 1> DeadInstructions; 9308 VPlanTransforms::VPInstructionsToVPRecipes( 9309 OrigLoop, Plan, 9310 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 9311 DeadInstructions, *PSE.getSE()); 9312 return Plan; 9313 } 9314 9315 // Adjust the recipes for reductions. For in-loop reductions the chain of 9316 // instructions leading from the loop exit instr to the phi need to be converted 9317 // to reductions, with one operand being vector and the other being the scalar 9318 // reduction chain. For other reductions, a select is introduced between the phi 9319 // and live-out recipes when folding the tail. 
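// For example (illustrative only): for an in-loop integer add reduction
//   %red = phi i32 [ %start, %preheader ], [ %red.next, %latch ]
//   %red.next = add i32 %red, %val
// the widened 'add' recipe is replaced by a VPReductionRecipe whose chain
// operand is the reduction phi and whose vector operand is the widened %val.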
9320 void LoopVectorizationPlanner::adjustRecipesForReductions(
9321 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9322 ElementCount MinVF) {
9323 for (auto &Reduction : CM.getInLoopReductionChains()) {
9324 PHINode *Phi = Reduction.first;
9325 const RecurrenceDescriptor &RdxDesc =
9326 Legal->getReductionVars().find(Phi)->second;
9327 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9328
9329 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9330 continue;
9331
9332 // ReductionOperations are ordered top-down from the phi's use to the
9333 // LoopExitValue. We keep track of the previous item (the Chain) to tell
9334 // which of the two operands will remain scalar and which will be reduced.
9335 // For minmax the chain will be the select instructions.
9336 Instruction *Chain = Phi;
9337 for (Instruction *R : ReductionOperations) {
9338 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9339 RecurKind Kind = RdxDesc.getRecurrenceKind();
9340
9341 VPValue *ChainOp = Plan->getVPValue(Chain);
9342 unsigned FirstOpId;
9343 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9344 "Only min/max recurrences allowed for inloop reductions");
9345 // Recognize a call to the llvm.fmuladd intrinsic.
9346 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9347 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9348 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9349 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9350 assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9351 "Expected to replace a VPWidenSelectSC");
9352 FirstOpId = 1;
9353 } else {
9354 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9355 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9356 "Expected to replace a VPWidenSC");
9357 FirstOpId = 0;
9358 }
9359 unsigned VecOpId =
9360 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9361 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9362
9363 auto *CondOp = CM.foldTailByMasking()
9364 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9365 : nullptr;
9366
9367 if (IsFMulAdd) {
9368 // If the instruction is a call to the llvm.fmuladd intrinsic then we
9369 // need to create an fmul recipe to use as the vector operand for the
9370 // fadd reduction.
9371 VPInstruction *FMulRecipe = new VPInstruction( 9372 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9373 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9374 WidenRecipe->getParent()->insert(FMulRecipe, 9375 WidenRecipe->getIterator()); 9376 VecOp = FMulRecipe; 9377 } 9378 VPReductionRecipe *RedRecipe = 9379 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9380 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9381 Plan->removeVPValueFor(R); 9382 Plan->addVPValue(R, RedRecipe); 9383 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9384 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9385 WidenRecipe->eraseFromParent(); 9386 9387 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9388 VPRecipeBase *CompareRecipe = 9389 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9390 assert(isa<VPWidenRecipe>(CompareRecipe) && 9391 "Expected to replace a VPWidenSC"); 9392 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9393 "Expected no remaining users"); 9394 CompareRecipe->eraseFromParent(); 9395 } 9396 Chain = R; 9397 } 9398 } 9399 9400 // If tail is folded by masking, introduce selects between the phi 9401 // and the live-out instruction of each reduction, at the end of the latch. 9402 if (CM.foldTailByMasking()) { 9403 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9404 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9405 if (!PhiR || PhiR->isInLoop()) 9406 continue; 9407 Builder.setInsertPoint(LatchVPBB); 9408 VPValue *Cond = 9409 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9410 VPValue *Red = PhiR->getBackedgeValue(); 9411 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9412 } 9413 } 9414 } 9415 9416 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9417 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9418 VPSlotTracker &SlotTracker) const { 9419 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9420 IG->getInsertPos()->printAsOperand(O, false); 9421 O << ", "; 9422 getAddr()->printAsOperand(O, SlotTracker); 9423 VPValue *Mask = getMask(); 9424 if (Mask) { 9425 O << ", "; 9426 Mask->printAsOperand(O, SlotTracker); 9427 } 9428 9429 unsigned OpIdx = 0; 9430 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9431 if (!IG->getMember(i)) 9432 continue; 9433 if (getNumStoreOperands() > 0) { 9434 O << "\n" << Indent << " store "; 9435 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9436 O << " to index " << i; 9437 } else { 9438 O << "\n" << Indent << " "; 9439 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9440 O << " = load from index " << i; 9441 } 9442 ++OpIdx; 9443 } 9444 } 9445 #endif 9446 9447 void VPWidenCallRecipe::execute(VPTransformState &State) { 9448 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9449 *this, State); 9450 } 9451 9452 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9453 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9454 State.ILV->setDebugLocFromInst(&I); 9455 9456 // The condition can be loop invariant but still defined inside the 9457 // loop. This means that we can't just use the original 'cond' value. 9458 // We have to take the 'vectorized' value and pick the first lane. 9459 // Instcombine will make this a no-op. 9460 auto *InvarCond = 9461 InvariantCond ? 
State.get(getOperand(0), VPIteration(0, 0)) : nullptr; 9462 9463 for (unsigned Part = 0; Part < State.UF; ++Part) { 9464 Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part); 9465 Value *Op0 = State.get(getOperand(1), Part); 9466 Value *Op1 = State.get(getOperand(2), Part); 9467 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 9468 State.set(this, Sel, Part); 9469 State.ILV->addMetadata(Sel, &I); 9470 } 9471 } 9472 9473 void VPWidenRecipe::execute(VPTransformState &State) { 9474 auto &I = *cast<Instruction>(getUnderlyingValue()); 9475 auto &Builder = State.Builder; 9476 switch (I.getOpcode()) { 9477 case Instruction::Call: 9478 case Instruction::Br: 9479 case Instruction::PHI: 9480 case Instruction::GetElementPtr: 9481 case Instruction::Select: 9482 llvm_unreachable("This instruction is handled by a different recipe."); 9483 case Instruction::UDiv: 9484 case Instruction::SDiv: 9485 case Instruction::SRem: 9486 case Instruction::URem: 9487 case Instruction::Add: 9488 case Instruction::FAdd: 9489 case Instruction::Sub: 9490 case Instruction::FSub: 9491 case Instruction::FNeg: 9492 case Instruction::Mul: 9493 case Instruction::FMul: 9494 case Instruction::FDiv: 9495 case Instruction::FRem: 9496 case Instruction::Shl: 9497 case Instruction::LShr: 9498 case Instruction::AShr: 9499 case Instruction::And: 9500 case Instruction::Or: 9501 case Instruction::Xor: { 9502 // Just widen unops and binops. 9503 State.ILV->setDebugLocFromInst(&I); 9504 9505 for (unsigned Part = 0; Part < State.UF; ++Part) { 9506 SmallVector<Value *, 2> Ops; 9507 for (VPValue *VPOp : operands()) 9508 Ops.push_back(State.get(VPOp, Part)); 9509 9510 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 9511 9512 if (auto *VecOp = dyn_cast<Instruction>(V)) { 9513 VecOp->copyIRFlags(&I); 9514 9515 // If the instruction is vectorized and was in a basic block that needed 9516 // predication, we can't propagate poison-generating flags (nuw/nsw, 9517 // exact, etc.). The control flow has been linearized and the 9518 // instruction is no longer guarded by the predicate, which could make 9519 // the flag properties to no longer hold. 9520 if (State.MayGeneratePoisonRecipes.count(this) > 0) 9521 VecOp->dropPoisonGeneratingFlags(); 9522 } 9523 9524 // Use this vector value for all users of the original instruction. 9525 State.set(this, V, Part); 9526 State.ILV->addMetadata(V, &I); 9527 } 9528 9529 break; 9530 } 9531 case Instruction::ICmp: 9532 case Instruction::FCmp: { 9533 // Widen compares. Generate vector compares. 9534 bool FCmp = (I.getOpcode() == Instruction::FCmp); 9535 auto *Cmp = cast<CmpInst>(&I); 9536 State.ILV->setDebugLocFromInst(Cmp); 9537 for (unsigned Part = 0; Part < State.UF; ++Part) { 9538 Value *A = State.get(getOperand(0), Part); 9539 Value *B = State.get(getOperand(1), Part); 9540 Value *C = nullptr; 9541 if (FCmp) { 9542 // Propagate fast math flags. 
9543 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9544 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9545 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9546 } else { 9547 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9548 } 9549 State.set(this, C, Part); 9550 State.ILV->addMetadata(C, &I); 9551 } 9552 9553 break; 9554 } 9555 9556 case Instruction::ZExt: 9557 case Instruction::SExt: 9558 case Instruction::FPToUI: 9559 case Instruction::FPToSI: 9560 case Instruction::FPExt: 9561 case Instruction::PtrToInt: 9562 case Instruction::IntToPtr: 9563 case Instruction::SIToFP: 9564 case Instruction::UIToFP: 9565 case Instruction::Trunc: 9566 case Instruction::FPTrunc: 9567 case Instruction::BitCast: { 9568 auto *CI = cast<CastInst>(&I); 9569 State.ILV->setDebugLocFromInst(CI); 9570 9571 /// Vectorize casts. 9572 Type *DestTy = (State.VF.isScalar()) 9573 ? CI->getType() 9574 : VectorType::get(CI->getType(), State.VF); 9575 9576 for (unsigned Part = 0; Part < State.UF; ++Part) { 9577 Value *A = State.get(getOperand(0), Part); 9578 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9579 State.set(this, Cast, Part); 9580 State.ILV->addMetadata(Cast, &I); 9581 } 9582 break; 9583 } 9584 default: 9585 // This instruction is not vectorized by simple widening. 9586 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9587 llvm_unreachable("Unhandled instruction!"); 9588 } // end of switch. 9589 } 9590 9591 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9592 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9593 // Construct a vector GEP by widening the operands of the scalar GEP as 9594 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9595 // results in a vector of pointers when at least one operand of the GEP 9596 // is vector-typed. Thus, to keep the representation compact, we only use 9597 // vector-typed operands for loop-varying values. 9598 9599 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9600 // If we are vectorizing, but the GEP has only loop-invariant operands, 9601 // the GEP we build (by only using vector-typed operands for 9602 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9603 // produce a vector of pointers, we need to either arbitrarily pick an 9604 // operand to broadcast, or broadcast a clone of the original GEP. 9605 // Here, we broadcast a clone of the original. 9606 // 9607 // TODO: If at some point we decide to scalarize instructions having 9608 // loop-invariant operands, this special case will no longer be 9609 // required. We would add the scalarization decision to 9610 // collectLoopScalars() and teach getVectorValue() to broadcast 9611 // the lane-zero scalar value. 9612 auto *Clone = State.Builder.Insert(GEP->clone()); 9613 for (unsigned Part = 0; Part < State.UF; ++Part) { 9614 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9615 State.set(this, EntryPart, Part); 9616 State.ILV->addMetadata(EntryPart, GEP); 9617 } 9618 } else { 9619 // If the GEP has at least one loop-varying operand, we are sure to 9620 // produce a vector of pointers. But if we are only unrolling, we want 9621 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9622 // produce with the code below will be scalar (if VF == 1) or vector 9623 // (otherwise). Note that for the unroll-only case, we still maintain 9624 // values in the vector mapping with initVector, as we do for other 9625 // instructions. 
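// For each unroll part, loop-invariant operands are taken from lane 0 while
// loop-varying operands use the widened per-part values, so the GEP built
// below stays as scalar as possible while still producing a vector of
// pointers when VF > 1.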
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = IsPtrLoopInvariant
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (IsIndexLoopInvariant[I - 1])
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // If the GEP instruction is vectorized and was in a basic block that
      // needed predication, we can't propagate the poison-generating
      // 'inbounds' flag. The control flow has been linearized and the GEP is
      // no longer guarded by the predicate, so the 'inbounds' property may no
      // longer hold.
      bool IsInBounds =
          GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = IsInBounds
                         ? State.Builder.CreateInBoundsGEP(
                               GEP->getSourceElementType(), Ptr, Indices)
                         : State.Builder.CreateGEP(GEP->getSourceElementType(),
                                                   Ptr, Indices);
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.ILV->addMetadata(NewGEP, GEP);
    }
  }
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");
  State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(),
                                   getStartValue()->getLiveInIRValue(),
                                   getTruncInst(), getVPValue(0), State);
}

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
                                 State);
}

void VPBlendRecipe::execute(VPTransformState &State) {
  State.ILV->setDebugLocFromInst(Phi, &State.Builder);
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi,
  // and which are essentially undef, are taken from In0.
  InnerLoopVectorizer::VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 = State.get(getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
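        // (Illustrative) Each step emits e.g.
        //   %predphi = select <VF x i1> %mask, <VF x ty> %inN, <VF x ty> %prev
        // folding the new incoming value into the accumulated select chain.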
        Value *Cond = State.get(getMask(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.set(this, Entry[Part], Part);
}

void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
                                      getStoredValues(), getMask());
}

void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Reduction being replicated.");
  Value *PrevInChain = State.get(getChainOp(), 0);
  RecurKind Kind = RdxDesc->getRecurrenceKind();
  bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewVecOp = State.get(getVecOp(), Part);
    if (VPValue *Cond = getCondOp()) {
      Value *NewCond = State.get(Cond, Part);
      VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
      Value *Iden = RdxDesc->getRecurrenceIdentity(
          Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
      Value *IdenVec =
          State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
      NewVecOp = Select;
    }
    Value *NewRed;
    Value *NextInChain;
    if (IsOrdered) {
      if (State.VF.isVector())
        NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
                                        PrevInChain);
      else
        NewRed = State.Builder.CreateBinOp(
            (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
            NewVecOp);
      PrevInChain = NewRed;
    } else {
      PrevInChain = State.get(getChainOp(), Part);
      NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      NextInChain =
          createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
                         NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
          PrevInChain);
    State.set(this, NextInChain, Part);
  }
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
    State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
                                    IsPredicated, State);
    // Insert scalar instance, packing it into a vector.
    if (AlsoPack && State.VF.isVector()) {
      // If we're constructing lane 0, initialize to start from poison.
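      // (Illustrative) Packing builds the vector lane by lane: the first lane
      // seeds a poison vector of length VF, and each subsequent instance
      // inserts its scalar result at its own lane position.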
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.set(this, Poison, State.Instance->Part);
      }
      State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
                                      VPIteration(Part, Lane), IsPredicated,
                                      State);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
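  // (Illustrative) In the vector case the generated PHI has the shape:
  //   %vphi = phi <VF x ty> [ <vector before insert>, %predicating.bb ],
  //                         [ <vector with inserted lane>, %predicated.bb ]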
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}

void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;

  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
  StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);

  assert((LI || SI) && "Invalid Load/Store instruction");
  assert((!SI || StoredValue) && "No stored value provided for widened store");
  assert((!LI || !StoredValue) && "Stored value provided for widened load");

  Type *ScalarDataTy = getLoadStoreType(&Ingredient);

  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
  const Align Alignment = getLoadStoreAlignment(&Ingredient);
  bool CreateGatherScatter = !Consecutive;

  auto &Builder = State.Builder;
  InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
  bool isMaskRequired = getMask();
  if (isMaskRequired)
    for (unsigned Part = 0; Part < State.UF; ++Part)
      BlockInMaskParts[Part] = State.get(getMask(), Part);

  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
    // Calculate the pointer for the specific unroll-part.
    GetElementPtrInst *PartPtr = nullptr;

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
      InBounds = gep->isInBounds();
    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
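      // For example (illustrative): with a fixed VF of 4, part 0 ends up
      // addressing elements at offsets -3..0 from the scalar pointer and
      // part 1 at offsets -7..-4, and the vector value is reversed to match.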
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width vectors VScale is 1, so RunTimeVF = VF.getKnownMinValue()
      Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
      // NumElt = -Part * RunTimeVF
      Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
      // LastLane = 1 - RunTimeVF
      Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
      PartPtr =
          cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
      PartPtr->setIsInBounds(InBounds);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
      PartPtr->setIsInBounds(InBounds);
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
        BlockInMaskParts[Part] =
            Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
    } else {
      Value *Increment =
          createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
      PartPtr->setIsInBounds(InBounds);
    }

    unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
  };

  // Handle Stores:
  if (SI) {
    State.ILV->setDebugLocFromInst(SI);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Instruction *NewSI = nullptr;
      Value *StoredVal = State.get(StoredValue, Part);
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
        Value *VectorGep = State.get(getAddr(), Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                            MaskPart);
      } else {
        if (Reverse) {
          // If we store to reverse consecutive memory locations, then we need
          // to reverse the order of elements in the stored value.
          StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
          // We don't want to update the value in the map as it might be used
          // in another expression. So don't call resetVectorValue(StoredVal).
        }
        auto *VecPtr =
            CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
        if (isMaskRequired)
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                            BlockInMaskParts[Part]);
        else
          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
      }
      State.ILV->addMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  State.ILV->setDebugLocFromInst(LI);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewLI;
    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
      Value *VectorGep = State.get(getAddr(), Part);
      NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
                                         nullptr, "wide.masked.gather");
      State.ILV->addMetadata(NewLI, LI);
    } else {
      auto *VecPtr =
          CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
      if (isMaskRequired)
        NewLI = Builder.CreateMaskedLoad(
            DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
            PoisonValue::get(DataTy), "wide.masked.load");
      else
        NewLI =
            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");

      // Add metadata to the load, but setVectorValue to the reverse shuffle.
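      // (Note) That is, the metadata goes on the wide load itself, while the
      // value recorded for users below is the reversed shuffle when Reverse
      // is set.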
      State.ILV->addMetadata(NewLI, LI);
      if (Reverse)
        NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
    }

    State.set(getVPSingleValue(), NewLI, Part);
  }
}

// Determine how to lower the scalar epilogue, which depends on 1) optimising
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyses whether the loop is suitable
// for predication.
static ScalarEpilogueLowering getScalarEpilogueLowering(
    Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
    BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
    AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
    LoopVectorizationLegality &LVL) {
  // 1) OptSize takes precedence over all other options, i.e. if this is set,
  // don't look at hints or options, and don't request a scalar epilogue.
  // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
  // LoopAccessInfo (due to code dependency and not being able to reliably get
  // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
  // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
  // versioning when the vectorization is forced, unlike hasOptSize. So revert
  // to the old way and vectorize with versioning when forced. See D81345.)
  if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
                                                      PGSOQueryType::IRPass) &&
                          Hints.getForce() != LoopVectorizeHints::FK_Enabled))
    return CM_ScalarEpilogueNotAllowedOptSize;

  // 2) If set, obey the directives
  if (PreferPredicateOverEpilogue.getNumOccurrences()) {
    switch (PreferPredicateOverEpilogue) {
    case PreferPredicateTy::ScalarEpilogue:
      return CM_ScalarEpilogueAllowed;
    case PreferPredicateTy::PredicateElseScalarEpilogue:
      return CM_ScalarEpilogueNotNeededUsePredicate;
    case PreferPredicateTy::PredicateOrDontVectorize:
      return CM_ScalarEpilogueNotAllowedUsePredicate;
    };
  }

  // 3) If set, obey the hints
  switch (Hints.getPredicate()) {
  case LoopVectorizeHints::FK_Enabled:
    return CM_ScalarEpilogueNotNeededUsePredicate;
  case LoopVectorizeHints::FK_Disabled:
    return CM_ScalarEpilogueAllowed;
  };

  // 4) If the TTI hook indicates this is profitable, request predication.
  if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                       LVL.getLAI()))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

Value *VPTransformState::get(VPValue *Def, unsigned Part) {
  // If Values have been set for this Def, return the one relevant for \p Part.
  if (hasVectorValue(Def, Part))
    return Data.PerPartOutput[Def][Part];

  if (!hasScalarValue(Def, {Part, 0})) {
    Value *IRV = Def->getLiveInIRValue();
    Value *B = ILV->getBroadcastInstrs(IRV);
    set(Def, B, Part);
    return B;
  }

  Value *ScalarValue = get(Def, {Part, 0});
  // If we aren't vectorizing, we can just copy the scalar map values over
  // to the vector map.
  if (VF.isScalar()) {
    set(Def, ScalarValue, Part);
    return ScalarValue;
  }

  auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
  bool IsUniform = RepR && RepR->isUniform();

  unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
  // Check if there is a scalar value for the selected lane.
  if (!hasScalarValue(Def, {Part, LastLane})) {
    // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
    assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
           "unexpected recipe found to be invariant");
    IsUniform = true;
    LastLane = 0;
  }

  auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
  // Set the insert point after the last scalarized instruction or after the
  // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
  // will directly follow the scalar definitions.
  auto OldIP = Builder.saveIP();
  auto NewIP =
      isa<PHINode>(LastInst)
          ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
          : std::next(BasicBlock::iterator(LastInst));
  Builder.SetInsertPoint(&*NewIP);

  // However, if we are vectorizing, we need to construct the vector values.
  // If the value is known to be uniform after vectorization, we can just
  // broadcast the scalar value corresponding to lane zero for each unroll
  // iteration. Otherwise, we construct the vector values using
  // insertelement instructions. Since the resulting vectors are stored in
  // State, we will only generate the insertelements once.
  Value *VectorValue = nullptr;
  if (IsUniform) {
    VectorValue = ILV->getBroadcastInstrs(ScalarValue);
    set(Def, VectorValue, Part);
  } else {
    // Initialize packing with insertelements to start from poison.
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Poison, Part);
    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
      ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
    VectorValue = get(Def, Part);
  }
  Builder.restoreIP(OldIP);
  return VectorValue;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
    LoopVectorizationRequirements &Requirements) {

  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  CM.collectElementTypesForWidening();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);

  {
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                           &CM, BFI, PSI, Checks);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
  }

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with floating point
// extensions, there will be a performance penalty from the conversion overhead
// and the change in the vector width.
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
  SmallVector<Instruction *, 4> Worklist;
  for (BasicBlock *BB : L->getBlocks()) {
    for (Instruction &Inst : *BB) {
      if (auto *S = dyn_cast<StoreInst>(&Inst)) {
        if (S->getValueOperand()->getType()->isFloatTy())
          Worklist.push_back(S);
      }
    }
  }

  // Traverse the floating point stores upwards, searching for floating point
  // conversions.
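  // (Illustrative) A typical pattern is a float value that is extended to
  // double for the arithmetic and truncated back to float just before the
  // store; the fpext found on the use-def chain triggers the remark below.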
  SmallPtrSet<const Instruction *, 4> Visited;
  SmallPtrSet<const Instruction *, 4> EmittedRemark;
  while (!Worklist.empty()) {
    auto *I = Worklist.pop_back_val();
    if (!L->contains(I))
      continue;
    if (!Visited.insert(I).second)
      continue;

    // Emit a remark if the floating point store required a floating
    // point conversion.
    // TODO: More work could be done to identify the root cause such as a
    // constant or a function return type and point the user to it.
    if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
                                          I->getDebugLoc(), L->getHeader())
               << "floating point conversion changes vector width. "
               << "Mixed floating point precision requires an up/down "
               << "cast that will negatively impact performance.";
      });

    for (Use &Op : I->operands())
      if (auto *OpI = dyn_cast<Instruction>(Op))
        Worklist.push_back(OpI);
  }
}

LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}

bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->isInnermost()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " interleave=" << Hints.getInterleave() << "\n");

  // Function containing the loop.
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose, reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements;
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints, Requirements);

  assert(L->isInnermost() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem right -- what if the loop is an integer
  // loop and the vector instructions selected are purely integer vector
  // instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool AllowOrderedReductions;
  // If the flag is set, use that instead and override the TTI behaviour.
  if (ForceOrderedReductions.getNumOccurrences() > 0)
    AllowOrderedReductions = ForceOrderedReductions;
  else
    AllowOrderedReductions = TTI->enableOrderedReductions();
  if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out not
    // to be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that vectorizing the loop is not profitable, then
      // interleave it instead.
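      // (Note, illustrative) InnerLoopUnroller is an InnerLoopVectorizer
      // specialized to a vectorization factor of 1, so it replicates the
      // scalar loop body IC times without widening any instructions.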
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);

      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
      LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is profitable to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
        LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
                        DT);
        ++LoopsVectorized;

        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // The second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);

        VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
        LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
                        DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);

        VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling of the scalar loop when
        // there are no runtime checks about strides and memory. A scalar loop
        // that is rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
                                      TLI, TTI, nullptr, nullptr, nullptr};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve LoopInfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
}

void LoopVectorizePass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
      OS, MapClassName2PassName);

  OS << "<";
  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
  OS << ">";
}