//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
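// As an illustrative sketch (not taken from this pass), a loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is, for a vectorization factor of 4, conceptually rewritten so that each
// wide iteration loads b[i..i+3] and c[i..i+3], adds them with a single
// vector instruction, stores a[i..i+3], and increments i by 4; any leftover
// iterations run in a scalar epilogue loop.
//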
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; it lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));
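
// For illustration only (assumed invocation, not part of this file): the
// force-target-* flags above and below are typically used from 'opt' to make
// tests independent of the host target, e.g.
//   opt -passes=loop-vectorize -force-target-max-vector-interleave=1 \
//       -force-target-instruction-cost=1 -S input.ll
// where each option name matches the string in its cl::opt declaration.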

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
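
// Illustrative example (assumption, not exercised here): with the default
// data layout an i1 has a type size of 1 bit but an alloc size of 8 bits, so
// an array of N x i1 is not bitcast compatible with <N x i1>;
// hasIrregularType() returns true and such accesses cannot be widened by
// simply reinterpreting the underlying memory.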

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}
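
// Illustrative examples of the procedure above (assumed inputs):
//   * 'for (i = 0; i < 100; ++i)'                      -> 100 (exact count).
//   * an unknown bound whose profile metadata says it
//     runs about 8 iterations per entry                -> 8 (profile estimate).
//   * 'for (i = 0; i < n; ++i)' where SCEV can prove
//     n <= 64                                          -> 64 (upper bound).
//   * none of the above                                -> None.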

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p RepRecipe instead
  /// of \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID,
                             Value *Start, TruncInst *Trunc, VPValue *Def,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Set the debug location in the builder using the debug location in \p V.
  /// If \p CustomBuilder is None, the class member's Builder is used instead.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilder<> *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *
  getStepVector(Value *Val, Value *StartIdx, Value *Step,
                Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPTransformState &State);
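
  // Worked example for the two helpers above (assumed values): for VF = 4,
  // Val = <i32 0, 0, 0, 0>, StartIdx = 0 and Step = 2, getStepVector()
  // produces <0, 2, 4, 6>; buildScalarSteps() materializes the corresponding
  // per-lane scalars 0, 2, 4, 6 for values that remain scalar after
  // vectorization.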

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  ///  * Contribute to the address computation of a recipe generating a widen
  ///    memory load/store (VPWidenMemoryInstructionRecipe or
  ///    VPInterleaveRecipe).
  ///  * Such a widen memory load/store has at least one underlying Instruction
  ///    that is in a basic block that needs predication and after vectorization
  ///    the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(
      Value *Val, Value *StartIdx, Value *Step,
      Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};
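
// Illustrative use (hypothetical values): a planner that picks VF=8/UF=2 for
// the main loop and VF=4/UF=1 for the epilogue would construct
//   EpilogueLoopVectorizationInfo EPI(ElementCount::getFixed(8), 2,
//                                     ElementCount::getFixed(4), 1);
// and hand it to the two vectorizer stages below; the assert above enforces
// an epilogue unroll factor of 1.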

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
  IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When a FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
                              int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}
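
// Illustrative examples (assumed values and IR): for a fixed VF of 4 and
// Step = 2, createStepForVF() simply returns the constant 8. For a scalable
// VF of <vscale x 4> it instead emits, via IRBuilder::CreateVScale(), a
// runtime computation equivalent to
//   %vscale = call i64 @llvm.vscale.i64()
//   %step   = mul i64 %vscale, 8
// Similarly, getRuntimeVF() returns VF.getKnownMinValue(), scaled by vscale
// when the VF is scalable.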

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a VPWidenRecipe
  // or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
            Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form it takes after vectorization depends on cost. This function makes
  /// cost-based decisions for Load/Store instructions and collects them in a
  /// map. This decision map is used for building the lists of loop-uniform
  /// and loop-scalar instructions. The calculated cost is saved with the
  /// widening decision in order to avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In loop reductions are collected into
  /// InLoopReductionChains.
1363 void collectInLoopReductions(); 1364 1365 /// Returns true if we should use strict in-order reductions for the given 1366 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1367 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1368 /// of FP operations. 1369 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { 1370 return !Hints->allowReordering() && RdxDesc.isOrdered(); 1371 } 1372 1373 /// \returns The smallest bitwidth each instruction can be represented with. 1374 /// The vector equivalents of these instructions should be truncated to this 1375 /// type. 1376 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1377 return MinBWs; 1378 } 1379 1380 /// \returns True if it is more profitable to scalarize instruction \p I for 1381 /// vectorization factor \p VF. 1382 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1383 assert(VF.isVector() && 1384 "Profitable to scalarize relevant only for VF > 1."); 1385 1386 // Cost model is not run in the VPlan-native path - return conservative 1387 // result until this changes. 1388 if (EnableVPlanNativePath) 1389 return false; 1390 1391 auto Scalars = InstsToScalarize.find(VF); 1392 assert(Scalars != InstsToScalarize.end() && 1393 "VF not yet analyzed for scalarization profitability"); 1394 return Scalars->second.find(I) != Scalars->second.end(); 1395 } 1396 1397 /// Returns true if \p I is known to be uniform after vectorization. 1398 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1399 if (VF.isScalar()) 1400 return true; 1401 1402 // Cost model is not run in the VPlan-native path - return conservative 1403 // result until this changes. 1404 if (EnableVPlanNativePath) 1405 return false; 1406 1407 auto UniformsPerVF = Uniforms.find(VF); 1408 assert(UniformsPerVF != Uniforms.end() && 1409 "VF not yet analyzed for uniformity"); 1410 return UniformsPerVF->second.count(I); 1411 } 1412 1413 /// Returns true if \p I is known to be scalar after vectorization. 1414 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1415 if (VF.isScalar()) 1416 return true; 1417 1418 // Cost model is not run in the VPlan-native path - return conservative 1419 // result until this changes. 1420 if (EnableVPlanNativePath) 1421 return false; 1422 1423 auto ScalarsPerVF = Scalars.find(VF); 1424 assert(ScalarsPerVF != Scalars.end() && 1425 "Scalar values are not calculated for VF"); 1426 return ScalarsPerVF->second.count(I); 1427 } 1428 1429 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1430 /// for vectorization factor \p VF. 1431 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1432 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1433 !isProfitableToScalarize(I, VF) && 1434 !isScalarAfterVectorization(I, VF); 1435 } 1436 1437 /// Decision that was taken during cost calculation for memory instruction. 1438 enum InstWidening { 1439 CM_Unknown, 1440 CM_Widen, // For consecutive accesses with stride +1. 1441 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1442 CM_Interleave, 1443 CM_GatherScatter, 1444 CM_Scalarize 1445 }; 1446 1447 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1448 /// instruction \p I and vector width \p VF. 
1449 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1450 InstructionCost Cost) {
1451 assert(VF.isVector() && "Expected VF >=2");
1452 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1453 }
1454
1455 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1456 /// interleaving group \p Grp and vector width \p VF.
1457 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1458 ElementCount VF, InstWidening W,
1459 InstructionCost Cost) {
1460 assert(VF.isVector() && "Expected VF >=2");
1461 // Broadcast this decision to all instructions inside the group.
1462 // But the cost will be assigned to one instruction only.
1463 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1464 if (auto *I = Grp->getMember(i)) {
1465 if (Grp->getInsertPos() == I)
1466 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1467 else
1468 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1469 }
1470 }
1471 }
1472
1473 /// Return the cost model decision for the given instruction \p I and vector
1474 /// width \p VF. Return CM_Unknown if this instruction did not pass
1475 /// through the cost modeling.
1476 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1477 assert(VF.isVector() && "Expected VF to be a vector VF");
1478 // Cost model is not run in the VPlan-native path - return conservative
1479 // result until this changes.
1480 if (EnableVPlanNativePath)
1481 return CM_GatherScatter;
1482
1483 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1484 auto Itr = WideningDecisions.find(InstOnVF);
1485 if (Itr == WideningDecisions.end())
1486 return CM_Unknown;
1487 return Itr->second.first;
1488 }
1489
1490 /// Return the vectorization cost for the given instruction \p I and vector
1491 /// width \p VF.
1492 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1493 assert(VF.isVector() && "Expected VF >=2");
1494 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1495 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1496 "The cost is not calculated");
1497 return WideningDecisions[InstOnVF].second;
1498 }
1499
1500 /// Return True if instruction \p I is an optimizable truncate whose operand
1501 /// is an induction variable. Such a truncate will be removed by adding a new
1502 /// induction variable with the destination type.
1503 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1504 // If the instruction is not a truncate, return false.
1505 auto *Trunc = dyn_cast<TruncInst>(I);
1506 if (!Trunc)
1507 return false;
1508
1509 // Get the source and destination types of the truncate.
1510 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1511 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1512
1513 // If the truncate is free for the given types, return false. Replacing a
1514 // free truncate with an induction variable would add an induction variable
1515 // update instruction to each iteration of the loop. We exclude from this
1516 // check the primary induction variable since it will need an update
1517 // instruction regardless.
1518 Value *Op = Trunc->getOperand(0);
1519 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1520 return false;
1521
1522 // If the truncated value is not an induction variable, return false.
1523 return Legal->isInductionPhi(Op);
1524 }
1525
1526 /// Collects the instructions to scalarize for each predicated instruction in
1527 /// the loop.
1528 void collectInstsToScalarize(ElementCount VF);
1529
1530 /// Collect Uniform and Scalar values for the given \p VF.
1531 /// The sets depend on CM decision for Load/Store instructions
1532 /// that may be vectorized as interleave, gather-scatter or scalarized.
1533 void collectUniformsAndScalars(ElementCount VF) {
1534 // Do the analysis once.
1535 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1536 return;
1537 setCostBasedWideningDecision(VF);
1538 collectLoopUniforms(VF);
1539 collectLoopScalars(VF);
1540 }
1541
1542 /// Returns true if the target machine supports masked store operation
1543 /// for the given \p DataType and kind of access to \p Ptr.
1544 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1545 return Legal->isConsecutivePtr(DataType, Ptr) &&
1546 TTI.isLegalMaskedStore(DataType, Alignment);
1547 }
1548
1549 /// Returns true if the target machine supports masked load operation
1550 /// for the given \p DataType and kind of access to \p Ptr.
1551 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1552 return Legal->isConsecutivePtr(DataType, Ptr) &&
1553 TTI.isLegalMaskedLoad(DataType, Alignment);
1554 }
1555
1556 /// Returns true if the target machine can represent \p V as a masked gather
1557 /// or scatter operation.
1558 bool isLegalGatherOrScatter(Value *V) {
1559 bool LI = isa<LoadInst>(V);
1560 bool SI = isa<StoreInst>(V);
1561 if (!LI && !SI)
1562 return false;
1563 auto *Ty = getLoadStoreType(V);
1564 Align Align = getLoadStoreAlignment(V);
1565 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1566 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1567 }
1568
1569 /// Returns true if the target machine supports all of the reduction
1570 /// variables found for the given VF.
1571 bool canVectorizeReductions(ElementCount VF) const {
1572 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1573 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1574 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1575 }));
1576 }
1577
1578 /// Returns true if \p I is an instruction that will be scalarized with
1579 /// predication. Such instructions include conditional stores and
1580 /// instructions that may divide by zero.
1581 /// If a non-zero VF has been calculated, we check if I will be scalarized
1582 /// with predication for that VF.
1583 bool isScalarWithPredication(Instruction *I) const;
1584
1585 // Returns true if \p I is an instruction that will be predicated either
1586 // through scalar predication or masked load/store or masked gather/scatter.
1587 // Superset of instructions that return true for isScalarWithPredication.
1588 bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) {
1589 // When we know the load is uniform and the original scalar loop was not
1590 // predicated we don't need to mark it as a predicated instruction. Any
1591 // vectorised blocks created when tail-folding are something artificial we
1592 // have introduced and we know there is always at least one active lane.
1593 // That's why we call Legal->blockNeedsPredication here because it doesn't
1594 // query tail-folding.
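// Illustrative example (a sketch, not taken from the original comments): for a loop
// such as
//   for (i = 0; i < n; ++i) sum += *invariant_ptr;
// the uniform load of *invariant_ptr lives in a block that needed no predication in
// the original scalar loop, so even when the vector body is tail-folded it is not
// treated as a predicated instruction here; the introduced mask always has at least
// one active lane.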
1595 if (IsKnownUniform && isa<LoadInst>(I) &&
1596 !Legal->blockNeedsPredication(I->getParent()))
1597 return false;
1598 if (!blockNeedsPredicationForAnyReason(I->getParent()))
1599 return false;
1600 // Loads and stores that need some form of masked operation are predicated
1601 // instructions.
1602 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1603 return Legal->isMaskRequired(I);
1604 return isScalarWithPredication(I);
1605 }
1606
1607 /// Returns true if \p I is a memory instruction with consecutive memory
1608 /// access that can be widened.
1609 bool
1610 memoryInstructionCanBeWidened(Instruction *I,
1611 ElementCount VF = ElementCount::getFixed(1));
1612
1613 /// Returns true if \p I is a memory instruction in an interleaved-group
1614 /// of memory accesses that can be vectorized with wide vector loads/stores
1615 /// and shuffles.
1616 bool
1617 interleavedAccessCanBeWidened(Instruction *I,
1618 ElementCount VF = ElementCount::getFixed(1));
1619
1620 /// Check if \p Instr belongs to any interleaved access group.
1621 bool isAccessInterleaved(Instruction *Instr) {
1622 return InterleaveInfo.isInterleaved(Instr);
1623 }
1624
1625 /// Get the interleaved access group that \p Instr belongs to.
1626 const InterleaveGroup<Instruction> *
1627 getInterleavedAccessGroup(Instruction *Instr) {
1628 return InterleaveInfo.getInterleaveGroup(Instr);
1629 }
1630
1631 /// Returns true if we're required to use a scalar epilogue for at least
1632 /// the final iteration of the original loop.
1633 bool requiresScalarEpilogue(ElementCount VF) const {
1634 if (!isScalarEpilogueAllowed())
1635 return false;
1636 // If we might exit from anywhere but the latch, we must run the exiting
1637 // iteration in scalar form.
1638 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1639 return true;
1640 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1641 }
1642
1643 /// Returns true if a scalar epilogue is allowed, i.e. it has not been
1644 /// disallowed due to optsize or a loop hint annotation.
1645 bool isScalarEpilogueAllowed() const {
1646 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1647 }
1648
1649 /// Returns true if all loop blocks should be masked to fold tail loop.
1650 bool foldTailByMasking() const { return FoldTailByMasking; }
1651
1652 /// Returns true if the instructions in this block require predication
1653 /// for any reason, e.g. because tail folding now requires a predicate
1654 /// or because the block in the original loop was predicated.
1655 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1656 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1657 }
1658
1659 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1660 /// nodes to the chain of instructions representing the reductions. Uses a
1661 /// MapVector to ensure deterministic iteration order.
1662 using ReductionChainMap =
1663 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1664
1665 /// Return the chain of instructions representing an inloop reduction.
1666 const ReductionChainMap &getInLoopReductionChains() const {
1667 return InLoopReductionChains;
1668 }
1669
1670 /// Returns true if the Phi is part of an inloop reduction.
1671 bool isInLoopReduction(PHINode *Phi) const {
1672 return InLoopReductionChains.count(Phi);
1673 }
1674
1675 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1676 /// with factor VF. Return the cost of the instruction, including
1677 /// scalarization overhead if it's needed.
1678 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1679
1680 /// Estimate cost of a call instruction CI if it were vectorized with factor
1681 /// VF. Return the cost of the instruction, including scalarization overhead
1682 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1683 /// scalarized -
1684 /// i.e. either a vector version isn't available, or it is too expensive.
1685 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1686 bool &NeedToScalarize) const;
1687
1688 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1689 /// that of B.
1690 bool isMoreProfitable(const VectorizationFactor &A,
1691 const VectorizationFactor &B) const;
1692
1693 /// Invalidates decisions already taken by the cost model.
1694 void invalidateCostModelingDecisions() {
1695 WideningDecisions.clear();
1696 Uniforms.clear();
1697 Scalars.clear();
1698 }
1699
1700 private:
1701 unsigned NumPredStores = 0;
1702
1703 /// \return An upper bound for the vectorization factors for both
1704 /// fixed and scalable vectorization, where the minimum-known number of
1705 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1706 /// disabled or unsupported, then the scalable part will be equal to
1707 /// ElementCount::getScalable(0).
1708 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1709 ElementCount UserVF);
1710
1711 /// \return the maximized element count based on the target's vector
1712 /// registers and the loop trip-count, but limited to a maximum safe VF.
1713 /// This is a helper function of computeFeasibleMaxVF.
1714 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1715 /// issue that occurred on one of the buildbots which cannot be reproduced
1716 /// without having access to the proprietary compiler (see comments on
1717 /// D98509). The issue is currently under investigation and this workaround
1718 /// will be removed as soon as possible.
1719 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1720 unsigned SmallestType,
1721 unsigned WidestType,
1722 const ElementCount &MaxSafeVF);
1723
1724 /// \return the maximum legal scalable VF, based on the safe max number
1725 /// of elements.
1726 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1727
1728 /// The vectorization cost is a combination of the cost itself and a boolean
1729 /// indicating whether any of the contributing operations will actually
1730 /// operate on vector values after type legalization in the backend. If this
1731 /// latter value is false, then all operations will be scalarized (i.e. no
1732 /// vectorization has actually taken place).
1733 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1734
1735 /// Returns the expected execution cost. The unit of the cost does
1736 /// not matter because we use the 'cost' units to compare different
1737 /// vector widths. The cost that is returned is *not* normalized by
1738 /// the factor width. If \p Invalid is not nullptr, this function
1739 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1740 /// each instruction that has an Invalid cost for the given VF.
1741 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1742 VectorizationCostTy
1743 expectedCost(ElementCount VF,
1744 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1745
1746 /// Returns the execution time cost of an instruction for a given vector
1747 /// width. Vector width of one means scalar.
1748 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1749
1750 /// The cost-computation logic from getInstructionCost which provides
1751 /// the vector type as an output parameter.
1752 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1753 Type *&VectorTy);
1754
1755 /// Return the cost of instructions in an inloop reduction pattern, if I is
1756 /// part of that pattern.
1757 Optional<InstructionCost>
1758 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1759 TTI::TargetCostKind CostKind);
1760
1761 /// Calculate vectorization cost of memory instruction \p I.
1762 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1763
1764 /// The cost computation for scalarized memory instruction.
1765 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1766
1767 /// The cost computation for interleaving group of memory instructions.
1768 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1769
1770 /// The cost computation for Gather/Scatter instruction.
1771 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1772
1773 /// The cost computation for widening instruction \p I with consecutive
1774 /// memory access.
1775 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1776
1777 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1778 /// Load: scalar load + broadcast.
1779 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1780 /// element)
1781 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1782
1783 /// Estimate the overhead of scalarizing an instruction. This is a
1784 /// convenience wrapper for the type-based getScalarizationOverhead API.
1785 InstructionCost getScalarizationOverhead(Instruction *I,
1786 ElementCount VF) const;
1787
1788 /// Returns whether the instruction is a load or store and will be emitted
1789 /// as a vector operation.
1790 bool isConsecutiveLoadOrStore(Instruction *I);
1791
1792 /// Returns true if an artificially high cost for emulated masked memrefs
1793 /// should be used.
1794 bool useEmulatedMaskMemRefHack(Instruction *I);
1795
1796 /// Map of scalar integer values to the smallest bitwidth they can be legally
1797 /// represented as. The vector equivalents of these values should be truncated
1798 /// to this type.
1799 MapVector<Instruction *, uint64_t> MinBWs;
1800
1801 /// A type representing the costs for instructions if they were to be
1802 /// scalarized rather than vectorized. The entries are Instruction-Cost
1803 /// pairs.
1804 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1805
1806 /// A set containing all BasicBlocks that are known to be present after
1807 /// vectorization as predicated blocks.
1808 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1809
1810 /// Records whether it is allowed to have the original scalar loop execute at
1811 /// least once. This may be needed as a fallback loop in case runtime
1812 /// aliasing/dependence checks fail, or to handle the tail/remainder
1813 /// iterations when the trip count is unknown or doesn't divide by the VF,
1814 /// or as a peel-loop to handle gaps in interleave-groups.
1815 /// Under optsize and when the trip count is very small we don't allow any
1816 /// iterations to execute in the scalar loop.
1817 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1818 1819 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1820 bool FoldTailByMasking = false; 1821 1822 /// A map holding scalar costs for different vectorization factors. The 1823 /// presence of a cost for an instruction in the mapping indicates that the 1824 /// instruction will be scalarized when vectorizing with the associated 1825 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1826 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1827 1828 /// Holds the instructions known to be uniform after vectorization. 1829 /// The data is collected per VF. 1830 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1831 1832 /// Holds the instructions known to be scalar after vectorization. 1833 /// The data is collected per VF. 1834 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1835 1836 /// Holds the instructions (address computations) that are forced to be 1837 /// scalarized. 1838 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1839 1840 /// PHINodes of the reductions that should be expanded in-loop along with 1841 /// their associated chains of reduction operations, in program order from top 1842 /// (PHI) to bottom 1843 ReductionChainMap InLoopReductionChains; 1844 1845 /// A Map of inloop reduction operations and their immediate chain operand. 1846 /// FIXME: This can be removed once reductions can be costed correctly in 1847 /// vplan. This was added to allow quick lookup to the inloop operations, 1848 /// without having to loop through InLoopReductionChains. 1849 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1850 1851 /// Returns the expected difference in cost from scalarizing the expression 1852 /// feeding a predicated instruction \p PredInst. The instructions to 1853 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1854 /// non-negative return value implies the expression will be scalarized. 1855 /// Currently, only single-use chains are considered for scalarization. 1856 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1857 ElementCount VF); 1858 1859 /// Collect the instructions that are uniform after vectorization. An 1860 /// instruction is uniform if we represent it with a single scalar value in 1861 /// the vectorized loop corresponding to each vector iteration. Examples of 1862 /// uniform instructions include pointer operands of consecutive or 1863 /// interleaved memory accesses. Note that although uniformity implies an 1864 /// instruction will be scalar, the reverse is not true. In general, a 1865 /// scalarized instruction will be represented by VF scalar values in the 1866 /// vectorized loop, each corresponding to an iteration of the original 1867 /// scalar loop. 1868 void collectLoopUniforms(ElementCount VF); 1869 1870 /// Collect the instructions that are scalar after vectorization. An 1871 /// instruction is scalar if it is known to be uniform or will be scalarized 1872 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1873 /// to the list if they are used by a load/store instruction that is marked as 1874 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1875 /// VF values in the vectorized loop, each corresponding to an iteration of 1876 /// the original scalar loop. 
1877 void collectLoopScalars(ElementCount VF); 1878 1879 /// Keeps cost model vectorization decision and cost for instructions. 1880 /// Right now it is used for memory instructions only. 1881 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1882 std::pair<InstWidening, InstructionCost>>; 1883 1884 DecisionList WideningDecisions; 1885 1886 /// Returns true if \p V is expected to be vectorized and it needs to be 1887 /// extracted. 1888 bool needsExtract(Value *V, ElementCount VF) const { 1889 Instruction *I = dyn_cast<Instruction>(V); 1890 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1891 TheLoop->isLoopInvariant(I)) 1892 return false; 1893 1894 // Assume we can vectorize V (and hence we need extraction) if the 1895 // scalars are not computed yet. This can happen, because it is called 1896 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1897 // the scalars are collected. That should be a safe assumption in most 1898 // cases, because we check if the operands have vectorizable types 1899 // beforehand in LoopVectorizationLegality. 1900 return Scalars.find(VF) == Scalars.end() || 1901 !isScalarAfterVectorization(I, VF); 1902 }; 1903 1904 /// Returns a range containing only operands needing to be extracted. 1905 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1906 ElementCount VF) const { 1907 return SmallVector<Value *, 4>(make_filter_range( 1908 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1909 } 1910 1911 /// Determines if we have the infrastructure to vectorize loop \p L and its 1912 /// epilogue, assuming the main loop is vectorized by \p VF. 1913 bool isCandidateForEpilogueVectorization(const Loop &L, 1914 const ElementCount VF) const; 1915 1916 /// Returns true if epilogue vectorization is considered profitable, and 1917 /// false otherwise. 1918 /// \p VF is the vectorization factor chosen for the original loop. 1919 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1920 1921 public: 1922 /// The loop that we evaluate. 1923 Loop *TheLoop; 1924 1925 /// Predicated scalar evolution analysis. 1926 PredicatedScalarEvolution &PSE; 1927 1928 /// Loop Info analysis. 1929 LoopInfo *LI; 1930 1931 /// Vectorization legality. 1932 LoopVectorizationLegality *Legal; 1933 1934 /// Vector target information. 1935 const TargetTransformInfo &TTI; 1936 1937 /// Target Library Info. 1938 const TargetLibraryInfo *TLI; 1939 1940 /// Demanded bits analysis. 1941 DemandedBits *DB; 1942 1943 /// Assumption cache. 1944 AssumptionCache *AC; 1945 1946 /// Interface to emit optimization remarks. 1947 OptimizationRemarkEmitter *ORE; 1948 1949 const Function *TheFunction; 1950 1951 /// Loop Vectorize Hint. 1952 const LoopVectorizeHints *Hints; 1953 1954 /// The interleave access information contains groups of interleaved accesses 1955 /// with the same stride and close to each other. 1956 InterleavedAccessInfo &InterleaveInfo; 1957 1958 /// Values to ignore in the cost model. 1959 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1960 1961 /// Values to ignore in the cost model when VF > 1. 1962 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1963 1964 /// All element types found in the loop. 1965 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1966 1967 /// Profitable vector factors. 1968 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1969 }; 1970 } // end namespace llvm 1971 1972 /// Helper struct to manage generating runtime checks for vectorization. 
1973 ///
1974 /// The runtime checks are created up-front in temporary blocks, un-linked from
1975 /// the existing IR, to allow better estimating of their cost. After deciding to
1976 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1977 /// temporary blocks are completely removed.
1978 class GeneratedRTChecks {
1979 /// Basic block which contains the generated SCEV checks, if any.
1980 BasicBlock *SCEVCheckBlock = nullptr;
1981
1982 /// The value representing the result of the generated SCEV checks. If it is
1983 /// nullptr, either no SCEV checks have been generated or they have been used.
1984 Value *SCEVCheckCond = nullptr;
1985
1986 /// Basic block which contains the generated memory runtime checks, if any.
1987 BasicBlock *MemCheckBlock = nullptr;
1988
1989 /// The value representing the result of the generated memory runtime checks.
1990 /// If it is nullptr, either no memory runtime checks have been generated or
1991 /// they have been used.
1992 Value *MemRuntimeCheckCond = nullptr;
1993
1994 DominatorTree *DT;
1995 LoopInfo *LI;
1996
1997 SCEVExpander SCEVExp;
1998 SCEVExpander MemCheckExp;
1999
2000 public:
2001 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
2002 const DataLayout &DL)
2003 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
2004 MemCheckExp(SE, DL, "scev.check") {}
2005
2006 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
2007 /// accurately estimate the cost of the runtime checks. The blocks are
2008 /// un-linked from the IR and are added back during vector code generation. If
2009 /// there is no vector code generation, the check blocks are removed
2010 /// completely.
2011 void Create(Loop *L, const LoopAccessInfo &LAI,
2012 const SCEVUnionPredicate &UnionPred) {
2013
2014 BasicBlock *LoopHeader = L->getHeader();
2015 BasicBlock *Preheader = L->getLoopPreheader();
2016
2017 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
2018 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
2019 // may be used by SCEVExpander. The blocks will be un-linked from their
2020 // predecessors and removed from LI & DT at the end of the function.
2021 if (!UnionPred.isAlwaysTrue()) {
2022 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
2023 nullptr, "vector.scevcheck");
2024
2025 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
2026 &UnionPred, SCEVCheckBlock->getTerminator());
2027 }
2028
2029 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
2030 if (RtPtrChecking.Need) {
2031 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
2032 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
2033 "vector.memcheck");
2034
2035 MemRuntimeCheckCond =
2036 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
2037 RtPtrChecking.getChecks(), MemCheckExp);
2038 assert(MemRuntimeCheckCond &&
2039 "no RT checks generated although RtPtrChecking "
2040 "claimed checks are required");
2041 }
2042
2043 if (!MemCheckBlock && !SCEVCheckBlock)
2044 return;
2045
2046 // Unhook the temporary blocks with the checks, update various places
2047 // accordingly.
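// Illustrative summary (block names taken from the SplitBlock calls above): when both
// kinds of checks are needed, Create() briefly leaves the layout
//   preheader -> vector.scevcheck -> vector.memcheck -> <loop header>
// purely so the cost of the checks can be estimated. The code below detaches the
// check blocks from their predecessors and from LI/DT again; emitSCEVChecks() and
// emitMemRuntimeChecks() re-link them later if we do decide to vectorize.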
2048 if (SCEVCheckBlock) 2049 SCEVCheckBlock->replaceAllUsesWith(Preheader); 2050 if (MemCheckBlock) 2051 MemCheckBlock->replaceAllUsesWith(Preheader); 2052 2053 if (SCEVCheckBlock) { 2054 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2055 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 2056 Preheader->getTerminator()->eraseFromParent(); 2057 } 2058 if (MemCheckBlock) { 2059 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2060 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2061 Preheader->getTerminator()->eraseFromParent(); 2062 } 2063 2064 DT->changeImmediateDominator(LoopHeader, Preheader); 2065 if (MemCheckBlock) { 2066 DT->eraseNode(MemCheckBlock); 2067 LI->removeBlock(MemCheckBlock); 2068 } 2069 if (SCEVCheckBlock) { 2070 DT->eraseNode(SCEVCheckBlock); 2071 LI->removeBlock(SCEVCheckBlock); 2072 } 2073 } 2074 2075 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2076 /// unused. 2077 ~GeneratedRTChecks() { 2078 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 2079 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 2080 if (!SCEVCheckCond) 2081 SCEVCleaner.markResultUsed(); 2082 2083 if (!MemRuntimeCheckCond) 2084 MemCheckCleaner.markResultUsed(); 2085 2086 if (MemRuntimeCheckCond) { 2087 auto &SE = *MemCheckExp.getSE(); 2088 // Memory runtime check generation creates compares that use expanded 2089 // values. Remove them before running the SCEVExpanderCleaners. 2090 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2091 if (MemCheckExp.isInsertedInstruction(&I)) 2092 continue; 2093 SE.forgetValue(&I); 2094 I.eraseFromParent(); 2095 } 2096 } 2097 MemCheckCleaner.cleanup(); 2098 SCEVCleaner.cleanup(); 2099 2100 if (SCEVCheckCond) 2101 SCEVCheckBlock->eraseFromParent(); 2102 if (MemRuntimeCheckCond) 2103 MemCheckBlock->eraseFromParent(); 2104 } 2105 2106 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2107 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2108 /// depending on the generated condition. 2109 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2110 BasicBlock *LoopVectorPreHeader, 2111 BasicBlock *LoopExitBlock) { 2112 if (!SCEVCheckCond) 2113 return nullptr; 2114 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2115 if (C->isZero()) 2116 return nullptr; 2117 2118 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2119 2120 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2121 // Create new preheader for vector loop. 2122 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2123 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2124 2125 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2126 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2127 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2128 SCEVCheckBlock); 2129 2130 DT->addNewBlock(SCEVCheckBlock, Pred); 2131 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2132 2133 ReplaceInstWithInst( 2134 SCEVCheckBlock->getTerminator(), 2135 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2136 // Mark the check as used, to prevent it from being removed during cleanup. 2137 SCEVCheckCond = nullptr; 2138 return SCEVCheckBlock; 2139 } 2140 2141 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2142 /// the branches to branch to the vector preheader or \p Bypass, depending on 2143 /// the generated condition. 
2144 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2145 BasicBlock *LoopVectorPreHeader) { 2146 // Check if we generated code that checks in runtime if arrays overlap. 2147 if (!MemRuntimeCheckCond) 2148 return nullptr; 2149 2150 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2151 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2152 MemCheckBlock); 2153 2154 DT->addNewBlock(MemCheckBlock, Pred); 2155 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2156 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2157 2158 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2159 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2160 2161 ReplaceInstWithInst( 2162 MemCheckBlock->getTerminator(), 2163 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2164 MemCheckBlock->getTerminator()->setDebugLoc( 2165 Pred->getTerminator()->getDebugLoc()); 2166 2167 // Mark the check as used, to prevent it from being removed during cleanup. 2168 MemRuntimeCheckCond = nullptr; 2169 return MemCheckBlock; 2170 } 2171 }; 2172 2173 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2174 // vectorization. The loop needs to be annotated with #pragma omp simd 2175 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2176 // vector length information is not provided, vectorization is not considered 2177 // explicit. Interleave hints are not allowed either. These limitations will be 2178 // relaxed in the future. 2179 // Please, note that we are currently forced to abuse the pragma 'clang 2180 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2181 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2182 // provides *explicit vectorization hints* (LV can bypass legal checks and 2183 // assume that vectorization is legal). However, both hints are implemented 2184 // using the same metadata (llvm.loop.vectorize, processed by 2185 // LoopVectorizeHints). This will be fixed in the future when the native IR 2186 // representation for pragma 'omp simd' is introduced. 2187 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2188 OptimizationRemarkEmitter *ORE) { 2189 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2190 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2191 2192 // Only outer loops with an explicit vectorization hint are supported. 2193 // Unannotated outer loops are ignored. 2194 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2195 return false; 2196 2197 Function *Fn = OuterLp->getHeader()->getParent(); 2198 if (!Hints.allowVectorization(Fn, OuterLp, 2199 true /*VectorizeOnlyWhenForced*/)) { 2200 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2201 return false; 2202 } 2203 2204 if (Hints.getInterleave() > 1) { 2205 // TODO: Interleave support is future work. 2206 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2207 "outer loops.\n"); 2208 Hints.emitRemarkWithHints(); 2209 return false; 2210 } 2211 2212 return true; 2213 } 2214 2215 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2216 OptimizationRemarkEmitter *ORE, 2217 SmallVectorImpl<Loop *> &V) { 2218 // Collect inner loops and outer loops without irreducible control flow. For 2219 // now, only collect outer loops that have explicit vectorization hints. If we 2220 // are stress testing the VPlan H-CFG construction, we collect the outermost 2221 // loop of every loop nest. 
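// Illustrative example (assumed loop nest, not from the source): given
//   for (i) { for (j) { ... } }
// only the innermost j-loop is collected by default. The outer i-loop is collected
// instead, and recursion stops, only when the VPlan-native path is enabled and the
// loop carries an explicit vectorization hint, or when stress-testing the VPlan
// H-CFG construction.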
2222 if (L.isInnermost() || VPlanBuildStressTest || 2223 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2224 LoopBlocksRPO RPOT(&L); 2225 RPOT.perform(LI); 2226 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2227 V.push_back(&L); 2228 // TODO: Collect inner loops inside marked outer loops in case 2229 // vectorization fails for the outer loop. Do not invoke 2230 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2231 // already known to be reducible. We can use an inherited attribute for 2232 // that. 2233 return; 2234 } 2235 } 2236 for (Loop *InnerL : L) 2237 collectSupportedLoops(*InnerL, LI, ORE, V); 2238 } 2239 2240 namespace { 2241 2242 /// The LoopVectorize Pass. 2243 struct LoopVectorize : public FunctionPass { 2244 /// Pass identification, replacement for typeid 2245 static char ID; 2246 2247 LoopVectorizePass Impl; 2248 2249 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2250 bool VectorizeOnlyWhenForced = false) 2251 : FunctionPass(ID), 2252 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2253 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2254 } 2255 2256 bool runOnFunction(Function &F) override { 2257 if (skipFunction(F)) 2258 return false; 2259 2260 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2261 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2262 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2263 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2264 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2265 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2266 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2267 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2268 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2269 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2270 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2271 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2272 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2273 2274 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2275 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2276 2277 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2278 GetLAA, *ORE, PSI).MadeAnyChange; 2279 } 2280 2281 void getAnalysisUsage(AnalysisUsage &AU) const override { 2282 AU.addRequired<AssumptionCacheTracker>(); 2283 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2284 AU.addRequired<DominatorTreeWrapperPass>(); 2285 AU.addRequired<LoopInfoWrapperPass>(); 2286 AU.addRequired<ScalarEvolutionWrapperPass>(); 2287 AU.addRequired<TargetTransformInfoWrapperPass>(); 2288 AU.addRequired<AAResultsWrapperPass>(); 2289 AU.addRequired<LoopAccessLegacyAnalysis>(); 2290 AU.addRequired<DemandedBitsWrapperPass>(); 2291 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2292 AU.addRequired<InjectTLIMappingsLegacy>(); 2293 2294 // We currently do not preserve loopinfo/dominator analyses with outer loop 2295 // vectorization. Until this is addressed, mark these analyses as preserved 2296 // only for non-VPlan-native path. 2297 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
2298 if (!EnableVPlanNativePath) { 2299 AU.addPreserved<LoopInfoWrapperPass>(); 2300 AU.addPreserved<DominatorTreeWrapperPass>(); 2301 } 2302 2303 AU.addPreserved<BasicAAWrapperPass>(); 2304 AU.addPreserved<GlobalsAAWrapperPass>(); 2305 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2306 } 2307 }; 2308 2309 } // end anonymous namespace 2310 2311 //===----------------------------------------------------------------------===// 2312 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2313 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2314 //===----------------------------------------------------------------------===// 2315 2316 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2317 // We need to place the broadcast of invariant variables outside the loop, 2318 // but only if it's proven safe to do so. Else, broadcast will be inside 2319 // vector loop body. 2320 Instruction *Instr = dyn_cast<Instruction>(V); 2321 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2322 (!Instr || 2323 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2324 // Place the code for broadcasting invariant variables in the new preheader. 2325 IRBuilder<>::InsertPointGuard Guard(Builder); 2326 if (SafeToHoist) 2327 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2328 2329 // Broadcast the scalar into all locations in the vector. 2330 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2331 2332 return Shuf; 2333 } 2334 2335 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2336 const InductionDescriptor &II, Value *Step, Value *Start, 2337 Instruction *EntryVal, VPValue *Def, VPTransformState &State) { 2338 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2339 "Expected either an induction phi-node or a truncate of it!"); 2340 2341 // Construct the initial value of the vector IV in the vector loop preheader 2342 auto CurrIP = Builder.saveIP(); 2343 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2344 if (isa<TruncInst>(EntryVal)) { 2345 assert(Start->getType()->isIntegerTy() && 2346 "Truncation requires an integer type"); 2347 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2348 Step = Builder.CreateTrunc(Step, TruncType); 2349 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2350 } 2351 2352 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 2353 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2354 Value *SteppedStart = 2355 getStepVector(SplatStart, Zero, Step, II.getInductionOpcode()); 2356 2357 // We create vector phi nodes for both integer and floating-point induction 2358 // variables. Here, we determine the kind of arithmetic we will perform. 2359 Instruction::BinaryOps AddOp; 2360 Instruction::BinaryOps MulOp; 2361 if (Step->getType()->isIntegerTy()) { 2362 AddOp = Instruction::Add; 2363 MulOp = Instruction::Mul; 2364 } else { 2365 AddOp = II.getInductionOpcode(); 2366 MulOp = Instruction::FMul; 2367 } 2368 2369 // Multiply the vectorization factor by the step using integer or 2370 // floating-point arithmetic as appropriate. 2371 Type *StepType = Step->getType(); 2372 Value *RuntimeVF; 2373 if (Step->getType()->isFloatingPointTy()) 2374 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, VF); 2375 else 2376 RuntimeVF = getRuntimeVF(Builder, StepType, VF); 2377 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2378 2379 // Create a vector splat to use in the induction update. 
2380 // 2381 // FIXME: If the step is non-constant, we create the vector splat with 2382 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2383 // handle a constant vector splat. 2384 Value *SplatVF = isa<Constant>(Mul) 2385 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2386 : Builder.CreateVectorSplat(VF, Mul); 2387 Builder.restoreIP(CurrIP); 2388 2389 // We may need to add the step a number of times, depending on the unroll 2390 // factor. The last of those goes into the PHI. 2391 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2392 &*LoopVectorBody->getFirstInsertionPt()); 2393 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2394 Instruction *LastInduction = VecInd; 2395 for (unsigned Part = 0; Part < UF; ++Part) { 2396 State.set(Def, LastInduction, Part); 2397 2398 if (isa<TruncInst>(EntryVal)) 2399 addMetadata(LastInduction, EntryVal); 2400 2401 LastInduction = cast<Instruction>( 2402 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2403 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2404 } 2405 2406 // Move the last step to the end of the latch block. This ensures consistent 2407 // placement of all induction updates. 2408 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2409 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2410 auto *ICmp = cast<Instruction>(Br->getCondition()); 2411 LastInduction->moveBefore(ICmp); 2412 LastInduction->setName("vec.ind.next"); 2413 2414 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2415 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2416 } 2417 2418 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2419 return Cost->isScalarAfterVectorization(I, VF) || 2420 Cost->isProfitableToScalarize(I, VF); 2421 } 2422 2423 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2424 if (shouldScalarizeInstruction(IV)) 2425 return true; 2426 auto isScalarInst = [&](User *U) -> bool { 2427 auto *I = cast<Instruction>(U); 2428 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2429 }; 2430 return llvm::any_of(IV->users(), isScalarInst); 2431 } 2432 2433 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, 2434 const InductionDescriptor &ID, 2435 Value *Start, TruncInst *Trunc, 2436 VPValue *Def, 2437 VPTransformState &State) { 2438 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2439 "Primary induction variable must have an integer type"); 2440 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2441 2442 // The value from the original loop to which we are mapping the new induction 2443 // variable. 2444 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2445 2446 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2447 2448 // Generate code for the induction step. Note that induction steps are 2449 // required to be loop-invariant 2450 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2451 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2452 "Induction step should be loop invariant"); 2453 if (PSE.getSE()->isSCEVable(IV->getType())) { 2454 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2455 return Exp.expandCodeFor(Step, Step->getType(), 2456 LoopVectorPreHeader->getTerminator()); 2457 } 2458 return cast<SCEVUnknown>(Step)->getValue(); 2459 }; 2460 2461 // The scalar value to broadcast. This is derived from the canonical 2462 // induction variable. 
If a truncation type is given, truncate the canonical 2463 // induction variable and step. Otherwise, derive these values from the 2464 // induction descriptor. 2465 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2466 Value *ScalarIV = Induction; 2467 if (IV != OldInduction) { 2468 ScalarIV = IV->getType()->isIntegerTy() 2469 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2470 : Builder.CreateCast(Instruction::SIToFP, Induction, 2471 IV->getType()); 2472 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2473 ScalarIV->setName("offset.idx"); 2474 } 2475 if (Trunc) { 2476 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2477 assert(Step->getType()->isIntegerTy() && 2478 "Truncation requires an integer step"); 2479 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2480 Step = Builder.CreateTrunc(Step, TruncType); 2481 } 2482 return ScalarIV; 2483 }; 2484 2485 // Create the vector values from the scalar IV, in the absence of creating a 2486 // vector IV. 2487 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2488 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2489 for (unsigned Part = 0; Part < UF; ++Part) { 2490 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2491 Value *StartIdx; 2492 if (Step->getType()->isFloatingPointTy()) 2493 StartIdx = getRuntimeVFAsFloat(Builder, Step->getType(), VF * Part); 2494 else 2495 StartIdx = getRuntimeVF(Builder, Step->getType(), VF * Part); 2496 2497 Value *EntryPart = 2498 getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode()); 2499 State.set(Def, EntryPart, Part); 2500 if (Trunc) 2501 addMetadata(EntryPart, Trunc); 2502 } 2503 }; 2504 2505 // Fast-math-flags propagate from the original induction instruction. 2506 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2507 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2508 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2509 2510 // Now do the actual transformations, and start with creating the step value. 2511 Value *Step = CreateStepValue(ID.getStep()); 2512 if (VF.isZero() || VF.isScalar()) { 2513 Value *ScalarIV = CreateScalarIV(Step); 2514 CreateSplatIV(ScalarIV, Step); 2515 return; 2516 } 2517 2518 // Determine if we want a scalar version of the induction variable. This is 2519 // true if the induction variable itself is not widened, or if it has at 2520 // least one user in the loop that is not widened. 2521 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2522 if (!NeedsScalarIV) { 2523 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); 2524 return; 2525 } 2526 2527 // Try to create a new independent vector induction variable. If we can't 2528 // create the phi node, we will splat the scalar induction variable in each 2529 // loop iteration. 2530 if (!shouldScalarizeInstruction(EntryVal)) { 2531 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); 2532 Value *ScalarIV = CreateScalarIV(Step); 2533 // Create scalar steps that can be used by instructions we will later 2534 // scalarize. Note that the addition of the scalar steps will not increase 2535 // the number of instructions in the loop in the common case prior to 2536 // InstCombine. We will be trading one vector extract for each scalar step. 2537 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); 2538 return; 2539 } 2540 2541 // All IV users are scalar instructions, so only emit a scalar IV, not a 2542 // vectorised IV. 
Except when we tail-fold, then the splat IV feeds the 2543 // predicate used by the masked loads/stores. 2544 Value *ScalarIV = CreateScalarIV(Step); 2545 if (!Cost->isScalarEpilogueAllowed()) 2546 CreateSplatIV(ScalarIV, Step); 2547 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); 2548 } 2549 2550 Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, 2551 Value *Step, 2552 Instruction::BinaryOps BinOp) { 2553 // Create and check the types. 2554 auto *ValVTy = cast<VectorType>(Val->getType()); 2555 ElementCount VLen = ValVTy->getElementCount(); 2556 2557 Type *STy = Val->getType()->getScalarType(); 2558 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2559 "Induction Step must be an integer or FP"); 2560 assert(Step->getType() == STy && "Step has wrong type"); 2561 2562 SmallVector<Constant *, 8> Indices; 2563 2564 // Create a vector of consecutive numbers from zero to VF. 2565 VectorType *InitVecValVTy = ValVTy; 2566 Type *InitVecValSTy = STy; 2567 if (STy->isFloatingPointTy()) { 2568 InitVecValSTy = 2569 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2570 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2571 } 2572 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2573 2574 // Splat the StartIdx 2575 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2576 2577 if (STy->isIntegerTy()) { 2578 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2579 Step = Builder.CreateVectorSplat(VLen, Step); 2580 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2581 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2582 // which can be found from the original scalar operations. 2583 Step = Builder.CreateMul(InitVec, Step); 2584 return Builder.CreateAdd(Val, Step, "induction"); 2585 } 2586 2587 // Floating point induction. 2588 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2589 "Binary Opcode should be specified for FP induction"); 2590 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2591 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2592 2593 Step = Builder.CreateVectorSplat(VLen, Step); 2594 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2595 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2596 } 2597 2598 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2599 Instruction *EntryVal, 2600 const InductionDescriptor &ID, 2601 VPValue *Def, 2602 VPTransformState &State) { 2603 // We shouldn't have to build scalar steps if we aren't vectorizing. 2604 assert(VF.isVector() && "VF should be greater than one"); 2605 // Get the value type and ensure it and the step have the same integer type. 2606 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2607 assert(ScalarIVTy == Step->getType() && 2608 "Val and Step should have the same type"); 2609 2610 // We build scalar steps for both integer and floating-point induction 2611 // variables. Here, we determine the kind of arithmetic we will perform. 2612 Instruction::BinaryOps AddOp; 2613 Instruction::BinaryOps MulOp; 2614 if (ScalarIVTy->isIntegerTy()) { 2615 AddOp = Instruction::Add; 2616 MulOp = Instruction::Mul; 2617 } else { 2618 AddOp = ID.getInductionOpcode(); 2619 MulOp = Instruction::FMul; 2620 } 2621 2622 // Determine the number of scalars we need to generate for each unroll 2623 // iteration. If EntryVal is uniform, we only need to generate the first 2624 // lane. Otherwise, we generate all VF values. 
2625 bool IsUniform = 2626 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); 2627 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); 2628 // Compute the scalar steps and save the results in State. 2629 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2630 ScalarIVTy->getScalarSizeInBits()); 2631 Type *VecIVTy = nullptr; 2632 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2633 if (!IsUniform && VF.isScalable()) { 2634 VecIVTy = VectorType::get(ScalarIVTy, VF); 2635 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); 2636 SplatStep = Builder.CreateVectorSplat(VF, Step); 2637 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); 2638 } 2639 2640 for (unsigned Part = 0; Part < UF; ++Part) { 2641 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, VF, Part); 2642 2643 if (!IsUniform && VF.isScalable()) { 2644 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); 2645 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2646 if (ScalarIVTy->isFloatingPointTy()) 2647 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2648 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2649 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2650 State.set(Def, Add, Part); 2651 // It's useful to record the lane values too for the known minimum number 2652 // of elements so we do those below. This improves the code quality when 2653 // trying to extract the first element, for example. 2654 } 2655 2656 if (ScalarIVTy->isFloatingPointTy()) 2657 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2658 2659 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2660 Value *StartIdx = Builder.CreateBinOp( 2661 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2662 // The step returned by `createStepForVF` is a runtime-evaluated value 2663 // when VF is scalable. Otherwise, it should be folded into a Constant. 2664 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2665 "Expected StartIdx to be folded to a constant when VF is not " 2666 "scalable"); 2667 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2668 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2669 State.set(Def, Add, VPIteration(Part, Lane)); 2670 } 2671 } 2672 } 2673 2674 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2675 const VPIteration &Instance, 2676 VPTransformState &State) { 2677 Value *ScalarInst = State.get(Def, Instance); 2678 Value *VectorValue = State.get(Def, Instance.Part); 2679 VectorValue = Builder.CreateInsertElement( 2680 VectorValue, ScalarInst, 2681 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2682 State.set(Def, VectorValue, Instance.Part); 2683 } 2684 2685 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2686 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2687 return Builder.CreateVectorReverse(Vec, "reverse"); 2688 } 2689 2690 // Return whether we allow using masked interleave-groups (for dealing with 2691 // strided loads/stores that reside in predicated blocks, or for dealing 2692 // with gaps). 2693 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2694 // If an override option has been passed in for interleaved accesses, use it. 2695 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2696 return EnableMaskedInterleavedMemAccesses; 2697 2698 return TTI.enableMaskedInterleavedAccessVectorization(); 2699 } 2700 2701 // Try to vectorize the interleave group that \p Instr belongs to. 2702 // 2703 // E.g. 
Translate following interleaved load group (factor = 3): 2704 // for (i = 0; i < N; i+=3) { 2705 // R = Pic[i]; // Member of index 0 2706 // G = Pic[i+1]; // Member of index 1 2707 // B = Pic[i+2]; // Member of index 2 2708 // ... // do something to R, G, B 2709 // } 2710 // To: 2711 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2712 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2713 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2714 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2715 // 2716 // Or translate following interleaved store group (factor = 3): 2717 // for (i = 0; i < N; i+=3) { 2718 // ... do something to R, G, B 2719 // Pic[i] = R; // Member of index 0 2720 // Pic[i+1] = G; // Member of index 1 2721 // Pic[i+2] = B; // Member of index 2 2722 // } 2723 // To: 2724 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2725 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2726 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2727 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2728 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2729 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2730 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2731 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2732 VPValue *BlockInMask) { 2733 Instruction *Instr = Group->getInsertPos(); 2734 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2735 2736 // Prepare for the vector type of the interleaved load/store. 2737 Type *ScalarTy = getLoadStoreType(Instr); 2738 unsigned InterleaveFactor = Group->getFactor(); 2739 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2740 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2741 2742 // Prepare for the new pointers. 2743 SmallVector<Value *, 2> AddrParts; 2744 unsigned Index = Group->getIndex(Instr); 2745 2746 // TODO: extend the masked interleaved-group support to reversed access. 2747 assert((!BlockInMask || !Group->isReverse()) && 2748 "Reversed masked interleave-group not supported."); 2749 2750 // If the group is reverse, adjust the index to refer to the last vector lane 2751 // instead of the first. We adjust the index from the first vector lane, 2752 // rather than directly getting the pointer for lane VF - 1, because the 2753 // pointer operand of the interleaved access is supposed to be uniform. For 2754 // uniform instructions, we're only required to generate a value for the 2755 // first vector lane in each unroll iteration. 2756 if (Group->isReverse()) 2757 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2758 2759 for (unsigned Part = 0; Part < UF; Part++) { 2760 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2761 setDebugLocFromInst(AddrPart); 2762 2763 // Notice current instruction could be any index. Need to adjust the address 2764 // to the member of index 0. 2765 // 2766 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2767 // b = A[i]; // Member of index 0 2768 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2769 // 2770 // E.g. A[i+1] = a; // Member of index 1 2771 // A[i] = b; // Member of index 0 2772 // A[i+2] = c; // Member of index 2 (Current instruction) 2773 // Current pointer is pointed to A[i+2], adjust it to A[i]. 
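// Concretely (illustrative only; factor = 3, current member index = 2), the
// adjustment below amounts to:
//   %adjusted = getelementptr i32, i32* %cur, i32 -2 ; now points at member 0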
2774 2775 bool InBounds = false; 2776 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2777 InBounds = gep->isInBounds(); 2778 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2779 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2780 2781 // Cast to the vector pointer type. 2782 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2783 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2784 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2785 } 2786 2787 setDebugLocFromInst(Instr); 2788 Value *PoisonVec = PoisonValue::get(VecTy); 2789 2790 Value *MaskForGaps = nullptr; 2791 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2792 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2793 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2794 } 2795 2796 // Vectorize the interleaved load group. 2797 if (isa<LoadInst>(Instr)) { 2798 // For each unroll part, create a wide load for the group. 2799 SmallVector<Value *, 2> NewLoads; 2800 for (unsigned Part = 0; Part < UF; Part++) { 2801 Instruction *NewLoad; 2802 if (BlockInMask || MaskForGaps) { 2803 assert(useMaskedInterleavedAccesses(*TTI) && 2804 "masked interleaved groups are not allowed."); 2805 Value *GroupMask = MaskForGaps; 2806 if (BlockInMask) { 2807 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2808 Value *ShuffledMask = Builder.CreateShuffleVector( 2809 BlockInMaskPart, 2810 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2811 "interleaved.mask"); 2812 GroupMask = MaskForGaps 2813 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2814 MaskForGaps) 2815 : ShuffledMask; 2816 } 2817 NewLoad = 2818 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2819 GroupMask, PoisonVec, "wide.masked.vec"); 2820 } 2821 else 2822 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2823 Group->getAlign(), "wide.vec"); 2824 Group->addMetadata(NewLoad); 2825 NewLoads.push_back(NewLoad); 2826 } 2827 2828 // For each member in the group, shuffle out the appropriate data from the 2829 // wide loads. 2830 unsigned J = 0; 2831 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2832 Instruction *Member = Group->getMember(I); 2833 2834 // Skip the gaps in the group. 2835 if (!Member) 2836 continue; 2837 2838 auto StrideMask = 2839 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2840 for (unsigned Part = 0; Part < UF; Part++) { 2841 Value *StridedVec = Builder.CreateShuffleVector( 2842 NewLoads[Part], StrideMask, "strided.vec"); 2843 2844 // If this member has different type, cast the result type. 2845 if (Member->getType() != ScalarTy) { 2846 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2847 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2848 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2849 } 2850 2851 if (Group->isReverse()) 2852 StridedVec = reverseVector(StridedVec); 2853 2854 State.set(VPDefs[J], StridedVec, Part); 2855 } 2856 ++J; 2857 } 2858 return; 2859 } 2860 2861 // The sub vector type for current instruction. 2862 auto *SubVT = VectorType::get(ScalarTy, VF); 2863 2864 // Vectorize the interleaved store group. 
2865 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2866 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2867 "masked interleaved groups are not allowed."); 2868 assert((!MaskForGaps || !VF.isScalable()) && 2869 "masking gaps for scalable vectors is not yet supported."); 2870 for (unsigned Part = 0; Part < UF; Part++) { 2871 // Collect the stored vector from each member. 2872 SmallVector<Value *, 4> StoredVecs; 2873 for (unsigned i = 0; i < InterleaveFactor; i++) { 2874 assert((Group->getMember(i) || MaskForGaps) && 2875 "Fail to get a member from an interleaved store group"); 2876 Instruction *Member = Group->getMember(i); 2877 2878 // Skip the gaps in the group. 2879 if (!Member) { 2880 Value *Undef = PoisonValue::get(SubVT); 2881 StoredVecs.push_back(Undef); 2882 continue; 2883 } 2884 2885 Value *StoredVec = State.get(StoredValues[i], Part); 2886 2887 if (Group->isReverse()) 2888 StoredVec = reverseVector(StoredVec); 2889 2890 // If this member has different type, cast it to a unified type. 2891 2892 if (StoredVec->getType() != SubVT) 2893 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2894 2895 StoredVecs.push_back(StoredVec); 2896 } 2897 2898 // Concatenate all vectors into a wide vector. 2899 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2900 2901 // Interleave the elements in the wide vector. 2902 Value *IVec = Builder.CreateShuffleVector( 2903 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2904 "interleaved.vec"); 2905 2906 Instruction *NewStoreInstr; 2907 if (BlockInMask || MaskForGaps) { 2908 Value *GroupMask = MaskForGaps; 2909 if (BlockInMask) { 2910 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2911 Value *ShuffledMask = Builder.CreateShuffleVector( 2912 BlockInMaskPart, 2913 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2914 "interleaved.mask"); 2915 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2916 ShuffledMask, MaskForGaps) 2917 : ShuffledMask; 2918 } 2919 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2920 Group->getAlign(), GroupMask); 2921 } else 2922 NewStoreInstr = 2923 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2924 2925 Group->addMetadata(NewStoreInstr); 2926 } 2927 } 2928 2929 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2930 VPReplicateRecipe *RepRecipe, 2931 const VPIteration &Instance, 2932 bool IfPredicateInstr, 2933 VPTransformState &State) { 2934 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2935 2936 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2937 // the first lane and part. 2938 if (isa<NoAliasScopeDeclInst>(Instr)) 2939 if (!Instance.isFirstIteration()) 2940 return; 2941 2942 setDebugLocFromInst(Instr); 2943 2944 // Does this instruction return a value ? 2945 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2946 2947 Instruction *Cloned = Instr->clone(); 2948 if (!IsVoidRetTy) 2949 Cloned->setName(Instr->getName() + ".cloned"); 2950 2951 // If the scalarized instruction contributes to the address computation of a 2952 // widen masked load/store which was in a basic block that needed predication 2953 // and is not predicated after vectorization, we can't propagate 2954 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2955 // instruction could feed a poison value to the base address of the widen 2956 // load/store. 
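// For example (illustrative): a scalarized "getelementptr inbounds" that
// feeds the address of a masked wide load must have "inbounds" dropped,
// because lanes that were predicated out in the original loop may now
// compute an out-of-bounds (poison-generating) address even though they are
// never actually accessed.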
2957 if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0) 2958 Cloned->dropPoisonGeneratingFlags(); 2959 2960 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 2961 Builder.GetInsertPoint()); 2962 // Replace the operands of the cloned instructions with their scalar 2963 // equivalents in the new loop. 2964 for (auto &I : enumerate(RepRecipe->operands())) { 2965 auto InputInstance = Instance; 2966 VPValue *Operand = I.value(); 2967 if (State.Plan->isUniformAfterVectorization(Operand)) 2968 InputInstance.Lane = VPLane::getFirstLane(); 2969 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2970 } 2971 addNewMetadata(Cloned, Instr); 2972 2973 // Place the cloned scalar in the new loop. 2974 Builder.Insert(Cloned); 2975 2976 State.set(RepRecipe, Cloned, Instance); 2977 2978 // If we just cloned a new assumption, add it the assumption cache. 2979 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2980 AC->registerAssumption(II); 2981 2982 // End if-block. 2983 if (IfPredicateInstr) 2984 PredicatedInstructions.push_back(Cloned); 2985 } 2986 2987 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2988 Value *End, Value *Step, 2989 Instruction *DL) { 2990 BasicBlock *Header = L->getHeader(); 2991 BasicBlock *Latch = L->getLoopLatch(); 2992 // As we're just creating this loop, it's possible no latch exists 2993 // yet. If so, use the header as this will be a single block loop. 2994 if (!Latch) 2995 Latch = Header; 2996 2997 IRBuilder<> B(&*Header->getFirstInsertionPt()); 2998 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2999 setDebugLocFromInst(OldInst, &B); 3000 auto *Induction = B.CreatePHI(Start->getType(), 2, "index"); 3001 3002 B.SetInsertPoint(Latch->getTerminator()); 3003 setDebugLocFromInst(OldInst, &B); 3004 3005 // Create i+1 and fill the PHINode. 3006 // 3007 // If the tail is not folded, we know that End - Start >= Step (either 3008 // statically or through the minimum iteration checks). We also know that both 3009 // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV + 3010 // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned 3011 // overflows and we can mark the induction increment as NUW. 3012 Value *Next = B.CreateAdd(Induction, Step, "index.next", 3013 /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false); 3014 Induction->addIncoming(Start, L->getLoopPreheader()); 3015 Induction->addIncoming(Next, Latch); 3016 // Create the compare. 3017 Value *ICmp = B.CreateICmpEQ(Next, End); 3018 B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3019 3020 // Now we have two terminators. Remove the old one from the block. 3021 Latch->getTerminator()->eraseFromParent(); 3022 3023 return Induction; 3024 } 3025 3026 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3027 if (TripCount) 3028 return TripCount; 3029 3030 assert(L && "Create Trip Count for null loop."); 3031 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3032 // Find the loop boundaries. 3033 ScalarEvolution *SE = PSE.getSE(); 3034 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3035 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3036 "Invalid loop count"); 3037 3038 Type *IdxTy = Legal->getWidestInductionType(); 3039 assert(IdxTy && "No type for induction"); 3040 3041 // The exit count might have the type of i64 while the phi is i32. This can 3042 // happen if we have an induction variable that is sign extended before the 3043 // compare. 
The only way that we get a backedge taken count is that the 3044 // induction variable was signed and as such will not overflow. In such a case 3045 // truncation is legal. 3046 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3047 IdxTy->getPrimitiveSizeInBits()) 3048 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3049 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3050 3051 // Get the total trip count from the count by adding 1. 3052 const SCEV *ExitCount = SE->getAddExpr( 3053 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3054 3055 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3056 3057 // Expand the trip count and place the new instructions in the preheader. 3058 // Notice that the pre-header does not change, only the loop body. 3059 SCEVExpander Exp(*SE, DL, "induction"); 3060 3061 // Count holds the overall loop count (N). 3062 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3063 L->getLoopPreheader()->getTerminator()); 3064 3065 if (TripCount->getType()->isPointerTy()) 3066 TripCount = 3067 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3068 L->getLoopPreheader()->getTerminator()); 3069 3070 return TripCount; 3071 } 3072 3073 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3074 if (VectorTripCount) 3075 return VectorTripCount; 3076 3077 Value *TC = getOrCreateTripCount(L); 3078 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3079 3080 Type *Ty = TC->getType(); 3081 // This is where we can make the step a runtime constant. 3082 Value *Step = createStepForVF(Builder, Ty, VF, UF); 3083 3084 // If the tail is to be folded by masking, round the number of iterations N 3085 // up to a multiple of Step instead of rounding down. This is done by first 3086 // adding Step-1 and then rounding down. Note that it's ok if this addition 3087 // overflows: the vector induction variable will eventually wrap to zero given 3088 // that it starts at zero and its Step is a power of two; the loop will then 3089 // exit, with the last early-exit vector comparison also producing all-true. 3090 if (Cost->foldTailByMasking()) { 3091 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3092 "VF*UF must be a power of 2 when folding tail by masking"); 3093 assert(!VF.isScalable() && 3094 "Tail folding not yet supported for scalable vectors"); 3095 TC = Builder.CreateAdd( 3096 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3097 } 3098 3099 // Now we need to generate the expression for the part of the loop that the 3100 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3101 // iterations are not required for correctness, or N - Step, otherwise. Step 3102 // is equal to the vectorization factor (number of SIMD elements) times the 3103 // unroll factor (number of SIMD instructions). 3104 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3105 3106 // There are cases where we *must* run at least one iteration in the remainder 3107 // loop. See the cost model for when this can happen. If the step evenly 3108 // divides the trip count, we set the remainder to be equal to the step. If 3109 // the step does not evenly divide the trip count, no adjustment is necessary 3110 // since there will already be scalar iterations. Note that the minimum 3111 // iterations check ensures that N >= Step. 
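// Worked example (illustrative): with N = 10, VF = 4, UF = 1 and no tail
// folding, Step = 4 and N % Step = 2, so the vector loop covers
// N - (N % Step) = 8 iterations and the remaining 2 run in the scalar loop.
// If instead N = 8 and a scalar epilogue is required, the remainder would be
// 0, so it is bumped up to Step = 4: the vector loop covers 4 iterations and
// the epilogue the other 4.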
3112 if (Cost->requiresScalarEpilogue(VF)) { 3113 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3114 R = Builder.CreateSelect(IsZero, Step, R); 3115 } 3116 3117 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3118 3119 return VectorTripCount; 3120 } 3121 3122 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3123 const DataLayout &DL) { 3124 // Verify that V is a vector type with same number of elements as DstVTy. 3125 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3126 unsigned VF = DstFVTy->getNumElements(); 3127 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3128 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3129 Type *SrcElemTy = SrcVecTy->getElementType(); 3130 Type *DstElemTy = DstFVTy->getElementType(); 3131 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3132 "Vector elements must have same size"); 3133 3134 // Do a direct cast if element types are castable. 3135 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3136 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3137 } 3138 // V cannot be directly casted to desired vector type. 3139 // May happen when V is a floating point vector but DstVTy is a vector of 3140 // pointers or vice-versa. Handle this using a two-step bitcast using an 3141 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3142 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3143 "Only one type should be a pointer type"); 3144 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3145 "Only one type should be a floating point type"); 3146 Type *IntTy = 3147 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3148 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3149 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3150 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3151 } 3152 3153 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3154 BasicBlock *Bypass) { 3155 Value *Count = getOrCreateTripCount(L); 3156 // Reuse existing vector loop preheader for TC checks. 3157 // Note that new preheader block is generated for vector loop. 3158 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3159 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3160 3161 // Generate code to check if the loop's trip count is less than VF * UF, or 3162 // equal to it in case a scalar epilogue is required; this implies that the 3163 // vector trip count is zero. This check also covers the case where adding one 3164 // to the backedge-taken count overflowed leading to an incorrect trip count 3165 // of zero. In this case we will also jump to the scalar loop. 3166 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3167 : ICmpInst::ICMP_ULT; 3168 3169 // If tail is to be folded, vector loop takes care of all iterations. 3170 Value *CheckMinIters = Builder.getFalse(); 3171 if (!Cost->foldTailByMasking()) { 3172 Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); 3173 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3174 } 3175 // Create new preheader for vector loop. 
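// The resulting control flow is roughly (illustrative; block names match the
// ones created below): vector.ph is split off TCCheckBlock, whose terminator
// becomes
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
// so trip counts smaller than VF * UF bypass the vector loop entirely.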
3176 LoopVectorPreHeader = 3177 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3178 "vector.ph"); 3179 3180 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3181 DT->getNode(Bypass)->getIDom()) && 3182 "TC check is expected to dominate Bypass"); 3183 3184 // Update dominator for Bypass & LoopExit (if needed). 3185 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3186 if (!Cost->requiresScalarEpilogue(VF)) 3187 // If there is an epilogue which must run, there's no edge from the 3188 // middle block to exit blocks and thus no need to update the immediate 3189 // dominator of the exit blocks. 3190 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3191 3192 ReplaceInstWithInst( 3193 TCCheckBlock->getTerminator(), 3194 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3195 LoopBypassBlocks.push_back(TCCheckBlock); 3196 } 3197 3198 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3199 3200 BasicBlock *const SCEVCheckBlock = 3201 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3202 if (!SCEVCheckBlock) 3203 return nullptr; 3204 3205 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3206 (OptForSizeBasedOnProfile && 3207 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3208 "Cannot SCEV check stride or overflow when optimizing for size"); 3209 3210 3211 // Update dominator only if this is first RT check. 3212 if (LoopBypassBlocks.empty()) { 3213 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3214 if (!Cost->requiresScalarEpilogue(VF)) 3215 // If there is an epilogue which must run, there's no edge from the 3216 // middle block to exit blocks and thus no need to update the immediate 3217 // dominator of the exit blocks. 3218 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3219 } 3220 3221 LoopBypassBlocks.push_back(SCEVCheckBlock); 3222 AddedSafetyChecks = true; 3223 return SCEVCheckBlock; 3224 } 3225 3226 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3227 BasicBlock *Bypass) { 3228 // VPlan-native path does not do any analysis for runtime checks currently. 3229 if (EnableVPlanNativePath) 3230 return nullptr; 3231 3232 BasicBlock *const MemCheckBlock = 3233 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3234 3235 // Check if we generated code that checks in runtime if arrays overlap. We put 3236 // the checks into a separate block to make the more common case of few 3237 // elements faster. 3238 if (!MemCheckBlock) 3239 return nullptr; 3240 3241 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3242 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3243 "Cannot emit memory checks when optimizing for size, unless forced " 3244 "to vectorize."); 3245 ORE->emit([&]() { 3246 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3247 L->getStartLoc(), L->getHeader()) 3248 << "Code-size may be reduced by not forcing " 3249 "vectorization, or by source-code modifications " 3250 "eliminating the need for runtime checks " 3251 "(e.g., adding 'restrict')."; 3252 }); 3253 } 3254 3255 LoopBypassBlocks.push_back(MemCheckBlock); 3256 3257 AddedSafetyChecks = true; 3258 3259 // We currently don't use LoopVersioning for the actual loop cloning but we 3260 // still use it to add the noalias metadata. 
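// (Illustrative) The effect is that memory accesses in the vector loop get
// !alias.scope / !noalias metadata over fresh scope domains, so later passes
// may assume the runtime-checked pointers do not alias within the vectorized
// loop.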
3261 LVer = std::make_unique<LoopVersioning>( 3262 *Legal->getLAI(), 3263 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3264 DT, PSE.getSE()); 3265 LVer->prepareNoAliasMetadata(); 3266 return MemCheckBlock; 3267 } 3268 3269 Value *InnerLoopVectorizer::emitTransformedIndex( 3270 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3271 const InductionDescriptor &ID) const { 3272 3273 SCEVExpander Exp(*SE, DL, "induction"); 3274 auto Step = ID.getStep(); 3275 auto StartValue = ID.getStartValue(); 3276 assert(Index->getType()->getScalarType() == Step->getType() && 3277 "Index scalar type does not match StepValue type"); 3278 3279 // Note: the IR at this point is broken. We cannot use SE to create any new 3280 // SCEV and then expand it, hoping that SCEV's simplification will give us 3281 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3282 // lead to various SCEV crashes. So all we can do is to use builder and rely 3283 // on InstCombine for future simplifications. Here we handle some trivial 3284 // cases only. 3285 auto CreateAdd = [&B](Value *X, Value *Y) { 3286 assert(X->getType() == Y->getType() && "Types don't match!"); 3287 if (auto *CX = dyn_cast<ConstantInt>(X)) 3288 if (CX->isZero()) 3289 return Y; 3290 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3291 if (CY->isZero()) 3292 return X; 3293 return B.CreateAdd(X, Y); 3294 }; 3295 3296 // We allow X to be a vector type, in which case Y will potentially be 3297 // splatted into a vector with the same element count. 3298 auto CreateMul = [&B](Value *X, Value *Y) { 3299 assert(X->getType()->getScalarType() == Y->getType() && 3300 "Types don't match!"); 3301 if (auto *CX = dyn_cast<ConstantInt>(X)) 3302 if (CX->isOne()) 3303 return Y; 3304 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3305 if (CY->isOne()) 3306 return X; 3307 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 3308 if (XVTy && !isa<VectorType>(Y->getType())) 3309 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 3310 return B.CreateMul(X, Y); 3311 }; 3312 3313 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3314 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3315 // the DomTree is not kept up-to-date for additional blocks generated in the 3316 // vector loop. By using the header as insertion point, we guarantee that the 3317 // expanded instructions dominate all their uses. 
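// As a rough sketch of what follows (illustrative): an integer induction
// with start S and step C transforms an index i into S + C * i, a pointer
// induction produces a GEP of S by C * i elements, and an FP induction
// applies the original fadd/fsub to S and C * i.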
3318 auto GetInsertPoint = [this, &B]() {
3319 BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3320 if (InsertBB != LoopVectorBody &&
3321 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3322 return LoopVectorBody->getTerminator();
3323 return &*B.GetInsertPoint();
3324 };
3325
3326 switch (ID.getKind()) {
3327 case InductionDescriptor::IK_IntInduction: {
3328 assert(!isa<VectorType>(Index->getType()) &&
3329 "Vector indices not supported for integer inductions yet");
3330 assert(Index->getType() == StartValue->getType() &&
3331 "Index type does not match StartValue type");
3332 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3333 return B.CreateSub(StartValue, Index);
3334 auto *Offset = CreateMul(
3335 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3336 return CreateAdd(StartValue, Offset);
3337 }
3338 case InductionDescriptor::IK_PtrInduction: {
3339 assert(isa<SCEVConstant>(Step) &&
3340 "Expected constant step for pointer induction");
3341 return B.CreateGEP(
3342 ID.getElementType(), StartValue,
3343 CreateMul(Index,
3344 Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
3345 GetInsertPoint())));
3346 }
3347 case InductionDescriptor::IK_FpInduction: {
3348 assert(!isa<VectorType>(Index->getType()) &&
3349 "Vector indices not supported for FP inductions yet");
3350 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3351 auto InductionBinOp = ID.getInductionBinOp();
3352 assert(InductionBinOp &&
3353 (InductionBinOp->getOpcode() == Instruction::FAdd ||
3354 InductionBinOp->getOpcode() == Instruction::FSub) &&
3355 "Original bin op should be defined for FP induction");
3356
3357 Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3358 Value *MulExp = B.CreateFMul(StepValue, Index);
3359 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3360 "induction");
3361 }
3362 case InductionDescriptor::IK_NoInduction:
3363 return nullptr;
3364 }
3365 llvm_unreachable("invalid enum");
3366 }
3367
3368 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3369 LoopScalarBody = OrigLoop->getHeader();
3370 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3371 assert(LoopVectorPreHeader && "Invalid loop structure");
3372 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3373 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3374 "multiple exit loop without required epilogue?");
3375
3376 LoopMiddleBlock =
3377 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3378 LI, nullptr, Twine(Prefix) + "middle.block");
3379 LoopScalarPreHeader =
3380 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3381 nullptr, Twine(Prefix) + "scalar.ph");
3382
3383 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3384
3385 // Set up the middle block terminator. Two cases:
3386 // 1) If we know that we must execute the scalar epilogue, emit an
3387 // unconditional branch.
3388 // 2) Otherwise, we must have a single unique exit block (due to how we
3389 // implement the multiple exit case). In this case, set up a conditional
3390 // branch from the middle block to the loop scalar preheader, and the
3391 // exit block. completeLoopSkeleton will update the condition to use an
3392 // iteration check, if required to decide whether to execute the remainder.
3393 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3394 BranchInst::Create(LoopScalarPreHeader) :
3395 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3396 Builder.getTrue());
3397 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3398 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3399
3400 // We intentionally don't let SplitBlock update LoopInfo, since
3401 // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3402 // LoopVectorBody is explicitly added to the correct place a few lines later.
3403 LoopVectorBody =
3404 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3405 nullptr, nullptr, Twine(Prefix) + "vector.body");
3406
3407 // Update dominator for loop exit.
3408 if (!Cost->requiresScalarEpilogue(VF))
3409 // If there is an epilogue which must run, there's no edge from the
3410 // middle block to exit blocks and thus no need to update the immediate
3411 // dominator of the exit blocks.
3412 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3413
3414 // Create and register the new vector loop.
3415 Loop *Lp = LI->AllocateLoop();
3416 Loop *ParentLoop = OrigLoop->getParentLoop();
3417
3418 // Insert the new loop into the loop nest and register the new basic blocks
3419 // before calling any utilities such as SCEV that require valid LoopInfo.
3420 if (ParentLoop) {
3421 ParentLoop->addChildLoop(Lp);
3422 } else {
3423 LI->addTopLevelLoop(Lp);
3424 }
3425 Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3426 return Lp;
3427 }
3428
3429 void InnerLoopVectorizer::createInductionResumeValues(
3430 Loop *L, Value *VectorTripCount,
3431 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3432 assert(VectorTripCount && L && "Expected valid arguments");
3433 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3434 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3435 "Inconsistent information about additional bypass.");
3436 // We are going to resume the execution of the scalar loop.
3437 // Go over all of the induction variables that we found and fix the
3438 // PHIs that are left in the scalar version of the loop.
3439 // The starting values of PHI nodes depend on the counter of the last
3440 // iteration in the vectorized loop.
3441 // If we come from a bypass edge then we need to start from the original
3442 // start value.
3443 for (auto &InductionEntry : Legal->getInductionVars()) {
3444 PHINode *OrigPhi = InductionEntry.first;
3445 InductionDescriptor II = InductionEntry.second;
3446
3447 // Create phi nodes to merge from the backedge-taken check block.
3448 PHINode *BCResumeVal =
3449 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3450 LoopScalarPreHeader->getTerminator());
3451 // Copy original phi DL over to the new one.
3452 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3453 Value *&EndValue = IVEndValues[OrigPhi];
3454 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3455 if (OrigPhi == OldInduction) {
3456 // We know what the end value is.
3457 EndValue = VectorTripCount;
3458 } else {
3459 IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3460
3461 // Fast-math-flags propagate from the original induction instruction.
3462 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3463 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3464 3465 Type *StepType = II.getStep()->getType(); 3466 Instruction::CastOps CastOp = 3467 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3468 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3469 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3470 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3471 EndValue->setName("ind.end"); 3472 3473 // Compute the end value for the additional bypass (if applicable). 3474 if (AdditionalBypass.first) { 3475 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3476 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3477 StepType, true); 3478 CRD = 3479 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3480 EndValueFromAdditionalBypass = 3481 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3482 EndValueFromAdditionalBypass->setName("ind.end"); 3483 } 3484 } 3485 // The new PHI merges the original incoming value, in case of a bypass, 3486 // or the value at the end of the vectorized loop. 3487 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3488 3489 // Fix the scalar body counter (PHI node). 3490 // The old induction's phi node in the scalar body needs the truncated 3491 // value. 3492 for (BasicBlock *BB : LoopBypassBlocks) 3493 BCResumeVal->addIncoming(II.getStartValue(), BB); 3494 3495 if (AdditionalBypass.first) 3496 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3497 EndValueFromAdditionalBypass); 3498 3499 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3500 } 3501 } 3502 3503 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3504 MDNode *OrigLoopID) { 3505 assert(L && "Expected valid loop."); 3506 3507 // The trip counts should be cached by now. 3508 Value *Count = getOrCreateTripCount(L); 3509 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3510 3511 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3512 3513 // Add a check in the middle block to see if we have completed 3514 // all of the iterations in the first vector loop. Three cases: 3515 // 1) If we require a scalar epilogue, there is no conditional branch as 3516 // we unconditionally branch to the scalar preheader. Do nothing. 3517 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3518 // Thus if tail is to be folded, we know we don't need to run the 3519 // remainder and we can use the previous value for the condition (true). 3520 // 3) Otherwise, construct a runtime check. 3521 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3522 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3523 Count, VectorTripCount, "cmp.n", 3524 LoopMiddleBlock->getTerminator()); 3525 3526 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3527 // of the corresponding compare because they may have ended up with 3528 // different line numbers and we want to avoid awkward line stepping while 3529 // debugging. Eg. if the compare has got a line number inside the loop. 3530 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3531 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3532 } 3533 3534 // Get ready to start creating new instructions into the vectorized body. 
3535 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3536 "Inconsistent vector loop preheader"); 3537 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3538 3539 Optional<MDNode *> VectorizedLoopID = 3540 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3541 LLVMLoopVectorizeFollowupVectorized}); 3542 if (VectorizedLoopID.hasValue()) { 3543 L->setLoopID(VectorizedLoopID.getValue()); 3544 3545 // Do not setAlreadyVectorized if loop attributes have been defined 3546 // explicitly. 3547 return LoopVectorPreHeader; 3548 } 3549 3550 // Keep all loop hints from the original loop on the vector loop (we'll 3551 // replace the vectorizer-specific hints below). 3552 if (MDNode *LID = OrigLoop->getLoopID()) 3553 L->setLoopID(LID); 3554 3555 LoopVectorizeHints Hints(L, true, *ORE); 3556 Hints.setAlreadyVectorized(); 3557 3558 #ifdef EXPENSIVE_CHECKS 3559 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3560 LI->verify(*DT); 3561 #endif 3562 3563 return LoopVectorPreHeader; 3564 } 3565 3566 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3567 /* 3568 In this function we generate a new loop. The new loop will contain 3569 the vectorized instructions while the old loop will continue to run the 3570 scalar remainder. 3571 3572 [ ] <-- loop iteration number check. 3573 / | 3574 / v 3575 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3576 | / | 3577 | / v 3578 || [ ] <-- vector pre header. 3579 |/ | 3580 | v 3581 | [ ] \ 3582 | [ ]_| <-- vector loop. 3583 | | 3584 | v 3585 \ -[ ] <--- middle-block. 3586 \/ | 3587 /\ v 3588 | ->[ ] <--- new preheader. 3589 | | 3590 (opt) v <-- edge from middle to exit iff epilogue is not required. 3591 | [ ] \ 3592 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3593 \ | 3594 \ v 3595 >[ ] <-- exit block(s). 3596 ... 3597 */ 3598 3599 // Get the metadata of the original loop before it gets modified. 3600 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3601 3602 // Workaround! Compute the trip count of the original loop and cache it 3603 // before we start modifying the CFG. This code has a systemic problem 3604 // wherein it tries to run analysis over partially constructed IR; this is 3605 // wrong, and not simply for SCEV. The trip count of the original loop 3606 // simply happens to be prone to hitting this in practice. In theory, we 3607 // can hit the same issue for any SCEV, or ValueTracking query done during 3608 // mutation. See PR49900. 3609 getOrCreateTripCount(OrigLoop); 3610 3611 // Create an empty vector loop, and prepare basic blocks for the runtime 3612 // checks. 3613 Loop *Lp = createVectorLoopSkeleton(""); 3614 3615 // Now, compare the new count to zero. If it is zero skip the vector loop and 3616 // jump to the scalar loop. This check also covers the case where the 3617 // backedge-taken count is uint##_max: adding one to it will overflow leading 3618 // to an incorrect trip count of zero. In this (rare) case we will also jump 3619 // to the scalar loop. 3620 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3621 3622 // Generate the code to check any assumptions that we've made for SCEV 3623 // expressions. 3624 emitSCEVChecks(Lp, LoopScalarPreHeader); 3625 3626 // Generate the code that checks in runtime if arrays overlap. We put the 3627 // checks into a separate block to make the more common case of few elements 3628 // faster. 
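// For example (illustrative): for a loop writing a[i] and reading b[i], the
// emitted block tests whether the address ranges
//   [a, a + N*sizeof(*a)) and [b, b + N*sizeof(*b))
// overlap, and branches to the scalar loop if they do.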
3629 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3630 3631 // Some loops have a single integer induction variable, while other loops 3632 // don't. One example is c++ iterators that often have multiple pointer 3633 // induction variables. In the code below we also support a case where we 3634 // don't have a single induction variable. 3635 // 3636 // We try to obtain an induction variable from the original loop as hard 3637 // as possible. However if we don't find one that: 3638 // - is an integer 3639 // - counts from zero, stepping by one 3640 // - is the size of the widest induction variable type 3641 // then we create a new one. 3642 OldInduction = Legal->getPrimaryInduction(); 3643 Type *IdxTy = Legal->getWidestInductionType(); 3644 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3645 // The loop step is equal to the vectorization factor (num of SIMD elements) 3646 // times the unroll factor (num of SIMD instructions). 3647 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3648 Value *Step = createStepForVF(Builder, IdxTy, VF, UF); 3649 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3650 Induction = 3651 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3652 getDebugLocFromInstOrOperands(OldInduction)); 3653 3654 // Emit phis for the new starting index of the scalar loop. 3655 createInductionResumeValues(Lp, CountRoundDown); 3656 3657 return completeLoopSkeleton(Lp, OrigLoopID); 3658 } 3659 3660 // Fix up external users of the induction variable. At this point, we are 3661 // in LCSSA form, with all external PHIs that use the IV having one input value, 3662 // coming from the remainder loop. We need those PHIs to also have a correct 3663 // value for the IV when arriving directly from the middle block. 3664 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3665 const InductionDescriptor &II, 3666 Value *CountRoundDown, Value *EndValue, 3667 BasicBlock *MiddleBlock) { 3668 // There are two kinds of external IV usages - those that use the value 3669 // computed in the last iteration (the PHI) and those that use the penultimate 3670 // value (the value that feeds into the phi from the loop latch). 3671 // We allow both, but they, obviously, have different values. 3672 3673 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3674 3675 DenseMap<Value *, Value *> MissingVals; 3676 3677 // An external user of the last iteration's value should see the value that 3678 // the remainder loop uses to initialize its own IV. 3679 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3680 for (User *U : PostInc->users()) { 3681 Instruction *UI = cast<Instruction>(U); 3682 if (!OrigLoop->contains(UI)) { 3683 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3684 MissingVals[UI] = EndValue; 3685 } 3686 } 3687 3688 // An external user of the penultimate value need to see EndValue - Step. 3689 // The simplest way to get this is to recompute it from the constituent SCEVs, 3690 // that is Start + (Step * (CRD - 1)). 3691 for (User *U : OrigPhi->users()) { 3692 auto *UI = cast<Instruction>(U); 3693 if (!OrigLoop->contains(UI)) { 3694 const DataLayout &DL = 3695 OrigLoop->getHeader()->getModule()->getDataLayout(); 3696 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3697 3698 IRBuilder<> B(MiddleBlock->getTerminator()); 3699 3700 // Fast-math-flags propagate from the original induction instruction. 
3701 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3702 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3703 3704 Value *CountMinusOne = B.CreateSub( 3705 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3706 Value *CMO = 3707 !II.getStep()->getType()->isIntegerTy() 3708 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3709 II.getStep()->getType()) 3710 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3711 CMO->setName("cast.cmo"); 3712 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3713 Escape->setName("ind.escape"); 3714 MissingVals[UI] = Escape; 3715 } 3716 } 3717 3718 for (auto &I : MissingVals) { 3719 PHINode *PHI = cast<PHINode>(I.first); 3720 // One corner case we have to handle is two IVs "chasing" each-other, 3721 // that is %IV2 = phi [...], [ %IV1, %latch ] 3722 // In this case, if IV1 has an external use, we need to avoid adding both 3723 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3724 // don't already have an incoming value for the middle block. 3725 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3726 PHI->addIncoming(I.second, MiddleBlock); 3727 } 3728 } 3729 3730 namespace { 3731 3732 struct CSEDenseMapInfo { 3733 static bool canHandle(const Instruction *I) { 3734 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3735 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3736 } 3737 3738 static inline Instruction *getEmptyKey() { 3739 return DenseMapInfo<Instruction *>::getEmptyKey(); 3740 } 3741 3742 static inline Instruction *getTombstoneKey() { 3743 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3744 } 3745 3746 static unsigned getHashValue(const Instruction *I) { 3747 assert(canHandle(I) && "Unknown instruction!"); 3748 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3749 I->value_op_end())); 3750 } 3751 3752 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3753 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3754 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3755 return LHS == RHS; 3756 return LHS->isIdenticalTo(RHS); 3757 } 3758 }; 3759 3760 } // end anonymous namespace 3761 3762 ///Perform cse of induction variable instructions. 3763 static void cse(BasicBlock *BB) { 3764 // Perform simple cse. 3765 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3766 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3767 if (!CSEDenseMapInfo::canHandle(&In)) 3768 continue; 3769 3770 // Check if we can replace this instruction with any of the 3771 // visited instructions. 3772 if (Instruction *V = CSEMap.lookup(&In)) { 3773 In.replaceAllUsesWith(V); 3774 In.eraseFromParent(); 3775 continue; 3776 } 3777 3778 CSEMap[&In] = &In; 3779 } 3780 } 3781 3782 InstructionCost 3783 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3784 bool &NeedToScalarize) const { 3785 Function *F = CI->getCalledFunction(); 3786 Type *ScalarRetTy = CI->getType(); 3787 SmallVector<Type *, 4> Tys, ScalarTys; 3788 for (auto &ArgOp : CI->args()) 3789 ScalarTys.push_back(ArgOp->getType()); 3790 3791 // Estimate cost of scalarized vector call. The source operands are assumed 3792 // to be vectors, so we need to extract individual elements from there, 3793 // execute VF scalar calls, and then gather the result into the vector return 3794 // value. 
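// Illustratively (numbers made up): with VF = 4, the scalarized cost is
// 4 * ScalarCallCost plus the scalarization overhead (extracting each
// argument lane and inserting each result lane); if the VFDatabase provides
// a vector variant whose call cost is lower, that cost wins and
// NeedToScalarize is cleared.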
3795 InstructionCost ScalarCallCost = 3796 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3797 if (VF.isScalar()) 3798 return ScalarCallCost; 3799 3800 // Compute corresponding vector type for return value and arguments. 3801 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3802 for (Type *ScalarTy : ScalarTys) 3803 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3804 3805 // Compute costs of unpacking argument values for the scalar calls and 3806 // packing the return values to a vector. 3807 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3808 3809 InstructionCost Cost = 3810 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3811 3812 // If we can't emit a vector call for this function, then the currently found 3813 // cost is the cost we need to return. 3814 NeedToScalarize = true; 3815 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3816 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3817 3818 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3819 return Cost; 3820 3821 // If the corresponding vector cost is cheaper, return its cost. 3822 InstructionCost VectorCallCost = 3823 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3824 if (VectorCallCost < Cost) { 3825 NeedToScalarize = false; 3826 Cost = VectorCallCost; 3827 } 3828 return Cost; 3829 } 3830 3831 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3832 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3833 return Elt; 3834 return VectorType::get(Elt, VF); 3835 } 3836 3837 InstructionCost 3838 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3839 ElementCount VF) const { 3840 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3841 assert(ID && "Expected intrinsic call!"); 3842 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3843 FastMathFlags FMF; 3844 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3845 FMF = FPMO->getFastMathFlags(); 3846 3847 SmallVector<const Value *> Arguments(CI->args()); 3848 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3849 SmallVector<Type *> ParamTys; 3850 std::transform(FTy->param_begin(), FTy->param_end(), 3851 std::back_inserter(ParamTys), 3852 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3853 3854 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3855 dyn_cast<IntrinsicInst>(CI)); 3856 return TTI.getIntrinsicInstrCost(CostAttrs, 3857 TargetTransformInfo::TCK_RecipThroughput); 3858 } 3859 3860 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3861 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3862 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3863 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3864 } 3865 3866 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3867 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3868 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3869 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3870 } 3871 3872 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3873 // For every instruction `I` in MinBWs, truncate the operands, create a 3874 // truncated version of `I` and reextend its result. InstCombine runs 3875 // later and will remove any ext/trunc pairs. 
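// For example (illustrative; minimal bit width 8, VF = 4):
//   %r = add <4 x i32> %x, %y
// becomes
//   %xt = trunc <4 x i32> %x to <4 x i8>
//   %yt = trunc <4 x i32> %y to <4 x i8>
//   %rt = add <4 x i8> %xt, %yt
//   %r  = zext <4 x i8> %rt to <4 x i32>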
3876 SmallPtrSet<Value *, 4> Erased; 3877 for (const auto &KV : Cost->getMinimalBitwidths()) { 3878 // If the value wasn't vectorized, we must maintain the original scalar 3879 // type. The absence of the value from State indicates that it 3880 // wasn't vectorized. 3881 // FIXME: Should not rely on getVPValue at this point. 3882 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3883 if (!State.hasAnyVectorValue(Def)) 3884 continue; 3885 for (unsigned Part = 0; Part < UF; ++Part) { 3886 Value *I = State.get(Def, Part); 3887 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3888 continue; 3889 Type *OriginalTy = I->getType(); 3890 Type *ScalarTruncatedTy = 3891 IntegerType::get(OriginalTy->getContext(), KV.second); 3892 auto *TruncatedTy = VectorType::get( 3893 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3894 if (TruncatedTy == OriginalTy) 3895 continue; 3896 3897 IRBuilder<> B(cast<Instruction>(I)); 3898 auto ShrinkOperand = [&](Value *V) -> Value * { 3899 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3900 if (ZI->getSrcTy() == TruncatedTy) 3901 return ZI->getOperand(0); 3902 return B.CreateZExtOrTrunc(V, TruncatedTy); 3903 }; 3904 3905 // The actual instruction modification depends on the instruction type, 3906 // unfortunately. 3907 Value *NewI = nullptr; 3908 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3909 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3910 ShrinkOperand(BO->getOperand(1))); 3911 3912 // Any wrapping introduced by shrinking this operation shouldn't be 3913 // considered undefined behavior. So, we can't unconditionally copy 3914 // arithmetic wrapping flags to NewI. 3915 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3916 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3917 NewI = 3918 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3919 ShrinkOperand(CI->getOperand(1))); 3920 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3921 NewI = B.CreateSelect(SI->getCondition(), 3922 ShrinkOperand(SI->getTrueValue()), 3923 ShrinkOperand(SI->getFalseValue())); 3924 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3925 switch (CI->getOpcode()) { 3926 default: 3927 llvm_unreachable("Unhandled cast!"); 3928 case Instruction::Trunc: 3929 NewI = ShrinkOperand(CI->getOperand(0)); 3930 break; 3931 case Instruction::SExt: 3932 NewI = B.CreateSExtOrTrunc( 3933 CI->getOperand(0), 3934 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3935 break; 3936 case Instruction::ZExt: 3937 NewI = B.CreateZExtOrTrunc( 3938 CI->getOperand(0), 3939 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3940 break; 3941 } 3942 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3943 auto Elements0 = 3944 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3945 auto *O0 = B.CreateZExtOrTrunc( 3946 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3947 auto Elements1 = 3948 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3949 auto *O1 = B.CreateZExtOrTrunc( 3950 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3951 3952 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3953 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3954 // Don't do anything with the operands, just extend the result. 
3955 continue; 3956 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3957 auto Elements = 3958 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3959 auto *O0 = B.CreateZExtOrTrunc( 3960 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3961 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3962 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3963 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3964 auto Elements = 3965 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3966 auto *O0 = B.CreateZExtOrTrunc( 3967 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3968 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3969 } else { 3970 // If we don't know what to do, be conservative and don't do anything. 3971 continue; 3972 } 3973 3974 // Lastly, extend the result. 3975 NewI->takeName(cast<Instruction>(I)); 3976 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3977 I->replaceAllUsesWith(Res); 3978 cast<Instruction>(I)->eraseFromParent(); 3979 Erased.insert(I); 3980 State.reset(Def, Res, Part); 3981 } 3982 } 3983 3984 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3985 for (const auto &KV : Cost->getMinimalBitwidths()) { 3986 // If the value wasn't vectorized, we must maintain the original scalar 3987 // type. The absence of the value from State indicates that it 3988 // wasn't vectorized. 3989 // FIXME: Should not rely on getVPValue at this point. 3990 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3991 if (!State.hasAnyVectorValue(Def)) 3992 continue; 3993 for (unsigned Part = 0; Part < UF; ++Part) { 3994 Value *I = State.get(Def, Part); 3995 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3996 if (Inst && Inst->use_empty()) { 3997 Value *NewI = Inst->getOperand(0); 3998 Inst->eraseFromParent(); 3999 State.reset(Def, NewI, Part); 4000 } 4001 } 4002 } 4003 } 4004 4005 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 4006 // Insert truncates and extends for any truncated instructions as hints to 4007 // InstCombine. 4008 if (VF.isVector()) 4009 truncateToMinimalBitwidths(State); 4010 4011 // Fix widened non-induction PHIs by setting up the PHI operands. 4012 if (OrigPHIsToFix.size()) { 4013 assert(EnableVPlanNativePath && 4014 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 4015 fixNonInductionPHIs(State); 4016 } 4017 4018 // At this point every instruction in the original loop is widened to a 4019 // vector form. Now we need to fix the recurrences in the loop. These PHI 4020 // nodes are currently empty because we did not want to introduce cycles. 4021 // This is the second stage of vectorizing recurrences. 4022 fixCrossIterationPHIs(State); 4023 4024 // Forget the original basic block. 4025 PSE.getSE()->forgetLoop(OrigLoop); 4026 4027 // If we inserted an edge from the middle block to the unique exit block, 4028 // update uses outside the loop (phis) to account for the newly inserted 4029 // edge. 4030 if (!Cost->requiresScalarEpilogue(VF)) { 4031 // Fix-up external users of the induction variables. 4032 for (auto &Entry : Legal->getInductionVars()) 4033 fixupIVUsers(Entry.first, Entry.second, 4034 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4035 IVEndValues[Entry.first], LoopMiddleBlock); 4036 4037 fixLCSSAPHIs(State); 4038 } 4039 4040 for (Instruction *PI : PredicatedInstructions) 4041 sinkScalarOperands(&*PI); 4042 4043 // Remove redundant induction instructions. 
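// E.g. (illustrative): identical "extractelement %vec, i32 0" instructions
// produced while scalarizing address computations are collapsed into one by
// the simple CSE below.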
4044 cse(LoopVectorBody);
4045
4046 // Set/update profile weights for the vector and remainder loops as original
4047 // loop iterations are now distributed among them. Note that the original loop,
4048 // represented by LoopScalarBody, becomes the remainder loop after vectorization.
4049 //
4050 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4051 // end up with a slightly less accurate result, but that should be OK since
4052 // the profile is not inherently precise anyway. Note also that a possible
4053 // bypass of the vector code caused by legality checks is ignored, assigning
4054 // all the weight to the vector loop, optimistically.
4055 //
4056 // For scalable vectorization we can't know at compile time how many iterations
4057 // of the loop are handled in one vector iteration, so instead assume a pessimistic
4058 // vscale of '1'.
4059 setProfileInfoAfterUnrolling(
4060 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4061 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4062 }
4063
4064 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4065 // In order to support recurrences we need to be able to vectorize Phi nodes.
4066 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4067 // stage #2: We now need to fix the recurrences by adding incoming edges to
4068 // the currently empty PHI nodes. At this point every instruction in the
4069 // original loop is widened to a vector form so we can use them to construct
4070 // the incoming edges.
4071 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4072 for (VPRecipeBase &R : Header->phis()) {
4073 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
4074 fixReduction(ReductionPhi, State);
4075 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
4076 fixFirstOrderRecurrence(FOR, State);
4077 }
4078 }
4079
4080 void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
4081 VPTransformState &State) {
4082 // This is the second phase of vectorizing first-order recurrences. An
4083 // overview of the transformation is described below. Suppose we have the
4084 // following loop.
4085 //
4086 // for (int i = 0; i < n; ++i)
4087 // b[i] = a[i] - a[i - 1];
4088 //
4089 // There is a first-order recurrence on "a". For this loop, the shorthand
4090 // scalar IR looks like:
4091 //
4092 // scalar.ph:
4093 // s_init = a[-1]
4094 // br scalar.body
4095 //
4096 // scalar.body:
4097 // i = phi [0, scalar.ph], [i+1, scalar.body]
4098 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4099 // s2 = a[i]
4100 // b[i] = s2 - s1
4101 // br cond, scalar.body, ...
4102 //
4103 // In this example, s1 is a recurrence because its value depends on the
4104 // previous iteration. In the first phase of vectorization, we created a
4105 // vector phi v1 for s1. We now complete the vectorization and produce the
4106 // shorthand vector IR shown below (for VF = 4, UF = 1).
4107 // 4108 // vector.ph: 4109 // v_init = vector(..., ..., ..., a[-1]) 4110 // br vector.body 4111 // 4112 // vector.body 4113 // i = phi [0, vector.ph], [i+4, vector.body] 4114 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4115 // v2 = a[i, i+1, i+2, i+3]; 4116 // v3 = vector(v1(3), v2(0, 1, 2)) 4117 // b[i, i+1, i+2, i+3] = v2 - v3 4118 // br cond, vector.body, middle.block 4119 // 4120 // middle.block: 4121 // x = v2(3) 4122 // br scalar.ph 4123 // 4124 // scalar.ph: 4125 // s_init = phi [x, middle.block], [a[-1], otherwise] 4126 // br scalar.body 4127 // 4128 // After execution completes the vector loop, we extract the next value of 4129 // the recurrence (x) to use as the initial value in the scalar loop. 4130 4131 // Extract the last vector element in the middle block. This will be the 4132 // initial value for the recurrence when jumping to the scalar loop. 4133 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4134 Value *Incoming = State.get(PreviousDef, UF - 1); 4135 auto *ExtractForScalar = Incoming; 4136 auto *IdxTy = Builder.getInt32Ty(); 4137 if (VF.isVector()) { 4138 auto *One = ConstantInt::get(IdxTy, 1); 4139 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4140 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4141 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4142 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4143 "vector.recur.extract"); 4144 } 4145 // Extract the second last element in the middle block if the 4146 // Phi is used outside the loop. We need to extract the phi itself 4147 // and not the last element (the phi update in the current iteration). This 4148 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4149 // when the scalar loop is not run at all. 4150 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4151 if (VF.isVector()) { 4152 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4153 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4154 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4155 Incoming, Idx, "vector.recur.extract.for.phi"); 4156 } else if (UF > 1) 4157 // When loop is unrolled without vectorizing, initialize 4158 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4159 // of `Incoming`. This is analogous to the vectorized case above: extracting 4160 // the second last element when VF > 1. 4161 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4162 4163 // Fix the initial value of the original recurrence in the scalar loop. 4164 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4165 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 4166 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4167 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 4168 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4169 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4170 Start->addIncoming(Incoming, BB); 4171 } 4172 4173 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4174 Phi->setName("scalar.recur"); 4175 4176 // Finally, fix users of the recurrence outside the loop. The users will need 4177 // either the last value of the scalar recurrence or the last value of the 4178 // vector recurrence we extracted in the middle block. Since the loop is in 4179 // LCSSA form, we just need to find all the phi nodes for the original scalar 4180 // recurrence in the exit block, and then add an edge for the middle block. 
4181 // Note that LCSSA does not imply single entry when the original scalar loop
4182 // had multiple exiting edges (as we always run the last iteration in the
4183 // scalar epilogue); in that case, there is no edge from middle to exit and
4184 // thus no phis which need to be updated.
4185 if (!Cost->requiresScalarEpilogue(VF))
4186 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4187 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
4188 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4189 }
4190
4191 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4192 VPTransformState &State) {
4193 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4194 // Get its reduction variable descriptor.
4195 assert(Legal->isReductionVariable(OrigPhi) &&
4196 "Unable to find the reduction variable");
4197 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4198
4199 RecurKind RK = RdxDesc.getRecurrenceKind();
4200 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4201 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4202 setDebugLocFromInst(ReductionStartValue);
4203
4204 VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4205 // This is the vector-clone of the value that leaves the loop.
4206 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4207
4208 // Wrap flags are in general invalid after vectorization, clear them.
4209 clearReductionWrapFlags(RdxDesc, State);
4210
4211 // Before each round, move the insertion point right between
4212 // the PHIs and the values we are going to write.
4213 // This allows us to write both PHINodes and the extractelement
4214 // instructions.
4215 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4216
4217 setDebugLocFromInst(LoopExitInst);
4218
4219 Type *PhiTy = OrigPhi->getType();
4220 // If the tail is folded by masking, the vector value to leave the loop should
4221 // be a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4222 // instead of the former. For an inloop reduction the reduction will already
4223 // be predicated, and does not need to be handled here.
4224 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
4225 for (unsigned Part = 0; Part < UF; ++Part) {
4226 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4227 Value *Sel = nullptr;
4228 for (User *U : VecLoopExitInst->users()) {
4229 if (isa<SelectInst>(U)) {
4230 assert(!Sel && "Reduction exit feeding two selects");
4231 Sel = U;
4232 } else
4233 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4234 }
4235 assert(Sel && "Reduction exit feeds no select");
4236 State.reset(LoopExitInstDef, Sel, Part);
4237
4238 // If the target can create a predicated operator for the reduction at no
4239 // extra cost in the loop (for example a predicated vadd), it can be
4240 // cheaper for the select to remain in the loop than be sunk out of it,
4241 // and so use the select value for the phi instead of the old
4242 // LoopExitValue.
4243 if (PreferPredicatedReductionSelect || 4244 TTI->preferPredicatedReductionSelect( 4245 RdxDesc.getOpcode(), PhiTy, 4246 TargetTransformInfo::ReductionFlags())) { 4247 auto *VecRdxPhi = 4248 cast<PHINode>(State.get(PhiR, Part)); 4249 VecRdxPhi->setIncomingValueForBlock( 4250 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4251 } 4252 } 4253 } 4254 4255 // If the vector reduction can be performed in a smaller type, we truncate 4256 // then extend the loop exit value to enable InstCombine to evaluate the 4257 // entire expression in the smaller type. 4258 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4259 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4260 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4261 Builder.SetInsertPoint( 4262 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4263 VectorParts RdxParts(UF); 4264 for (unsigned Part = 0; Part < UF; ++Part) { 4265 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4266 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4267 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4268 : Builder.CreateZExt(Trunc, VecTy); 4269 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 4270 if (U != Trunc) { 4271 U->replaceUsesOfWith(RdxParts[Part], Extnd); 4272 RdxParts[Part] = Extnd; 4273 } 4274 } 4275 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4276 for (unsigned Part = 0; Part < UF; ++Part) { 4277 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4278 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4279 } 4280 } 4281 4282 // Reduce all of the unrolled parts into a single vector. 4283 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4284 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4285 4286 // The middle block terminator has already been assigned a DebugLoc here (the 4287 // OrigLoop's single latch terminator). We want the whole middle block to 4288 // appear to execute on this line because: (a) it is all compiler generated, 4289 // (b) these instructions are always executed after evaluating the latch 4290 // conditional branch, and (c) other passes may add new predecessors which 4291 // terminate on this line. This is the easiest way to ensure we don't 4292 // accidentally cause an extra step back into the loop while debugging. 4293 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4294 if (PhiR->isOrdered()) 4295 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4296 else { 4297 // Floating-point operations should have some FMF to enable the reduction. 4298 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4299 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4300 for (unsigned Part = 1; Part < UF; ++Part) { 4301 Value *RdxPart = State.get(LoopExitInstDef, Part); 4302 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4303 ReducedPartRdx = Builder.CreateBinOp( 4304 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4305 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4306 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4307 ReducedPartRdx, RdxPart); 4308 else 4309 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4310 } 4311 } 4312 4313 // Create the reduction after the loop. Note that inloop reductions create the 4314 // target reduction in the loop using a Reduction recipe. 
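// For an out-of-loop reduction this emits a single horizontal reduction of
// the accumulated vector in the middle block; e.g. an integer add reduction
// at VF = 4 becomes one call to llvm.vector.reduce.add on the accumulated
// <4 x i32> vector (element type only for illustration).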
4315 if (VF.isVector() && !PhiR->isInLoop()) {
4316 ReducedPartRdx =
4317 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
4318 // If the reduction can be performed in a smaller type, we need to extend
4319 // the reduction to the wider type before we branch to the original loop.
4320 if (PhiTy != RdxDesc.getRecurrenceType())
4321 ReducedPartRdx = RdxDesc.isSigned()
4322 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4323 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4324 }
4325
4326 // Create a phi node that merges control-flow from the backedge-taken check
4327 // block and the middle block.
4328 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4329 LoopScalarPreHeader->getTerminator());
4330 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4331 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4332 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4333
4334 // Now, we need to fix the users of the reduction variable
4335 // inside and outside of the scalar remainder loop.
4336
4337 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4338 // in the exit blocks. See comment on analogous loop in
4339 // fixFirstOrderRecurrence for a more complete explanation of the logic.
4340 if (!Cost->requiresScalarEpilogue(VF))
4341 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4342 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
4343 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4344
4345 // Fix the scalar loop reduction variable with the incoming reduction sum
4346 // from the vector body and from the backedge value.
4347 int IncomingEdgeBlockIdx =
4348 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4349 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4350 // Pick the other block.
4351 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4352 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4353 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4354 }
4355
4356 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4357 VPTransformState &State) {
4358 RecurKind RK = RdxDesc.getRecurrenceKind();
4359 if (RK != RecurKind::Add && RK != RecurKind::Mul)
4360 return;
4361
4362 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4363 assert(LoopExitInstr && "null loop exit instruction");
4364 SmallVector<Instruction *, 8> Worklist;
4365 SmallPtrSet<Instruction *, 8> Visited;
4366 Worklist.push_back(LoopExitInstr);
4367 Visited.insert(LoopExitInstr);
4368
4369 while (!Worklist.empty()) {
4370 Instruction *Cur = Worklist.pop_back_val();
4371 if (isa<OverflowingBinaryOperator>(Cur))
4372 for (unsigned Part = 0; Part < UF; ++Part) {
4373 // FIXME: Should not rely on getVPValue at this point.
4374 Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
4375 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4376 }
4377
4378 for (User *U : Cur->users()) {
4379 Instruction *UI = cast<Instruction>(U);
4380 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4381 Visited.insert(UI).second)
4382 Worklist.push_back(UI);
4383 }
4384 }
4385 }
4386
4387 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4388 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4389 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4390 // Some phis were already hand-updated by the reduction and recurrence
4391 // code above; leave them alone.
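// (fixReduction() and fixFirstOrderRecurrence() added their incoming value
// from LoopMiddleBlock directly.)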
4392 continue; 4393 4394 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4395 // Non-instruction incoming values will have only one value. 4396 4397 VPLane Lane = VPLane::getFirstLane(); 4398 if (isa<Instruction>(IncomingValue) && 4399 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4400 VF)) 4401 Lane = VPLane::getLastLaneForVF(VF); 4402 4403 // Can be a loop invariant incoming value or the last scalar value to be 4404 // extracted from the vectorized loop. 4405 // FIXME: Should not rely on getVPValue at this point. 4406 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4407 Value *lastIncomingValue = 4408 OrigLoop->isLoopInvariant(IncomingValue) 4409 ? IncomingValue 4410 : State.get(State.Plan->getVPValue(IncomingValue, true), 4411 VPIteration(UF - 1, Lane)); 4412 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4413 } 4414 } 4415 4416 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4417 // The basic block and loop containing the predicated instruction. 4418 auto *PredBB = PredInst->getParent(); 4419 auto *VectorLoop = LI->getLoopFor(PredBB); 4420 4421 // Initialize a worklist with the operands of the predicated instruction. 4422 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4423 4424 // Holds instructions that we need to analyze again. An instruction may be 4425 // reanalyzed if we don't yet know if we can sink it or not. 4426 SmallVector<Instruction *, 8> InstsToReanalyze; 4427 4428 // Returns true if a given use occurs in the predicated block. Phi nodes use 4429 // their operands in their corresponding predecessor blocks. 4430 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4431 auto *I = cast<Instruction>(U.getUser()); 4432 BasicBlock *BB = I->getParent(); 4433 if (auto *Phi = dyn_cast<PHINode>(I)) 4434 BB = Phi->getIncomingBlock( 4435 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4436 return BB == PredBB; 4437 }; 4438 4439 // Iteratively sink the scalarized operands of the predicated instruction 4440 // into the block we created for it. When an instruction is sunk, it's 4441 // operands are then added to the worklist. The algorithm ends after one pass 4442 // through the worklist doesn't sink a single instruction. 4443 bool Changed; 4444 do { 4445 // Add the instructions that need to be reanalyzed to the worklist, and 4446 // reset the changed indicator. 4447 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4448 InstsToReanalyze.clear(); 4449 Changed = false; 4450 4451 while (!Worklist.empty()) { 4452 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4453 4454 // We can't sink an instruction if it is a phi node, is not in the loop, 4455 // or may have side effects. 4456 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4457 I->mayHaveSideEffects()) 4458 continue; 4459 4460 // If the instruction is already in PredBB, check if we can sink its 4461 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4462 // sinking the scalar instruction I, hence it appears in PredBB; but it 4463 // may have failed to sink I's operands (recursively), which we try 4464 // (again) here. 4465 if (I->getParent() == PredBB) { 4466 Worklist.insert(I->op_begin(), I->op_end()); 4467 continue; 4468 } 4469 4470 // It's legal to sink the instruction if all its uses occur in the 4471 // predicated block. Otherwise, there's nothing to do yet, and we may 4472 // need to reanalyze the instruction. 
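// A use outside PredBB may itself become sinkable on a later pass over the
// worklist, so the instruction is queued for reanalysis rather than
// rejected outright.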
4473 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4474 InstsToReanalyze.push_back(I); 4475 continue; 4476 } 4477 4478 // Move the instruction to the beginning of the predicated block, and add 4479 // it's operands to the worklist. 4480 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4481 Worklist.insert(I->op_begin(), I->op_end()); 4482 4483 // The sinking may have enabled other instructions to be sunk, so we will 4484 // need to iterate. 4485 Changed = true; 4486 } 4487 } while (Changed); 4488 } 4489 4490 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4491 for (PHINode *OrigPhi : OrigPHIsToFix) { 4492 VPWidenPHIRecipe *VPPhi = 4493 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4494 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4495 // Make sure the builder has a valid insert point. 4496 Builder.SetInsertPoint(NewPhi); 4497 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4498 VPValue *Inc = VPPhi->getIncomingValue(i); 4499 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4500 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4501 } 4502 } 4503 } 4504 4505 bool InnerLoopVectorizer::useOrderedReductions( 4506 const RecurrenceDescriptor &RdxDesc) { 4507 return Cost->useOrderedReductions(RdxDesc); 4508 } 4509 4510 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4511 VPWidenPHIRecipe *PhiR, 4512 VPTransformState &State) { 4513 PHINode *P = cast<PHINode>(PN); 4514 if (EnableVPlanNativePath) { 4515 // Currently we enter here in the VPlan-native path for non-induction 4516 // PHIs where all control flow is uniform. We simply widen these PHIs. 4517 // Create a vector phi with no operands - the vector phi operands will be 4518 // set at the end of vector code generation. 4519 Type *VecTy = (State.VF.isScalar()) 4520 ? PN->getType() 4521 : VectorType::get(PN->getType(), State.VF); 4522 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4523 State.set(PhiR, VecPhi, 0); 4524 OrigPHIsToFix.push_back(P); 4525 4526 return; 4527 } 4528 4529 assert(PN->getParent() == OrigLoop->getHeader() && 4530 "Non-header phis should have been handled elsewhere"); 4531 4532 // In order to support recurrences we need to be able to vectorize Phi nodes. 4533 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4534 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4535 // this value when we vectorize all of the instructions that use the PHI. 4536 4537 assert(!Legal->isReductionVariable(P) && 4538 "reductions should be handled elsewhere"); 4539 4540 setDebugLocFromInst(P); 4541 4542 // This PHINode must be an induction variable. 4543 // Make sure that we know about it. 4544 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4545 4546 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4547 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4548 4549 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4550 // which can be found from the original scalar operations. 4551 switch (II.getKind()) { 4552 case InductionDescriptor::IK_NoInduction: 4553 llvm_unreachable("Unknown induction"); 4554 case InductionDescriptor::IK_IntInduction: 4555 case InductionDescriptor::IK_FpInduction: 4556 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4557 case InductionDescriptor::IK_PtrInduction: { 4558 // Handle the pointer induction variable case. 
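// Two strategies are used below: if the phi is scalar after vectorization,
// a scalar GEP index is computed per part and lane from the canonical
// induction (Induction); otherwise a pointer phi is built that is advanced
// by Step * VF * UF elements each vector iteration, with per-part vector
// GEPs (base + <Step * 0, Step * 1, ...>) providing the widened values.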
4559 assert(P->getType()->isPointerTy() && "Unexpected type."); 4560 4561 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4562 // This is the normalized GEP that starts counting at zero. 4563 Value *PtrInd = 4564 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4565 // Determine the number of scalars we need to generate for each unroll 4566 // iteration. If the instruction is uniform, we only need to generate the 4567 // first lane. Otherwise, we generate all VF values. 4568 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4569 assert((IsUniform || !State.VF.isScalable()) && 4570 "Cannot scalarize a scalable VF"); 4571 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 4572 4573 for (unsigned Part = 0; Part < UF; ++Part) { 4574 Value *PartStart = 4575 createStepForVF(Builder, PtrInd->getType(), VF, Part); 4576 4577 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4578 Value *Idx = Builder.CreateAdd( 4579 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4580 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4581 Value *SclrGep = 4582 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4583 SclrGep->setName("next.gep"); 4584 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4585 } 4586 } 4587 return; 4588 } 4589 assert(isa<SCEVConstant>(II.getStep()) && 4590 "Induction step not a SCEV constant!"); 4591 Type *PhiType = II.getStep()->getType(); 4592 4593 // Build a pointer phi 4594 Value *ScalarStartValue = II.getStartValue(); 4595 Type *ScStValueType = ScalarStartValue->getType(); 4596 PHINode *NewPointerPhi = 4597 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4598 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4599 4600 // A pointer induction, performed by using a gep 4601 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4602 Instruction *InductionLoc = LoopLatch->getTerminator(); 4603 const SCEV *ScalarStep = II.getStep(); 4604 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4605 Value *ScalarStepValue = 4606 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4607 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4608 Value *NumUnrolledElems = 4609 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4610 Value *InductionGEP = GetElementPtrInst::Create( 4611 II.getElementType(), NewPointerPhi, 4612 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4613 InductionLoc); 4614 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4615 4616 // Create UF many actual address geps that use the pointer 4617 // phi as base and a vectorized version of the step value 4618 // (<step*0, ..., step*N>) as offset. 4619 for (unsigned Part = 0; Part < State.UF; ++Part) { 4620 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4621 Value *StartOffsetScalar = 4622 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4623 Value *StartOffset = 4624 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4625 // Create a vector of consecutive numbers from zero to VF. 
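// E.g. for a fixed VF of 4 and Part == 1, StartOffset becomes
// <4, 4, 4, 4> + <0, 1, 2, 3> = <4, 5, 6, 7>, which is then scaled by the
// step below to form the per-lane pointer offsets.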
4626 StartOffset = 4627 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4628 4629 Value *GEP = Builder.CreateGEP( 4630 II.getElementType(), NewPointerPhi, 4631 Builder.CreateMul( 4632 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4633 "vector.gep")); 4634 State.set(PhiR, GEP, Part); 4635 } 4636 } 4637 } 4638 } 4639 4640 /// A helper function for checking whether an integer division-related 4641 /// instruction may divide by zero (in which case it must be predicated if 4642 /// executed conditionally in the scalar code). 4643 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4644 /// Non-zero divisors that are non compile-time constants will not be 4645 /// converted into multiplication, so we will still end up scalarizing 4646 /// the division, but can do so w/o predication. 4647 static bool mayDivideByZero(Instruction &I) { 4648 assert((I.getOpcode() == Instruction::UDiv || 4649 I.getOpcode() == Instruction::SDiv || 4650 I.getOpcode() == Instruction::URem || 4651 I.getOpcode() == Instruction::SRem) && 4652 "Unexpected instruction"); 4653 Value *Divisor = I.getOperand(1); 4654 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4655 return !CInt || CInt->isZero(); 4656 } 4657 4658 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4659 VPUser &ArgOperands, 4660 VPTransformState &State) { 4661 assert(!isa<DbgInfoIntrinsic>(I) && 4662 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4663 setDebugLocFromInst(&I); 4664 4665 Module *M = I.getParent()->getParent()->getParent(); 4666 auto *CI = cast<CallInst>(&I); 4667 4668 SmallVector<Type *, 4> Tys; 4669 for (Value *ArgOperand : CI->args()) 4670 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4671 4672 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4673 4674 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4675 // version of the instruction. 4676 // Is it beneficial to perform intrinsic call compared to lib call? 4677 bool NeedToScalarize = false; 4678 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4679 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4680 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4681 assert((UseVectorIntrinsic || !NeedToScalarize) && 4682 "Instruction should be scalarized elsewhere."); 4683 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4684 "Either the intrinsic cost or vector call cost must be valid"); 4685 4686 for (unsigned Part = 0; Part < UF; ++Part) { 4687 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4688 SmallVector<Value *, 4> Args; 4689 for (auto &I : enumerate(ArgOperands.operands())) { 4690 // Some intrinsics have a scalar argument - don't replace it with a 4691 // vector. 4692 Value *Arg; 4693 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4694 Arg = State.get(I.value(), Part); 4695 else { 4696 Arg = State.get(I.value(), VPIteration(0, 0)); 4697 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 4698 TysForDecl.push_back(Arg->getType()); 4699 } 4700 Args.push_back(Arg); 4701 } 4702 4703 Function *VectorF; 4704 if (UseVectorIntrinsic) { 4705 // Use vector version of the intrinsic. 
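// E.g. a scalar call to llvm.fabs.f32 is widened to a call to
// llvm.fabs.v4f32 when VF = 4; the vector return type set below provides
// the overloaded type for the declaration.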
4706 if (VF.isVector()) 4707 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4708 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4709 assert(VectorF && "Can't retrieve vector intrinsic."); 4710 } else { 4711 // Use vector version of the function call. 4712 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4713 #ifndef NDEBUG 4714 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4715 "Can't create vector function."); 4716 #endif 4717 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4718 } 4719 SmallVector<OperandBundleDef, 1> OpBundles; 4720 CI->getOperandBundlesAsDefs(OpBundles); 4721 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4722 4723 if (isa<FPMathOperator>(V)) 4724 V->copyFastMathFlags(CI); 4725 4726 State.set(Def, V, Part); 4727 addMetadata(V, &I); 4728 } 4729 } 4730 4731 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4732 // We should not collect Scalars more than once per VF. Right now, this 4733 // function is called from collectUniformsAndScalars(), which already does 4734 // this check. Collecting Scalars for VF=1 does not make any sense. 4735 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4736 "This function should not be visited twice for the same VF"); 4737 4738 SmallSetVector<Instruction *, 8> Worklist; 4739 4740 // These sets are used to seed the analysis with pointers used by memory 4741 // accesses that will remain scalar. 4742 SmallSetVector<Instruction *, 8> ScalarPtrs; 4743 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4744 auto *Latch = TheLoop->getLoopLatch(); 4745 4746 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4747 // The pointer operands of loads and stores will be scalar as long as the 4748 // memory access is not a gather or scatter operation. The value operand of a 4749 // store will remain scalar if the store is scalarized. 4750 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4751 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4752 assert(WideningDecision != CM_Unknown && 4753 "Widening decision should be ready at this moment"); 4754 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4755 if (Ptr == Store->getValueOperand()) 4756 return WideningDecision == CM_Scalarize; 4757 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4758 "Ptr is neither a value or pointer operand"); 4759 return WideningDecision != CM_GatherScatter; 4760 }; 4761 4762 // A helper that returns true if the given value is a bitcast or 4763 // getelementptr instruction contained in the loop. 4764 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4765 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4766 isa<GetElementPtrInst>(V)) && 4767 !TheLoop->isLoopInvariant(V); 4768 }; 4769 4770 // A helper that evaluates a memory access's use of a pointer. If the use will 4771 // be a scalar use and the pointer is only used by memory accesses, we place 4772 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4773 // PossibleNonScalarPtrs. 4774 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4775 // We only care about bitcast and getelementptr instructions contained in 4776 // the loop. 4777 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4778 return; 4779 4780 // If the pointer has already been identified as scalar (e.g., if it was 4781 // also identified as uniform), there's nothing to do. 
4782 auto *I = cast<Instruction>(Ptr);
4783 if (Worklist.count(I))
4784 return;
4785
4786 // If the use of the pointer will be a scalar use, and all users of the
4787 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4788 // place the pointer in PossibleNonScalarPtrs.
4789 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4790 return isa<LoadInst>(U) || isa<StoreInst>(U);
4791 }))
4792 ScalarPtrs.insert(I);
4793 else
4794 PossibleNonScalarPtrs.insert(I);
4795 };
4796
4797 // We seed the scalars analysis with three classes of instructions: (1)
4798 // instructions marked uniform-after-vectorization, (2) bitcast,
4799 // getelementptr and (pointer) phi instructions used by memory accesses
4800 // requiring a scalar use, and (3) instructions the cost model has forced
4800 // to be scalar (ForcedScalars).
4801 //
4802 // (1) Add to the worklist all instructions that have been identified as
4803 // uniform-after-vectorization.
4804 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4805
4806 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4807 // memory accesses requiring a scalar use. The pointer operands of loads and
4808 // stores will be scalar as long as the memory access is not a gather or
4809 // scatter operation. The value operand of a store will remain scalar if the
4810 // store is scalarized.
4811 for (auto *BB : TheLoop->blocks())
4812 for (auto &I : *BB) {
4813 if (auto *Load = dyn_cast<LoadInst>(&I)) {
4814 evaluatePtrUse(Load, Load->getPointerOperand());
4815 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4816 evaluatePtrUse(Store, Store->getPointerOperand());
4817 evaluatePtrUse(Store, Store->getValueOperand());
4818 }
4819 }
4820 for (auto *I : ScalarPtrs)
4821 if (!PossibleNonScalarPtrs.count(I)) {
4822 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4823 Worklist.insert(I);
4824 }
4825
4826 // (3) Insert the forced scalars.
4827 // FIXME: Currently widenPHIInstruction() often creates a dead vector
4828 // induction variable when the PHI user is scalarized.
4829 auto ForcedScalar = ForcedScalars.find(VF);
4830 if (ForcedScalar != ForcedScalars.end())
4831 for (auto *I : ForcedScalar->second)
4832 Worklist.insert(I);
4833
4834 // Expand the worklist by looking through any bitcasts and getelementptr
4835 // instructions we've already identified as scalar. This is similar to the
4836 // expansion step in collectLoopUniforms(); however, here we're only
4837 // expanding to include additional bitcasts and getelementptr instructions.
4838 unsigned Idx = 0;
4839 while (Idx != Worklist.size()) {
4840 Instruction *Dst = Worklist[Idx++];
4841 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4842 continue;
4843 auto *Src = cast<Instruction>(Dst->getOperand(0));
4844 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4845 auto *J = cast<Instruction>(U);
4846 return !TheLoop->contains(J) || Worklist.count(J) ||
4847 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4848 isScalarUse(J, Src));
4849 })) {
4850 Worklist.insert(Src);
4851 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4852 }
4853 }
4854
4855 // An induction variable will remain scalar if all users of the induction
4856 // variable and induction variable update remain scalar.
4857 for (auto &Induction : Legal->getInductionVars()) {
4858 auto *Ind = Induction.first;
4859 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4860
4861 // If tail-folding is applied, the primary induction variable will be used
4862 // to feed a vector compare.
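// That compare produces the mask for the folded tail, so the primary
// induction cannot be treated as scalar here.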
4863 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4864 continue; 4865 4866 // Returns true if \p Indvar is a pointer induction that is used directly by 4867 // load/store instruction \p I. 4868 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4869 Instruction *I) { 4870 return Induction.second.getKind() == 4871 InductionDescriptor::IK_PtrInduction && 4872 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4873 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4874 }; 4875 4876 // Determine if all users of the induction variable are scalar after 4877 // vectorization. 4878 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4879 auto *I = cast<Instruction>(U); 4880 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4881 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4882 }); 4883 if (!ScalarInd) 4884 continue; 4885 4886 // Determine if all users of the induction variable update instruction are 4887 // scalar after vectorization. 4888 auto ScalarIndUpdate = 4889 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4890 auto *I = cast<Instruction>(U); 4891 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4892 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4893 }); 4894 if (!ScalarIndUpdate) 4895 continue; 4896 4897 // The induction variable and its update instruction will remain scalar. 4898 Worklist.insert(Ind); 4899 Worklist.insert(IndUpdate); 4900 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4901 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4902 << "\n"); 4903 } 4904 4905 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4906 } 4907 4908 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { 4909 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4910 return false; 4911 switch(I->getOpcode()) { 4912 default: 4913 break; 4914 case Instruction::Load: 4915 case Instruction::Store: { 4916 if (!Legal->isMaskRequired(I)) 4917 return false; 4918 auto *Ptr = getLoadStorePointerOperand(I); 4919 auto *Ty = getLoadStoreType(I); 4920 const Align Alignment = getLoadStoreAlignment(I); 4921 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4922 TTI.isLegalMaskedGather(Ty, Alignment)) 4923 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4924 TTI.isLegalMaskedScatter(Ty, Alignment)); 4925 } 4926 case Instruction::UDiv: 4927 case Instruction::SDiv: 4928 case Instruction::SRem: 4929 case Instruction::URem: 4930 return mayDivideByZero(*I); 4931 } 4932 return false; 4933 } 4934 4935 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4936 Instruction *I, ElementCount VF) { 4937 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4938 assert(getWideningDecision(I, VF) == CM_Unknown && 4939 "Decision should not be set yet."); 4940 auto *Group = getInterleavedAccessGroup(I); 4941 assert(Group && "Must have a group."); 4942 4943 // If the instruction's allocated size doesn't equal it's type size, it 4944 // requires padding and will be scalarized. 4945 auto &DL = I->getModule()->getDataLayout(); 4946 auto *ScalarTy = getLoadStoreType(I); 4947 if (hasIrregularType(ScalarTy, DL)) 4948 return false; 4949 4950 // Check if masking is required. 
4951 // A Group may need masking for one of two reasons: it resides in a block that 4952 // needs predication, or it was decided to use masking to deal with gaps 4953 // (either a gap at the end of a load-access that may result in a speculative 4954 // load, or any gaps in a store-access). 4955 bool PredicatedAccessRequiresMasking = 4956 blockNeedsPredicationForAnyReason(I->getParent()) && 4957 Legal->isMaskRequired(I); 4958 bool LoadAccessWithGapsRequiresEpilogMasking = 4959 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4960 !isScalarEpilogueAllowed(); 4961 bool StoreAccessWithGapsRequiresMasking = 4962 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4963 if (!PredicatedAccessRequiresMasking && 4964 !LoadAccessWithGapsRequiresEpilogMasking && 4965 !StoreAccessWithGapsRequiresMasking) 4966 return true; 4967 4968 // If masked interleaving is required, we expect that the user/target had 4969 // enabled it, because otherwise it either wouldn't have been created or 4970 // it should have been invalidated by the CostModel. 4971 assert(useMaskedInterleavedAccesses(TTI) && 4972 "Masked interleave-groups for predicated accesses are not enabled."); 4973 4974 if (Group->isReverse()) 4975 return false; 4976 4977 auto *Ty = getLoadStoreType(I); 4978 const Align Alignment = getLoadStoreAlignment(I); 4979 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4980 : TTI.isLegalMaskedStore(Ty, Alignment); 4981 } 4982 4983 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4984 Instruction *I, ElementCount VF) { 4985 // Get and ensure we have a valid memory instruction. 4986 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4987 4988 auto *Ptr = getLoadStorePointerOperand(I); 4989 auto *ScalarTy = getLoadStoreType(I); 4990 4991 // In order to be widened, the pointer should be consecutive, first of all. 4992 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4993 return false; 4994 4995 // If the instruction is a store located in a predicated block, it will be 4996 // scalarized. 4997 if (isScalarWithPredication(I)) 4998 return false; 4999 5000 // If the instruction's allocated size doesn't equal it's type size, it 5001 // requires padding and will be scalarized. 5002 auto &DL = I->getModule()->getDataLayout(); 5003 if (hasIrregularType(ScalarTy, DL)) 5004 return false; 5005 5006 return true; 5007 } 5008 5009 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5010 // We should not collect Uniforms more than once per VF. Right now, 5011 // this function is called from collectUniformsAndScalars(), which 5012 // already does this check. Collecting Uniforms for VF=1 does not make any 5013 // sense. 5014 5015 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5016 "This function should not be visited twice for the same VF"); 5017 5018 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5019 // not analyze again. Uniforms.count(VF) will return 1. 5020 Uniforms[VF].clear(); 5021 5022 // We now know that the loop is vectorizable! 5023 // Collect instructions inside the loop that will remain uniform after 5024 // vectorization. 5025 5026 // Global values, params and instructions outside of current loop are out of 5027 // scope. 5028 auto isOutOfScope = [&](Value *V) -> bool { 5029 Instruction *I = dyn_cast<Instruction>(V); 5030 return (!I || !TheLoop->contains(I)); 5031 }; 5032 5033 // Worklist containing uniform instructions demanding lane 0. 
5034 SetVector<Instruction *> Worklist; 5035 BasicBlock *Latch = TheLoop->getLoopLatch(); 5036 5037 // Add uniform instructions demanding lane 0 to the worklist. Instructions 5038 // that are scalar with predication must not be considered uniform after 5039 // vectorization, because that would create an erroneous replicating region 5040 // where only a single instance out of VF should be formed. 5041 // TODO: optimize such seldom cases if found important, see PR40816. 5042 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5043 if (isOutOfScope(I)) { 5044 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5045 << *I << "\n"); 5046 return; 5047 } 5048 if (isScalarWithPredication(I)) { 5049 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5050 << *I << "\n"); 5051 return; 5052 } 5053 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5054 Worklist.insert(I); 5055 }; 5056 5057 // Start with the conditional branch. If the branch condition is an 5058 // instruction contained in the loop that is only used by the branch, it is 5059 // uniform. 5060 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5061 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5062 addToWorklistIfAllowed(Cmp); 5063 5064 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5065 InstWidening WideningDecision = getWideningDecision(I, VF); 5066 assert(WideningDecision != CM_Unknown && 5067 "Widening decision should be ready at this moment"); 5068 5069 // A uniform memory op is itself uniform. We exclude uniform stores 5070 // here as they demand the last lane, not the first one. 5071 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5072 assert(WideningDecision == CM_Scalarize); 5073 return true; 5074 } 5075 5076 return (WideningDecision == CM_Widen || 5077 WideningDecision == CM_Widen_Reverse || 5078 WideningDecision == CM_Interleave); 5079 }; 5080 5081 5082 // Returns true if Ptr is the pointer operand of a memory access instruction 5083 // I, and I is known to not require scalarization. 5084 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5085 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5086 }; 5087 5088 // Holds a list of values which are known to have at least one uniform use. 5089 // Note that there may be other uses which aren't uniform. A "uniform use" 5090 // here is something which only demands lane 0 of the unrolled iterations; 5091 // it does not imply that all lanes produce the same value (e.g. this is not 5092 // the usual meaning of uniform) 5093 SetVector<Value *> HasUniformUse; 5094 5095 // Scan the loop for instructions which are either a) known to have only 5096 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5097 for (auto *BB : TheLoop->blocks()) 5098 for (auto &I : *BB) { 5099 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5100 switch (II->getIntrinsicID()) { 5101 case Intrinsic::sideeffect: 5102 case Intrinsic::experimental_noalias_scope_decl: 5103 case Intrinsic::assume: 5104 case Intrinsic::lifetime_start: 5105 case Intrinsic::lifetime_end: 5106 if (TheLoop->hasLoopInvariantOperands(&I)) 5107 addToWorklistIfAllowed(&I); 5108 break; 5109 default: 5110 break; 5111 } 5112 } 5113 5114 // ExtractValue instructions must be uniform, because the operands are 5115 // known to be loop-invariant. 
5116 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5117 assert(isOutOfScope(EVI->getAggregateOperand()) && 5118 "Expected aggregate value to be loop invariant"); 5119 addToWorklistIfAllowed(EVI); 5120 continue; 5121 } 5122 5123 // If there's no pointer operand, there's nothing to do. 5124 auto *Ptr = getLoadStorePointerOperand(&I); 5125 if (!Ptr) 5126 continue; 5127 5128 // A uniform memory op is itself uniform. We exclude uniform stores 5129 // here as they demand the last lane, not the first one. 5130 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5131 addToWorklistIfAllowed(&I); 5132 5133 if (isUniformDecision(&I, VF)) { 5134 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5135 HasUniformUse.insert(Ptr); 5136 } 5137 } 5138 5139 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5140 // demanding) users. Since loops are assumed to be in LCSSA form, this 5141 // disallows uses outside the loop as well. 5142 for (auto *V : HasUniformUse) { 5143 if (isOutOfScope(V)) 5144 continue; 5145 auto *I = cast<Instruction>(V); 5146 auto UsersAreMemAccesses = 5147 llvm::all_of(I->users(), [&](User *U) -> bool { 5148 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5149 }); 5150 if (UsersAreMemAccesses) 5151 addToWorklistIfAllowed(I); 5152 } 5153 5154 // Expand Worklist in topological order: whenever a new instruction 5155 // is added , its users should be already inside Worklist. It ensures 5156 // a uniform instruction will only be used by uniform instructions. 5157 unsigned idx = 0; 5158 while (idx != Worklist.size()) { 5159 Instruction *I = Worklist[idx++]; 5160 5161 for (auto OV : I->operand_values()) { 5162 // isOutOfScope operands cannot be uniform instructions. 5163 if (isOutOfScope(OV)) 5164 continue; 5165 // First order recurrence Phi's should typically be considered 5166 // non-uniform. 5167 auto *OP = dyn_cast<PHINode>(OV); 5168 if (OP && Legal->isFirstOrderRecurrence(OP)) 5169 continue; 5170 // If all the users of the operand are uniform, then add the 5171 // operand into the uniform worklist. 5172 auto *OI = cast<Instruction>(OV); 5173 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5174 auto *J = cast<Instruction>(U); 5175 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5176 })) 5177 addToWorklistIfAllowed(OI); 5178 } 5179 } 5180 5181 // For an instruction to be added into Worklist above, all its users inside 5182 // the loop should also be in Worklist. However, this condition cannot be 5183 // true for phi nodes that form a cyclic dependence. We must process phi 5184 // nodes separately. An induction variable will remain uniform if all users 5185 // of the induction variable and induction variable update remain uniform. 5186 // The code below handles both pointer and non-pointer induction variables. 5187 for (auto &Induction : Legal->getInductionVars()) { 5188 auto *Ind = Induction.first; 5189 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5190 5191 // Determine if all users of the induction variable are uniform after 5192 // vectorization. 5193 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5194 auto *I = cast<Instruction>(U); 5195 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5196 isVectorizedMemAccessUse(I, Ind); 5197 }); 5198 if (!UniformInd) 5199 continue; 5200 5201 // Determine if all users of the induction variable update instruction are 5202 // uniform after vectorization. 
5203 auto UniformIndUpdate = 5204 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5205 auto *I = cast<Instruction>(U); 5206 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5207 isVectorizedMemAccessUse(I, IndUpdate); 5208 }); 5209 if (!UniformIndUpdate) 5210 continue; 5211 5212 // The induction variable and its update instruction will remain uniform. 5213 addToWorklistIfAllowed(Ind); 5214 addToWorklistIfAllowed(IndUpdate); 5215 } 5216 5217 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5218 } 5219 5220 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5221 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5222 5223 if (Legal->getRuntimePointerChecking()->Need) { 5224 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5225 "runtime pointer checks needed. Enable vectorization of this " 5226 "loop with '#pragma clang loop vectorize(enable)' when " 5227 "compiling with -Os/-Oz", 5228 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5229 return true; 5230 } 5231 5232 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5233 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5234 "runtime SCEV checks needed. Enable vectorization of this " 5235 "loop with '#pragma clang loop vectorize(enable)' when " 5236 "compiling with -Os/-Oz", 5237 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5238 return true; 5239 } 5240 5241 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5242 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5243 reportVectorizationFailure("Runtime stride check for small trip count", 5244 "runtime stride == 1 checks needed. Enable vectorization of " 5245 "this loop without such check by compiling with -Os/-Oz", 5246 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5247 return true; 5248 } 5249 5250 return false; 5251 } 5252 5253 ElementCount 5254 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5255 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 5256 return ElementCount::getScalable(0); 5257 5258 if (Hints->isScalableVectorizationDisabled()) { 5259 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5260 "ScalableVectorizationDisabled", ORE, TheLoop); 5261 return ElementCount::getScalable(0); 5262 } 5263 5264 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 5265 5266 auto MaxScalableVF = ElementCount::getScalable( 5267 std::numeric_limits<ElementCount::ScalarTy>::max()); 5268 5269 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5270 // FIXME: While for scalable vectors this is currently sufficient, this should 5271 // be replaced by a more detailed mechanism that filters out specific VFs, 5272 // instead of invalidating vectorization for a whole set of VFs based on the 5273 // MaxVF. 5274 5275 // Disable scalable vectorization if the loop contains unsupported reductions. 5276 if (!canVectorizeReductions(MaxScalableVF)) { 5277 reportVectorizationInfo( 5278 "Scalable vectorization not supported for the reduction " 5279 "operations found in this loop.", 5280 "ScalableVFUnfeasible", ORE, TheLoop); 5281 return ElementCount::getScalable(0); 5282 } 5283 5284 // Disable scalable vectorization if the loop contains any instructions 5285 // with element types not supported for scalable vectors. 
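// (For example, a target may support scalable vectors of i32 or float but
// not of a wider or irregular element type such as i128 or x86_fp80; the
// concrete set of legal element types is target-specific.)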
5286 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5287 return !Ty->isVoidTy() && 5288 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5289 })) { 5290 reportVectorizationInfo("Scalable vectorization is not supported " 5291 "for all element types found in this loop.", 5292 "ScalableVFUnfeasible", ORE, TheLoop); 5293 return ElementCount::getScalable(0); 5294 } 5295 5296 if (Legal->isSafeForAnyVectorWidth()) 5297 return MaxScalableVF; 5298 5299 // Limit MaxScalableVF by the maximum safe dependence distance. 5300 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5301 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 5302 MaxVScale = 5303 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 5304 MaxScalableVF = ElementCount::getScalable( 5305 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5306 if (!MaxScalableVF) 5307 reportVectorizationInfo( 5308 "Max legal vector width too small, scalable vectorization " 5309 "unfeasible.", 5310 "ScalableVFUnfeasible", ORE, TheLoop); 5311 5312 return MaxScalableVF; 5313 } 5314 5315 FixedScalableVFPair 5316 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5317 ElementCount UserVF) { 5318 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5319 unsigned SmallestType, WidestType; 5320 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5321 5322 // Get the maximum safe dependence distance in bits computed by LAA. 5323 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5324 // the memory accesses that is most restrictive (involved in the smallest 5325 // dependence distance). 5326 unsigned MaxSafeElements = 5327 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5328 5329 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5330 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5331 5332 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5333 << ".\n"); 5334 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5335 << ".\n"); 5336 5337 // First analyze the UserVF, fall back if the UserVF should be ignored. 5338 if (UserVF) { 5339 auto MaxSafeUserVF = 5340 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5341 5342 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5343 // If `VF=vscale x N` is safe, then so is `VF=N` 5344 if (UserVF.isScalable()) 5345 return FixedScalableVFPair( 5346 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5347 else 5348 return UserVF; 5349 } 5350 5351 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5352 5353 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5354 // is better to ignore the hint and let the compiler choose a suitable VF. 
5355 if (!UserVF.isScalable()) { 5356 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5357 << " is unsafe, clamping to max safe VF=" 5358 << MaxSafeFixedVF << ".\n"); 5359 ORE->emit([&]() { 5360 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5361 TheLoop->getStartLoc(), 5362 TheLoop->getHeader()) 5363 << "User-specified vectorization factor " 5364 << ore::NV("UserVectorizationFactor", UserVF) 5365 << " is unsafe, clamping to maximum safe vectorization factor " 5366 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5367 }); 5368 return MaxSafeFixedVF; 5369 } 5370 5371 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5372 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5373 << " is ignored because scalable vectors are not " 5374 "available.\n"); 5375 ORE->emit([&]() { 5376 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5377 TheLoop->getStartLoc(), 5378 TheLoop->getHeader()) 5379 << "User-specified vectorization factor " 5380 << ore::NV("UserVectorizationFactor", UserVF) 5381 << " is ignored because the target does not support scalable " 5382 "vectors. The compiler will pick a more suitable value."; 5383 }); 5384 } else { 5385 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5386 << " is unsafe. Ignoring scalable UserVF.\n"); 5387 ORE->emit([&]() { 5388 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5389 TheLoop->getStartLoc(), 5390 TheLoop->getHeader()) 5391 << "User-specified vectorization factor " 5392 << ore::NV("UserVectorizationFactor", UserVF) 5393 << " is unsafe. Ignoring the hint to let the compiler pick a " 5394 "more suitable value."; 5395 }); 5396 } 5397 } 5398 5399 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5400 << " / " << WidestType << " bits.\n"); 5401 5402 FixedScalableVFPair Result(ElementCount::getFixed(1), 5403 ElementCount::getScalable(0)); 5404 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5405 WidestType, MaxSafeFixedVF)) 5406 Result.FixedVF = MaxVF; 5407 5408 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5409 WidestType, MaxSafeScalableVF)) 5410 if (MaxVF.isScalable()) { 5411 Result.ScalableVF = MaxVF; 5412 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5413 << "\n"); 5414 } 5415 5416 return Result; 5417 } 5418 5419 FixedScalableVFPair 5420 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5421 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5422 // TODO: It may by useful to do since it's still likely to be dynamically 5423 // uniform if the target can skip. 5424 reportVectorizationFailure( 5425 "Not inserting runtime ptr check for divergent target", 5426 "runtime pointer checks needed. 
Not enabled for divergent target",
5427 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5428 return FixedScalableVFPair::getNone();
5429 }
5430
5431 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5432 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5433 if (TC == 1) {
5434 reportVectorizationFailure("Single iteration (non) loop",
5435 "loop trip count is one, irrelevant for vectorization",
5436 "SingleIterationLoop", ORE, TheLoop);
5437 return FixedScalableVFPair::getNone();
5438 }
5439
5440 switch (ScalarEpilogueStatus) {
5441 case CM_ScalarEpilogueAllowed:
5442 return computeFeasibleMaxVF(TC, UserVF);
5443 case CM_ScalarEpilogueNotAllowedUsePredicate:
5444 LLVM_FALLTHROUGH;
5445 case CM_ScalarEpilogueNotNeededUsePredicate:
5446 LLVM_DEBUG(
5447 dbgs() << "LV: vector predicate hint/switch found.\n"
5448 << "LV: Not allowing scalar epilogue, creating predicated "
5449 << "vector loop.\n");
5450 break;
5451 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5452 // fallthrough as a special case of OptForSize
5453 case CM_ScalarEpilogueNotAllowedOptSize:
5454 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5455 LLVM_DEBUG(
5456 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5457 else
5458 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5459 << "count.\n");
5460
5461 // Bail if runtime checks are required, which are not good when optimising
5462 // for size.
5463 if (runtimeChecksRequired())
5464 return FixedScalableVFPair::getNone();
5465
5466 break;
5467 }
5468
5469 // The only loops we can vectorize without a scalar epilogue are loops with
5470 // a bottom-test and a single exiting block. We'd have to handle the fact
5471 // that not every instruction executes on the last iteration. This will
5472 // require a lane mask which varies through the vector loop body. (TODO)
5473 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5474 // If there was a tail-folding hint/switch, but we can't fold the tail by
5475 // masking, fall back to a vectorization with a scalar epilogue.
5476 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5477 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5478 "scalar epilogue instead.\n");
5479 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5480 return computeFeasibleMaxVF(TC, UserVF);
5481 }
5482 return FixedScalableVFPair::getNone();
5483 }
5484
5485 // Now try tail folding.
5486
5487 // Invalidate interleave groups that require an epilogue if we can't mask
5488 // the interleave-group.
5489 if (!useMaskedInterleavedAccesses(TTI)) {
5490 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5491 "No decisions should have been taken at this point");
5492 // Note: There is no need to invalidate any cost modeling decisions here, as
5493 // none were taken so far.
5494 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5495 }
5496
5497 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
5498 // Avoid tail folding if the trip count is known to be a multiple of any VF
5499 // we choose.
5500 // FIXME: The condition below pessimises the case for fixed-width vectors,
5501 // when scalable VFs are also candidates for vectorization.
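// E.g. with a constant trip count of 64, MaxFixedVF = 8 and no user
// interleave count, the exit count is an exact multiple of 8, so no tail
// remains and tail folding is skipped.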
5502 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5503 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5504 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5505 "MaxFixedVF must be a power of 2"); 5506 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5507 : MaxFixedVF.getFixedValue(); 5508 ScalarEvolution *SE = PSE.getSE(); 5509 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5510 const SCEV *ExitCount = SE->getAddExpr( 5511 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5512 const SCEV *Rem = SE->getURemExpr( 5513 SE->applyLoopGuards(ExitCount, TheLoop), 5514 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5515 if (Rem->isZero()) { 5516 // Accept MaxFixedVF if we do not have a tail. 5517 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5518 return MaxFactors; 5519 } 5520 } 5521 5522 // For scalable vectors, don't use tail folding as this is currently not yet 5523 // supported. The code is likely to have ended up here if the tripcount is 5524 // low, in which case it makes sense not to use scalable vectors. 5525 if (MaxFactors.ScalableVF.isVector()) 5526 MaxFactors.ScalableVF = ElementCount::getScalable(0); 5527 5528 // If we don't know the precise trip count, or if the trip count that we 5529 // found modulo the vectorization factor is not zero, try to fold the tail 5530 // by masking. 5531 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5532 if (Legal->prepareToFoldTailByMasking()) { 5533 FoldTailByMasking = true; 5534 return MaxFactors; 5535 } 5536 5537 // If there was a tail-folding hint/switch, but we can't fold the tail by 5538 // masking, fallback to a vectorization with a scalar epilogue. 5539 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5540 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5541 "scalar epilogue instead.\n"); 5542 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5543 return MaxFactors; 5544 } 5545 5546 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5547 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5548 return FixedScalableVFPair::getNone(); 5549 } 5550 5551 if (TC == 0) { 5552 reportVectorizationFailure( 5553 "Unable to calculate the loop count due to complex control flow", 5554 "unable to calculate the loop count due to complex control flow", 5555 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5556 return FixedScalableVFPair::getNone(); 5557 } 5558 5559 reportVectorizationFailure( 5560 "Cannot optimize for size and vectorize at the same time.", 5561 "cannot optimize for size and vectorize at the same time. " 5562 "Enable vectorization of this loop with '#pragma clang loop " 5563 "vectorize(enable)' when compiling with -Os/-Oz", 5564 "NoTailLoopWithOptForSize", ORE, TheLoop); 5565 return FixedScalableVFPair::getNone(); 5566 } 5567 5568 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5569 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5570 const ElementCount &MaxSafeVF) { 5571 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5572 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5573 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5574 : TargetTransformInfo::RGK_FixedWidthVector); 5575 5576 // Convenience function to return the minimum of two ElementCounts. 
  auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
    assert((LHS.isScalable() == RHS.isScalable()) &&
           "Scalable flags must match");
    return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
  };

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
  auto MaxVectorElementCount = ElementCount::get(
      PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
      ComputeScalableMaxVF);
  MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << (MaxVectorElementCount * WidestType) << " bits.\n");

  if (!MaxVectorElementCount) {
    LLVM_DEBUG(dbgs() << "LV: The target has no "
                      << (ComputeScalableMaxVF ? "scalable" : "fixed")
                      << " vector registers.\n");
    return ElementCount::getFixed(1);
  }

  const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
  if (ConstTripCount &&
      ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
      isPowerOf2_32(ConstTripCount)) {
    // We need to clamp the VF to be the ConstTripCount. There is no point in
    // choosing a higher viable VF as done in the loop below. If
    // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
    // the TC is less than or equal to the known number of lanes.
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
                      << ConstTripCount << "\n");
    return TripCountEC;
  }

  ElementCount MaxVF = MaxVectorElementCount;
  if (TTI.shouldMaximizeVectorBandwidth() ||
      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
    auto MaxVectorElementCountMaxBW = ElementCount::get(
        PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
        ComputeScalableMaxVF);
    MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);

    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorElementCount).
    SmallVector<ElementCount, 8> VFs;
    for (ElementCount VS = MaxVectorElementCount * 2;
         ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
      VFs.push_back(VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
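    // Illustrative example of the selection loop below (hypothetical values):
    // with VFs = {8, 16} and 32 registers in the vector register class, if the
    // usage computed for VF=16 peaks at 40 live vector values but VF=8 peaks
    // at 24, the scan from the back selects MaxVF = 8.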
5632 for (int i = RUs.size() - 1; i >= 0; --i) { 5633 bool Selected = true; 5634 for (auto &pair : RUs[i].MaxLocalUsers) { 5635 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5636 if (pair.second > TargetNumRegisters) 5637 Selected = false; 5638 } 5639 if (Selected) { 5640 MaxVF = VFs[i]; 5641 break; 5642 } 5643 } 5644 if (ElementCount MinVF = 5645 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5646 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5647 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5648 << ") with target's minimum: " << MinVF << '\n'); 5649 MaxVF = MinVF; 5650 } 5651 } 5652 } 5653 return MaxVF; 5654 } 5655 5656 bool LoopVectorizationCostModel::isMoreProfitable( 5657 const VectorizationFactor &A, const VectorizationFactor &B) const { 5658 InstructionCost CostA = A.Cost; 5659 InstructionCost CostB = B.Cost; 5660 5661 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5662 5663 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5664 MaxTripCount) { 5665 // If we are folding the tail and the trip count is a known (possibly small) 5666 // constant, the trip count will be rounded up to an integer number of 5667 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5668 // which we compare directly. When not folding the tail, the total cost will 5669 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5670 // approximated with the per-lane cost below instead of using the tripcount 5671 // as here. 5672 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5673 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5674 return RTCostA < RTCostB; 5675 } 5676 5677 // Improve estimate for the vector width if it is scalable. 5678 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5679 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5680 if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) { 5681 if (A.Width.isScalable()) 5682 EstimatedWidthA *= VScale.getValue(); 5683 if (B.Width.isScalable()) 5684 EstimatedWidthB *= VScale.getValue(); 5685 } 5686 5687 // When set to preferred, for now assume vscale may be larger than 1 (or the 5688 // one being tuned for), so that scalable vectorization is slightly favorable 5689 // over fixed-width vectorization. 
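  // Illustrative example (hypothetical costs): comparing A.Width = vscale x 4
  // (EstimatedWidthA = 4 when no vscale tuning value is known) against a fixed
  // B.Width = 8, the check below prefers the scalable factor whenever
  // CostA * 8 <= CostB * 4, i.e. when A's cost per estimated lane does not
  // exceed B's.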
5690 if (Hints->isScalableVectorizationPreferred()) 5691 if (A.Width.isScalable() && !B.Width.isScalable()) 5692 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5693 5694 // To avoid the need for FP division: 5695 // (CostA / A.Width) < (CostB / B.Width) 5696 // <=> (CostA * B.Width) < (CostB * A.Width) 5697 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5698 } 5699 5700 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5701 const ElementCountSet &VFCandidates) { 5702 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5703 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5704 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5705 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5706 "Expected Scalar VF to be a candidate"); 5707 5708 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 5709 VectorizationFactor ChosenFactor = ScalarCost; 5710 5711 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5712 if (ForceVectorization && VFCandidates.size() > 1) { 5713 // Ignore scalar width, because the user explicitly wants vectorization. 5714 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5715 // evaluation. 5716 ChosenFactor.Cost = InstructionCost::getMax(); 5717 } 5718 5719 SmallVector<InstructionVFPair> InvalidCosts; 5720 for (const auto &i : VFCandidates) { 5721 // The cost for scalar VF=1 is already calculated, so ignore it. 5722 if (i.isScalar()) 5723 continue; 5724 5725 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5726 VectorizationFactor Candidate(i, C.first); 5727 5728 #ifndef NDEBUG 5729 unsigned AssumedMinimumVscale = 1; 5730 if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) 5731 AssumedMinimumVscale = VScale.getValue(); 5732 unsigned Width = 5733 Candidate.Width.isScalable() 5734 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5735 : Candidate.Width.getFixedValue(); 5736 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5737 << " costs: " << (Candidate.Cost / Width)); 5738 if (i.isScalable()) 5739 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5740 << AssumedMinimumVscale << ")"); 5741 LLVM_DEBUG(dbgs() << ".\n"); 5742 #endif 5743 5744 if (!C.second && !ForceVectorization) { 5745 LLVM_DEBUG( 5746 dbgs() << "LV: Not considering vector loop of width " << i 5747 << " because it will not generate any vector instructions.\n"); 5748 continue; 5749 } 5750 5751 // If profitable add it to ProfitableVF list. 5752 if (isMoreProfitable(Candidate, ScalarCost)) 5753 ProfitableVFs.push_back(Candidate); 5754 5755 if (isMoreProfitable(Candidate, ChosenFactor)) 5756 ChosenFactor = Candidate; 5757 } 5758 5759 // Emit a report of VFs with invalid costs in the loop. 5760 if (!InvalidCosts.empty()) { 5761 // Group the remarks per instruction, keeping the instruction order from 5762 // InvalidCosts. 5763 std::map<Instruction *, unsigned> Numbering; 5764 unsigned I = 0; 5765 for (auto &Pair : InvalidCosts) 5766 if (!Numbering.count(Pair.first)) 5767 Numbering[Pair.first] = I++; 5768 5769 // Sort the list, first on instruction(number) then on VF. 
5770 llvm::sort(InvalidCosts, 5771 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5772 if (Numbering[A.first] != Numbering[B.first]) 5773 return Numbering[A.first] < Numbering[B.first]; 5774 ElementCountComparator ECC; 5775 return ECC(A.second, B.second); 5776 }); 5777 5778 // For a list of ordered instruction-vf pairs: 5779 // [(load, vf1), (load, vf2), (store, vf1)] 5780 // Group the instructions together to emit separate remarks for: 5781 // load (vf1, vf2) 5782 // store (vf1) 5783 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5784 auto Subset = ArrayRef<InstructionVFPair>(); 5785 do { 5786 if (Subset.empty()) 5787 Subset = Tail.take_front(1); 5788 5789 Instruction *I = Subset.front().first; 5790 5791 // If the next instruction is different, or if there are no other pairs, 5792 // emit a remark for the collated subset. e.g. 5793 // [(load, vf1), (load, vf2))] 5794 // to emit: 5795 // remark: invalid costs for 'load' at VF=(vf, vf2) 5796 if (Subset == Tail || Tail[Subset.size()].first != I) { 5797 std::string OutString; 5798 raw_string_ostream OS(OutString); 5799 assert(!Subset.empty() && "Unexpected empty range"); 5800 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5801 for (auto &Pair : Subset) 5802 OS << (Pair.second == Subset.front().second ? "" : ", ") 5803 << Pair.second; 5804 OS << "):"; 5805 if (auto *CI = dyn_cast<CallInst>(I)) 5806 OS << " call to " << CI->getCalledFunction()->getName(); 5807 else 5808 OS << " " << I->getOpcodeName(); 5809 OS.flush(); 5810 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5811 Tail = Tail.drop_front(Subset.size()); 5812 Subset = {}; 5813 } else 5814 // Grow the subset by one element 5815 Subset = Tail.take_front(Subset.size() + 1); 5816 } while (!Tail.empty()); 5817 } 5818 5819 if (!EnableCondStoresVectorization && NumPredStores) { 5820 reportVectorizationFailure("There are conditional stores.", 5821 "store that is conditionally executed prevents vectorization", 5822 "ConditionalStore", ORE, TheLoop); 5823 ChosenFactor = ScalarCost; 5824 } 5825 5826 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5827 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 5828 << "LV: Vectorization seems to be not beneficial, " 5829 << "but was forced by a user.\n"); 5830 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5831 return ChosenFactor; 5832 } 5833 5834 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5835 const Loop &L, ElementCount VF) const { 5836 // Cross iteration phis such as reductions need special handling and are 5837 // currently unsupported. 5838 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5839 return Legal->isFirstOrderRecurrence(&Phi) || 5840 Legal->isReductionVariable(&Phi); 5841 })) 5842 return false; 5843 5844 // Phis with uses outside of the loop require special handling and are 5845 // currently unsupported. 5846 for (auto &Entry : Legal->getInductionVars()) { 5847 // Look for uses of the value of the induction at the last iteration. 5848 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5849 for (User *U : PostInc->users()) 5850 if (!L.contains(cast<Instruction>(U))) 5851 return false; 5852 // Look for uses of penultimate value of the induction. 5853 for (User *U : Entry.first->users()) 5854 if (!L.contains(cast<Instruction>(U))) 5855 return false; 5856 } 5857 5858 // Induction variables that are widened require special handling that is 5859 // currently not supported. 
  if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
        return !(this->isScalarAfterVectorization(Entry.first, VF) ||
                 this->isProfitableToScalarize(Entry.first, VF));
      }))
    return false;

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
  if (L.getExitingBlock() != L.getLoopLatch())
    return false;

  return true;
}

bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    const ElementCount VF) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.
  // We also consider epilogue vectorization unprofitable for targets that
  // don't consider interleaving beneficial (eg. MVE).
  if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
    return false;
  if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
    return true;
  return false;
}

VectorizationFactor
LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
    return Result;
  }

  if (!isScalarEpilogueAllowed()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
                  "allowed.\n";);
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
                  "not a supported candidate.\n";);
    return Result;
  }

  if (EpilogueVectorizationForceVF > 1) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
    ElementCount ForcedEC =
        ElementCount::getFixed(EpilogueVectorizationForceVF);
    if (LVP.hasPlanWithVF(ForcedEC))
      return {ForcedEC, 0};
    else {
      LLVM_DEBUG(
          dbgs()
          << "LEV: Epilogue vectorization forced factor is not viable.\n";);
      return Result;
    }
  }

  if (TheLoop->getHeader()->getParent()->hasOptSize() ||
      TheLoop->getHeader()->getParent()->hasMinSize()) {
    LLVM_DEBUG(
        dbgs()
        << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
    return Result;
  }

  auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
  if (MainLoopVF.isScalable())
    LLVM_DEBUG(
        dbgs() << "LEV: Epilogue vectorization using scalable vectors not "
                  "yet supported. 
Converting to fixed-width (VF=" 5941 << FixedMainLoopVF << ") instead\n"); 5942 5943 if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) { 5944 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5945 "this loop\n"); 5946 return Result; 5947 } 5948 5949 for (auto &NextVF : ProfitableVFs) 5950 if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) && 5951 (Result.Width.getFixedValue() == 1 || 5952 isMoreProfitable(NextVF, Result)) && 5953 LVP.hasPlanWithVF(NextVF.Width)) 5954 Result = NextVF; 5955 5956 if (Result != VectorizationFactor::Disabled()) 5957 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5958 << Result.Width.getFixedValue() << "\n";); 5959 return Result; 5960 } 5961 5962 std::pair<unsigned, unsigned> 5963 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5964 unsigned MinWidth = -1U; 5965 unsigned MaxWidth = 8; 5966 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5967 for (Type *T : ElementTypesInLoop) { 5968 MinWidth = std::min<unsigned>( 5969 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5970 MaxWidth = std::max<unsigned>( 5971 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5972 } 5973 return {MinWidth, MaxWidth}; 5974 } 5975 5976 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5977 ElementTypesInLoop.clear(); 5978 // For each block. 5979 for (BasicBlock *BB : TheLoop->blocks()) { 5980 // For each instruction in the loop. 5981 for (Instruction &I : BB->instructionsWithoutDebug()) { 5982 Type *T = I.getType(); 5983 5984 // Skip ignored values. 5985 if (ValuesToIgnore.count(&I)) 5986 continue; 5987 5988 // Only examine Loads, Stores and PHINodes. 5989 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5990 continue; 5991 5992 // Examine PHI nodes that are reduction variables. Update the type to 5993 // account for the recurrence type. 5994 if (auto *PN = dyn_cast<PHINode>(&I)) { 5995 if (!Legal->isReductionVariable(PN)) 5996 continue; 5997 const RecurrenceDescriptor &RdxDesc = 5998 Legal->getReductionVars().find(PN)->second; 5999 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 6000 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6001 RdxDesc.getRecurrenceType(), 6002 TargetTransformInfo::ReductionFlags())) 6003 continue; 6004 T = RdxDesc.getRecurrenceType(); 6005 } 6006 6007 // Examine the stored values. 6008 if (auto *ST = dyn_cast<StoreInst>(&I)) 6009 T = ST->getValueOperand()->getType(); 6010 6011 // Ignore loaded pointer types and stored pointer types that are not 6012 // vectorizable. 6013 // 6014 // FIXME: The check here attempts to predict whether a load or store will 6015 // be vectorized. We only know this for certain after a VF has 6016 // been selected. Here, we assume that if an access can be 6017 // vectorized, it will be. We should also look at extending this 6018 // optimization to non-pointer types. 6019 // 6020 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6021 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6022 continue; 6023 6024 ElementTypesInLoop.insert(T); 6025 } 6026 } 6027 } 6028 6029 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6030 unsigned LoopCost) { 6031 // -- The interleave heuristics -- 6032 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6033 // There are many micro-architectural considerations that we can't predict 6034 // at this level. 
  // For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  //    iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  //    overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  //    due to the increased register pressure.

  if (!isScalarEpilogueAllowed())
    return 1;

  // The maximum safe dependence distance has already been used to limit the
  // VF; do not interleave in that case.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
  const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
  // because with the above conditions interleaving can expose ILP and break
  // cross-iteration dependences for reductions.
  if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
      !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
    return 1;

  RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto &pair : R.MaxLocalUsers) {
    pair.second = std::max(pair.second, 1U);
  }

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power-of-two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when the tail is folded by
  // masking; this currently happens when OptForSize, in which case IC is set
  // to 1 above.
  unsigned IC = UINT_MAX;

  for (auto &pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
    if (VF.isScalar()) {
      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumScalarRegs;
    } else {
      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
    unsigned MaxLocalUsers = pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
      LoopInvariantRegs = R.LoopInvariantRegs[pair.first];

    unsigned TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
    // Don't count the induction variable as interleaved.
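    // Illustrative example (hypothetical values): with TargetNumRegisters = 32,
    // LoopInvariantRegs = 2 and MaxLocalUsers = 4, the plain formula above
    // gives PowerOf2Floor((32 - 2) / 4) = 4, while the induction-variable
    // adjustment below gives PowerOf2Floor((32 - 2 - 1) / 3) = 8.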
6102 if (EnableIndVarRegisterHeur) { 6103 TmpIC = 6104 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6105 std::max(1U, (MaxLocalUsers - 1))); 6106 } 6107 6108 IC = std::min(IC, TmpIC); 6109 } 6110 6111 // Clamp the interleave ranges to reasonable counts. 6112 unsigned MaxInterleaveCount = 6113 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6114 6115 // Check if the user has overridden the max. 6116 if (VF.isScalar()) { 6117 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6118 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6119 } else { 6120 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6121 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6122 } 6123 6124 // If trip count is known or estimated compile time constant, limit the 6125 // interleave count to be less than the trip count divided by VF, provided it 6126 // is at least 1. 6127 // 6128 // For scalable vectors we can't know if interleaving is beneficial. It may 6129 // not be beneficial for small loops if none of the lanes in the second vector 6130 // iterations is enabled. However, for larger loops, there is likely to be a 6131 // similar benefit as for fixed-width vectors. For now, we choose to leave 6132 // the InterleaveCount as if vscale is '1', although if some information about 6133 // the vector is known (e.g. min vector size), we can make a better decision. 6134 if (BestKnownTC) { 6135 MaxInterleaveCount = 6136 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6137 // Make sure MaxInterleaveCount is greater than 0. 6138 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6139 } 6140 6141 assert(MaxInterleaveCount > 0 && 6142 "Maximum interleave count must be greater than 0"); 6143 6144 // Clamp the calculated IC to be between the 1 and the max interleave count 6145 // that the target and trip count allows. 6146 if (IC > MaxInterleaveCount) 6147 IC = MaxInterleaveCount; 6148 else 6149 // Make sure IC is greater than 0. 6150 IC = std::max(1u, IC); 6151 6152 assert(IC > 0 && "Interleave count must be greater than 0."); 6153 6154 // If we did not calculate the cost for VF (because the user selected the VF) 6155 // then we calculate the cost of VF here. 6156 if (LoopCost == 0) { 6157 InstructionCost C = expectedCost(VF).first; 6158 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 6159 LoopCost = *C.getValue(); 6160 } 6161 6162 assert(LoopCost && "Non-zero loop cost expected"); 6163 6164 // Interleave if we vectorized this loop and there is a reduction that could 6165 // benefit from interleaving. 6166 if (VF.isVector() && HasReductions) { 6167 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6168 return IC; 6169 } 6170 6171 // Note that if we've already vectorized the loop we will have done the 6172 // runtime check and so interleaving won't require further checks. 6173 bool InterleavingRequiresRuntimePointerCheck = 6174 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6175 6176 // We want to interleave small loops in order to reduce the loop overhead and 6177 // potentially expose ILP opportunities. 
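  // Illustrative example for the small-loop path below (hypothetical values,
  // assuming a SmallLoopCost threshold of 20): with an estimated LoopCost of
  // 3, the branch caps the interleave count at
  // min(IC, PowerOf2Floor(20 / 3)) = min(IC, 4).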
6178 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6179 << "LV: IC is " << IC << '\n' 6180 << "LV: VF is " << VF << '\n'); 6181 const bool AggressivelyInterleaveReductions = 6182 TTI.enableAggressiveInterleaving(HasReductions); 6183 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6184 // We assume that the cost overhead is 1 and we use the cost model 6185 // to estimate the cost of the loop and interleave until the cost of the 6186 // loop overhead is about 5% of the cost of the loop. 6187 unsigned SmallIC = 6188 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6189 6190 // Interleave until store/load ports (estimated by max interleave count) are 6191 // saturated. 6192 unsigned NumStores = Legal->getNumStores(); 6193 unsigned NumLoads = Legal->getNumLoads(); 6194 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6195 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6196 6197 // There is little point in interleaving for reductions containing selects 6198 // and compares when VF=1 since it may just create more overhead than it's 6199 // worth for loops with small trip counts. This is because we still have to 6200 // do the final reduction after the loop. 6201 bool HasSelectCmpReductions = 6202 HasReductions && 6203 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6204 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6205 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 6206 RdxDesc.getRecurrenceKind()); 6207 }); 6208 if (HasSelectCmpReductions) { 6209 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 6210 return 1; 6211 } 6212 6213 // If we have a scalar reduction (vector reductions are already dealt with 6214 // by this point), we can increase the critical path length if the loop 6215 // we're interleaving is inside another loop. For tree-wise reductions 6216 // set the limit to 2, and for ordered reductions it's best to disable 6217 // interleaving entirely. 6218 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6219 bool HasOrderedReductions = 6220 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6221 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6222 return RdxDesc.isOrdered(); 6223 }); 6224 if (HasOrderedReductions) { 6225 LLVM_DEBUG( 6226 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 6227 return 1; 6228 } 6229 6230 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6231 SmallIC = std::min(SmallIC, F); 6232 StoresIC = std::min(StoresIC, F); 6233 LoadsIC = std::min(LoadsIC, F); 6234 } 6235 6236 if (EnableLoadStoreRuntimeInterleave && 6237 std::max(StoresIC, LoadsIC) > SmallIC) { 6238 LLVM_DEBUG( 6239 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6240 return std::max(StoresIC, LoadsIC); 6241 } 6242 6243 // If there are scalar reductions and TTI has enabled aggressive 6244 // interleaving for reductions, we will interleave to expose ILP. 6245 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6246 AggressivelyInterleaveReductions) { 6247 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6248 // Interleave no less than SmallIC but not as aggressive as the normal IC 6249 // to satisfy the rare situation when resources are too limited. 
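      // e.g. with IC = 8 and SmallIC = 2 (hypothetical values), this returns
      // max(8 / 2, 2) = 4.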
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is a
  // very rough estimation. We scan the loop in topological order to assign a
  // number to each instruction. We use RPO to ensure that defs are met before
  // their users. We assume that each instruction that has in-loop users starts
  // an interval. We record every time that an in-loop value is used, so we
  // have a list of the first and last occurrences of each instruction. Next,
  // we transpose this data structure into a multi map that holds the list of
  // intervals that *end* at a specific location. This multi map allows us to
  // perform a linear search. We scan the instructions linearly and record each
  // time that a new interval starts, by placing it in a set. If we find this
  // value in the multi-map then we remove it from the set. The max register
  // usage is the maximum size of the set. We also search for instructions that
  // are defined outside the loop, but are used inside the loop. We need this
  // number separately from the max-interval usage number because, when we
  // unroll, loop-invariant values do not take up more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps an index to its instruction.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the set of instructions that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
6333 using InstrList = SmallVector<Instruction *, 2>; 6334 DenseMap<unsigned, InstrList> TransposeEnds; 6335 6336 // Transpose the EndPoints to a list of values that end at each index. 6337 for (auto &Interval : EndPoint) 6338 TransposeEnds[Interval.second].push_back(Interval.first); 6339 6340 SmallPtrSet<Instruction *, 8> OpenIntervals; 6341 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6342 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6343 6344 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6345 6346 // A lambda that gets the register usage for the given type and VF. 6347 const auto &TTICapture = TTI; 6348 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6349 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6350 return 0; 6351 InstructionCost::CostType RegUsage = 6352 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6353 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6354 "Nonsensical values for register usage."); 6355 return RegUsage; 6356 }; 6357 6358 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6359 Instruction *I = IdxToInstr[i]; 6360 6361 // Remove all of the instructions that end at this location. 6362 InstrList &List = TransposeEnds[i]; 6363 for (Instruction *ToRemove : List) 6364 OpenIntervals.erase(ToRemove); 6365 6366 // Ignore instructions that are never used within the loop. 6367 if (!Ends.count(I)) 6368 continue; 6369 6370 // Skip ignored values. 6371 if (ValuesToIgnore.count(I)) 6372 continue; 6373 6374 // For each VF find the maximum usage of registers. 6375 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6376 // Count the number of live intervals. 6377 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6378 6379 if (VFs[j].isScalar()) { 6380 for (auto Inst : OpenIntervals) { 6381 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6382 if (RegUsage.find(ClassID) == RegUsage.end()) 6383 RegUsage[ClassID] = 1; 6384 else 6385 RegUsage[ClassID] += 1; 6386 } 6387 } else { 6388 collectUniformsAndScalars(VFs[j]); 6389 for (auto Inst : OpenIntervals) { 6390 // Skip ignored values for VF > 1. 6391 if (VecValuesToIgnore.count(Inst)) 6392 continue; 6393 if (isScalarAfterVectorization(Inst, VFs[j])) { 6394 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6395 if (RegUsage.find(ClassID) == RegUsage.end()) 6396 RegUsage[ClassID] = 1; 6397 else 6398 RegUsage[ClassID] += 1; 6399 } else { 6400 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6401 if (RegUsage.find(ClassID) == RegUsage.end()) 6402 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6403 else 6404 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6405 } 6406 } 6407 } 6408 6409 for (auto& pair : RegUsage) { 6410 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6411 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6412 else 6413 MaxUsages[j][pair.first] = pair.second; 6414 } 6415 } 6416 6417 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6418 << OpenIntervals.size() << '\n'); 6419 6420 // Add the current instruction to the list of open intervals. 6421 OpenIntervals.insert(I); 6422 } 6423 6424 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6425 SmallMapVector<unsigned, unsigned, 4> Invariant; 6426 6427 for (auto Inst : LoopInvariants) { 6428 unsigned Usage = 6429 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6430 unsigned ClassID = 6431 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6432 if (Invariant.find(ClassID) == Invariant.end()) 6433 Invariant[ClassID] = Usage; 6434 else 6435 Invariant[ClassID] += Usage; 6436 } 6437 6438 LLVM_DEBUG({ 6439 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6440 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6441 << " item\n"; 6442 for (const auto &pair : MaxUsages[i]) { 6443 dbgs() << "LV(REG): RegisterClass: " 6444 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6445 << " registers\n"; 6446 } 6447 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6448 << " item\n"; 6449 for (const auto &pair : Invariant) { 6450 dbgs() << "LV(REG): RegisterClass: " 6451 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6452 << " registers\n"; 6453 } 6454 }); 6455 6456 RU.LoopInvariantRegs = Invariant; 6457 RU.MaxLocalUsers = MaxUsages[i]; 6458 RUs[i] = RU; 6459 } 6460 6461 return RUs; 6462 } 6463 6464 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6465 // TODO: Cost model for emulated masked load/store is completely 6466 // broken. This hack guides the cost model to use an artificially 6467 // high enough value to practically disable vectorization with such 6468 // operations, except where previously deployed legality hack allowed 6469 // using very low cost values. This is to avoid regressions coming simply 6470 // from moving "masked load/store" check from legality to cost model. 6471 // Masked Load/Gather emulation was previously never allowed. 6472 // Limited number of Masked Store/Scatter emulation was allowed. 6473 assert(isPredicatedInst(I) && 6474 "Expecting a scalar emulated instruction"); 6475 return isa<LoadInst>(I) || 6476 (isa<StoreInst>(I) && 6477 NumPredStores > NumberOfStoresToPredicate); 6478 } 6479 6480 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6481 // If we aren't vectorizing the loop, or if we've already collected the 6482 // instructions to scalarize, there's nothing to do. Collection may already 6483 // have occurred if we have a user-selected VF and are now computing the 6484 // expected cost for interleaving. 6485 if (VF.isScalar() || VF.isZero() || 6486 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6487 return; 6488 6489 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6490 // not profitable to scalarize any instructions, the presence of VF in the 6491 // map will indicate that we've analyzed it already. 6492 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6493 6494 // Find all the instructions that are scalar with predication in the loop and 6495 // determine if it would be better to not if-convert the blocks they are in. 6496 // If so, we also record the instructions to scalarize. 6497 for (BasicBlock *BB : TheLoop->blocks()) { 6498 if (!blockNeedsPredicationForAnyReason(BB)) 6499 continue; 6500 for (Instruction &I : *BB) 6501 if (isScalarWithPredication(&I)) { 6502 ScalarCostsTy ScalarCosts; 6503 // Do not apply discount if scalable, because that would lead to 6504 // invalid scalarization costs. 6505 // Do not apply discount logic if hacked cost is needed 6506 // for emulated masked memrefs. 6507 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) && 6508 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6509 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6510 // Remember that BB will remain after vectorization. 
6511 PredicatedBBsAfterVectorization.insert(BB); 6512 } 6513 } 6514 } 6515 6516 int LoopVectorizationCostModel::computePredInstDiscount( 6517 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6518 assert(!isUniformAfterVectorization(PredInst, VF) && 6519 "Instruction marked uniform-after-vectorization will be predicated"); 6520 6521 // Initialize the discount to zero, meaning that the scalar version and the 6522 // vector version cost the same. 6523 InstructionCost Discount = 0; 6524 6525 // Holds instructions to analyze. The instructions we visit are mapped in 6526 // ScalarCosts. Those instructions are the ones that would be scalarized if 6527 // we find that the scalar version costs less. 6528 SmallVector<Instruction *, 8> Worklist; 6529 6530 // Returns true if the given instruction can be scalarized. 6531 auto canBeScalarized = [&](Instruction *I) -> bool { 6532 // We only attempt to scalarize instructions forming a single-use chain 6533 // from the original predicated block that would otherwise be vectorized. 6534 // Although not strictly necessary, we give up on instructions we know will 6535 // already be scalar to avoid traversing chains that are unlikely to be 6536 // beneficial. 6537 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6538 isScalarAfterVectorization(I, VF)) 6539 return false; 6540 6541 // If the instruction is scalar with predication, it will be analyzed 6542 // separately. We ignore it within the context of PredInst. 6543 if (isScalarWithPredication(I)) 6544 return false; 6545 6546 // If any of the instruction's operands are uniform after vectorization, 6547 // the instruction cannot be scalarized. This prevents, for example, a 6548 // masked load from being scalarized. 6549 // 6550 // We assume we will only emit a value for lane zero of an instruction 6551 // marked uniform after vectorization, rather than VF identical values. 6552 // Thus, if we scalarize an instruction that uses a uniform, we would 6553 // create uses of values corresponding to the lanes we aren't emitting code 6554 // for. This behavior can be changed by allowing getScalarValue to clone 6555 // the lane zero values for uniforms rather than asserting. 6556 for (Use &U : I->operands()) 6557 if (auto *J = dyn_cast<Instruction>(U.get())) 6558 if (isUniformAfterVectorization(J, VF)) 6559 return false; 6560 6561 // Otherwise, we can scalarize the instruction. 6562 return true; 6563 }; 6564 6565 // Compute the expected cost discount from scalarizing the entire expression 6566 // feeding the predicated instruction. We currently only consider expressions 6567 // that are single-use instruction chains. 6568 Worklist.push_back(PredInst); 6569 while (!Worklist.empty()) { 6570 Instruction *I = Worklist.pop_back_val(); 6571 6572 // If we've already analyzed the instruction, there's nothing to do. 6573 if (ScalarCosts.find(I) != ScalarCosts.end()) 6574 continue; 6575 6576 // Compute the cost of the vector instruction. Note that this cost already 6577 // includes the scalarization overhead of the predicated instruction. 6578 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6579 6580 // Compute the cost of the scalarized instruction. This cost is the cost of 6581 // the instruction as if it wasn't if-converted and instead remained in the 6582 // predicated block. We will scale this cost by block probability after 6583 // computing the scalarization overhead. 
6584 InstructionCost ScalarCost = 6585 VF.getFixedValue() * 6586 getInstructionCost(I, ElementCount::getFixed(1)).first; 6587 6588 // Compute the scalarization overhead of needed insertelement instructions 6589 // and phi nodes. 6590 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6591 ScalarCost += TTI.getScalarizationOverhead( 6592 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6593 APInt::getAllOnes(VF.getFixedValue()), true, false); 6594 ScalarCost += 6595 VF.getFixedValue() * 6596 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6597 } 6598 6599 // Compute the scalarization overhead of needed extractelement 6600 // instructions. For each of the instruction's operands, if the operand can 6601 // be scalarized, add it to the worklist; otherwise, account for the 6602 // overhead. 6603 for (Use &U : I->operands()) 6604 if (auto *J = dyn_cast<Instruction>(U.get())) { 6605 assert(VectorType::isValidElementType(J->getType()) && 6606 "Instruction has non-scalar type"); 6607 if (canBeScalarized(J)) 6608 Worklist.push_back(J); 6609 else if (needsExtract(J, VF)) { 6610 ScalarCost += TTI.getScalarizationOverhead( 6611 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6612 APInt::getAllOnes(VF.getFixedValue()), false, true); 6613 } 6614 } 6615 6616 // Scale the total scalar cost by block probability. 6617 ScalarCost /= getReciprocalPredBlockProb(); 6618 6619 // Compute the discount. A non-negative discount means the vector version 6620 // of the instruction costs more, and scalarizing would be beneficial. 6621 Discount += VectorCost - ScalarCost; 6622 ScalarCosts[I] = ScalarCost; 6623 } 6624 6625 return *Discount.getValue(); 6626 } 6627 6628 LoopVectorizationCostModel::VectorizationCostTy 6629 LoopVectorizationCostModel::expectedCost( 6630 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6631 VectorizationCostTy Cost; 6632 6633 // For each block. 6634 for (BasicBlock *BB : TheLoop->blocks()) { 6635 VectorizationCostTy BlockCost; 6636 6637 // For each instruction in the old loop. 6638 for (Instruction &I : BB->instructionsWithoutDebug()) { 6639 // Skip ignored values. 6640 if (ValuesToIgnore.count(&I) || 6641 (VF.isVector() && VecValuesToIgnore.count(&I))) 6642 continue; 6643 6644 VectorizationCostTy C = getInstructionCost(&I, VF); 6645 6646 // Check if we should override the cost. 6647 if (C.first.isValid() && 6648 ForceTargetInstructionCost.getNumOccurrences() > 0) 6649 C.first = InstructionCost(ForceTargetInstructionCost); 6650 6651 // Keep a list of instructions with invalid costs. 6652 if (Invalid && !C.first.isValid()) 6653 Invalid->emplace_back(&I, VF); 6654 6655 BlockCost.first += C.first; 6656 BlockCost.second |= C.second; 6657 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6658 << " for VF " << VF << " For instruction: " << I 6659 << '\n'); 6660 } 6661 6662 // If we are vectorizing a predicated block, it will have been 6663 // if-converted. This means that the block's instructions (aside from 6664 // stores and instructions that may divide by zero) will now be 6665 // unconditionally executed. For the scalar case, we may not always execute 6666 // the predicated block, if it is an if-else block. Thus, scale the block's 6667 // cost by the probability of executing it. blockNeedsPredication from 6668 // Legal is used so as to not include all blocks in tail folded loops. 
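    // Illustrative example (hypothetical numbers): an if-converted block whose
    // instructions sum to a cost of 10 contributes
    // 10 / getReciprocalPredBlockProb() = 5 to the scalar loop cost below,
    // assuming the default reciprocal block probability of 2 (the block is
    // expected to execute on roughly every other iteration).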
6669 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6670 BlockCost.first /= getReciprocalPredBlockProb(); 6671 6672 Cost.first += BlockCost.first; 6673 Cost.second |= BlockCost.second; 6674 } 6675 6676 return Cost; 6677 } 6678 6679 /// Gets Address Access SCEV after verifying that the access pattern 6680 /// is loop invariant except the induction variable dependence. 6681 /// 6682 /// This SCEV can be sent to the Target in order to estimate the address 6683 /// calculation cost. 6684 static const SCEV *getAddressAccessSCEV( 6685 Value *Ptr, 6686 LoopVectorizationLegality *Legal, 6687 PredicatedScalarEvolution &PSE, 6688 const Loop *TheLoop) { 6689 6690 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6691 if (!Gep) 6692 return nullptr; 6693 6694 // We are looking for a gep with all loop invariant indices except for one 6695 // which should be an induction variable. 6696 auto SE = PSE.getSE(); 6697 unsigned NumOperands = Gep->getNumOperands(); 6698 for (unsigned i = 1; i < NumOperands; ++i) { 6699 Value *Opd = Gep->getOperand(i); 6700 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6701 !Legal->isInductionVariable(Opd)) 6702 return nullptr; 6703 } 6704 6705 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6706 return PSE.getSCEV(Ptr); 6707 } 6708 6709 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6710 return Legal->hasStride(I->getOperand(0)) || 6711 Legal->hasStride(I->getOperand(1)); 6712 } 6713 6714 InstructionCost 6715 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6716 ElementCount VF) { 6717 assert(VF.isVector() && 6718 "Scalarization cost of instruction implies vectorization."); 6719 if (VF.isScalable()) 6720 return InstructionCost::getInvalid(); 6721 6722 Type *ValTy = getLoadStoreType(I); 6723 auto SE = PSE.getSE(); 6724 6725 unsigned AS = getLoadStoreAddressSpace(I); 6726 Value *Ptr = getLoadStorePointerOperand(I); 6727 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6728 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6729 // that it is being called from this specific place. 6730 6731 // Figure out whether the access is strided and get the stride value 6732 // if it's known in compile time 6733 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6734 6735 // Get the cost of the scalar memory instruction and address computation. 6736 InstructionCost Cost = 6737 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6738 6739 // Don't pass *I here, since it is scalar but will actually be part of a 6740 // vectorized loop where the user of it is a vectorized instruction. 6741 const Align Alignment = getLoadStoreAlignment(I); 6742 Cost += VF.getKnownMinValue() * 6743 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6744 AS, TTI::TCK_RecipThroughput); 6745 6746 // Get the overhead of the extractelement and insertelement instructions 6747 // we might create due to scalarization. 6748 Cost += getScalarizationOverhead(I, VF); 6749 6750 // If we have a predicated load/store, it will need extra i1 extracts and 6751 // conditional branches, but may not be executed for each vector lane. Scale 6752 // the cost by the probability of executing the predicated block. 
6753 if (isPredicatedInst(I)) { 6754 Cost /= getReciprocalPredBlockProb(); 6755 6756 // Add the cost of an i1 extract and a branch 6757 auto *Vec_i1Ty = 6758 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6759 Cost += TTI.getScalarizationOverhead( 6760 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6761 /*Insert=*/false, /*Extract=*/true); 6762 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6763 6764 if (useEmulatedMaskMemRefHack(I)) 6765 // Artificially setting to a high enough value to practically disable 6766 // vectorization with such operations. 6767 Cost = 3000000; 6768 } 6769 6770 return Cost; 6771 } 6772 6773 InstructionCost 6774 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6775 ElementCount VF) { 6776 Type *ValTy = getLoadStoreType(I); 6777 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6778 Value *Ptr = getLoadStorePointerOperand(I); 6779 unsigned AS = getLoadStoreAddressSpace(I); 6780 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6781 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6782 6783 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6784 "Stride should be 1 or -1 for consecutive memory access"); 6785 const Align Alignment = getLoadStoreAlignment(I); 6786 InstructionCost Cost = 0; 6787 if (Legal->isMaskRequired(I)) 6788 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6789 CostKind); 6790 else 6791 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6792 CostKind, I); 6793 6794 bool Reverse = ConsecutiveStride < 0; 6795 if (Reverse) 6796 Cost += 6797 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6798 return Cost; 6799 } 6800 6801 InstructionCost 6802 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6803 ElementCount VF) { 6804 assert(Legal->isUniformMemOp(*I)); 6805 6806 Type *ValTy = getLoadStoreType(I); 6807 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6808 const Align Alignment = getLoadStoreAlignment(I); 6809 unsigned AS = getLoadStoreAddressSpace(I); 6810 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6811 if (isa<LoadInst>(I)) { 6812 return TTI.getAddressComputationCost(ValTy) + 6813 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6814 CostKind) + 6815 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6816 } 6817 StoreInst *SI = cast<StoreInst>(I); 6818 6819 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6820 return TTI.getAddressComputationCost(ValTy) + 6821 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6822 CostKind) + 6823 (isLoopInvariantStoreValue 6824 ? 
              0
              : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
                                       VF.getKnownMinValue() - 1));
}

InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                 ElementCount VF) {
  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  const Value *Ptr = getLoadStorePointerOperand(I);

  return TTI.getAddressComputationCost(VectorTy) +
         TTI.getGatherScatterOpCost(
             I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
             TargetTransformInfo::TCK_RecipThroughput, I);
}

InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                   ElementCount VF) {
  // TODO: Once we have support for interleaving with scalable vectors
  // we can calculate the cost properly here.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Failed to get an interleaved access group.");

  unsigned InterleaveFactor = Group->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (Group->getMember(IF))
      Indices.push_back(IF);

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
      AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    Cost +=
        Group->getNumMembers() *
        TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
  }
  return Cost;
}

Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
    Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
  using namespace llvm::PatternMatch;
  // Early exit for no in-loop reductions.
  if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
    return None;
  auto *VectorTy = cast<VectorType>(Ty);

  // We are looking for one of the following patterns, finding the minimal
  // acceptable cost for it:
  //   reduce(mul(ext(A), ext(B))) or
  //   reduce(mul(A, B)) or
  //   reduce(ext(A)) or
  //   reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower, then we return it for
  // the reduction instruction and 0 for the other instructions in the pattern.
  // If it is not, we return an invalid cost specifying the original cost
  // method should be used.
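  // For example (hypothetical IR), for an in-loop reduction of the form
  //   %e = sext i8 %a to i32 ; %m = mul i32 %e, %f ; %r = add i32 %m, %phi
  // the walk below starts at the sext, steps to its single user (the mul) and
  // then to the add, which is the instruction that is looked up in
  // InLoopReductionImmediateChains.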
6906 Instruction *RetI = I; 6907 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 6908 if (!RetI->hasOneUser()) 6909 return None; 6910 RetI = RetI->user_back(); 6911 } 6912 if (match(RetI, m_Mul(m_Value(), m_Value())) && 6913 RetI->user_back()->getOpcode() == Instruction::Add) { 6914 if (!RetI->hasOneUser()) 6915 return None; 6916 RetI = RetI->user_back(); 6917 } 6918 6919 // Test if the found instruction is a reduction, and if not return an invalid 6920 // cost specifying the parent to use the original cost modelling. 6921 if (!InLoopReductionImmediateChains.count(RetI)) 6922 return None; 6923 6924 // Find the reduction this chain is a part of and calculate the basic cost of 6925 // the reduction on its own. 6926 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6927 Instruction *ReductionPhi = LastChain; 6928 while (!isa<PHINode>(ReductionPhi)) 6929 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6930 6931 const RecurrenceDescriptor &RdxDesc = 6932 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6933 6934 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6935 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6936 6937 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6938 // normal fmul instruction to the cost of the fadd reduction. 6939 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6940 BaseCost += 6941 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6942 6943 // If we're using ordered reductions then we can just return the base cost 6944 // here, since getArithmeticReductionCost calculates the full ordered 6945 // reduction cost when FP reassociation is not allowed. 6946 if (useOrderedReductions(RdxDesc)) 6947 return BaseCost; 6948 6949 // Get the operand that was not the reduction chain and match it to one of the 6950 // patterns, returning the better cost if it is found. 6951 Instruction *RedOp = RetI->getOperand(1) == LastChain 6952 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6953 : dyn_cast<Instruction>(RetI->getOperand(1)); 6954 6955 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6956 6957 Instruction *Op0, *Op1; 6958 if (RedOp && 6959 match(RedOp, 6960 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6961 match(Op0, m_ZExtOrSExt(m_Value())) && 6962 Op0->getOpcode() == Op1->getOpcode() && 6963 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6964 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6965 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6966 6967 // Matched reduce(ext(mul(ext(A), ext(B))) 6968 // Note that the extend opcodes need to all match, or if A==B they will have 6969 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6970 // which is equally fine. 
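    // On targets that provide a fused extending multiply-add reduction (for
    // instance dot-product style instructions), the single RedCost queried
    // below can be cheaper than the separate ext + mul + reduce costs it
    // replaces; that is the case this comparison is meant to catch.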
6971 bool IsUnsigned = isa<ZExtInst>(Op0); 6972 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6973 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6974 6975 InstructionCost ExtCost = 6976 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6977 TTI::CastContextHint::None, CostKind, Op0); 6978 InstructionCost MulCost = 6979 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6980 InstructionCost Ext2Cost = 6981 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6982 TTI::CastContextHint::None, CostKind, RedOp); 6983 6984 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6985 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6986 CostKind); 6987 6988 if (RedCost.isValid() && 6989 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6990 return I == RetI ? RedCost : 0; 6991 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6992 !TheLoop->isLoopInvariant(RedOp)) { 6993 // Matched reduce(ext(A)) 6994 bool IsUnsigned = isa<ZExtInst>(RedOp); 6995 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6996 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6997 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6998 CostKind); 6999 7000 InstructionCost ExtCost = 7001 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7002 TTI::CastContextHint::None, CostKind, RedOp); 7003 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7004 return I == RetI ? RedCost : 0; 7005 } else if (RedOp && 7006 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 7007 if (match(Op0, m_ZExtOrSExt(m_Value())) && 7008 Op0->getOpcode() == Op1->getOpcode() && 7009 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7010 bool IsUnsigned = isa<ZExtInst>(Op0); 7011 Type *Op0Ty = Op0->getOperand(0)->getType(); 7012 Type *Op1Ty = Op1->getOperand(0)->getType(); 7013 Type *LargestOpTy = 7014 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 7015 : Op0Ty; 7016 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 7017 7018 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 7019 // different sizes. We take the largest type as the ext to reduce, and add 7020 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 7021 InstructionCost ExtCost0 = TTI.getCastInstrCost( 7022 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 7023 TTI::CastContextHint::None, CostKind, Op0); 7024 InstructionCost ExtCost1 = TTI.getCastInstrCost( 7025 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 7026 TTI::CastContextHint::None, CostKind, Op1); 7027 InstructionCost MulCost = 7028 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7029 7030 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7031 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7032 CostKind); 7033 InstructionCost ExtraExtCost = 0; 7034 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 7035 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 7036 ExtraExtCost = TTI.getCastInstrCost( 7037 ExtraExtOp->getOpcode(), ExtType, 7038 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 7039 TTI::CastContextHint::None, CostKind, ExtraExtOp); 7040 } 7041 7042 if (RedCost.isValid() && 7043 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 7044 return I == RetI ? 
RedCost : 0; 7045 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 7046 // Matched reduce(mul()) 7047 InstructionCost MulCost = 7048 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7049 7050 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7051 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7052 CostKind); 7053 7054 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7055 return I == RetI ? RedCost : 0; 7056 } 7057 } 7058 7059 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 7060 } 7061 7062 InstructionCost 7063 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7064 ElementCount VF) { 7065 // Calculate scalar cost only. Vectorization cost should be ready at this 7066 // moment. 7067 if (VF.isScalar()) { 7068 Type *ValTy = getLoadStoreType(I); 7069 const Align Alignment = getLoadStoreAlignment(I); 7070 unsigned AS = getLoadStoreAddressSpace(I); 7071 7072 return TTI.getAddressComputationCost(ValTy) + 7073 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7074 TTI::TCK_RecipThroughput, I); 7075 } 7076 return getWideningCost(I, VF); 7077 } 7078 7079 LoopVectorizationCostModel::VectorizationCostTy 7080 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7081 ElementCount VF) { 7082 // If we know that this instruction will remain uniform, check the cost of 7083 // the scalar version. 7084 if (isUniformAfterVectorization(I, VF)) 7085 VF = ElementCount::getFixed(1); 7086 7087 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7088 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7089 7090 // Forced scalars do not have any scalarization overhead. 7091 auto ForcedScalar = ForcedScalars.find(VF); 7092 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7093 auto InstSet = ForcedScalar->second; 7094 if (InstSet.count(I)) 7095 return VectorizationCostTy( 7096 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7097 VF.getKnownMinValue()), 7098 false); 7099 } 7100 7101 Type *VectorTy; 7102 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7103 7104 bool TypeNotScalarized = false; 7105 if (VF.isVector() && VectorTy->isVectorTy()) { 7106 unsigned NumParts = TTI.getNumberOfParts(VectorTy); 7107 if (NumParts) 7108 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 7109 else 7110 C = InstructionCost::getInvalid(); 7111 } 7112 return VectorizationCostTy(C, TypeNotScalarized); 7113 } 7114 7115 InstructionCost 7116 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7117 ElementCount VF) const { 7118 7119 // There is no mechanism yet to create a scalable scalarization loop, 7120 // so this is currently Invalid. 7121 if (VF.isScalable()) 7122 return InstructionCost::getInvalid(); 7123 7124 if (VF.isScalar()) 7125 return 0; 7126 7127 InstructionCost Cost = 0; 7128 Type *RetTy = ToVectorTy(I->getType(), VF); 7129 if (!RetTy->isVoidTy() && 7130 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7131 Cost += TTI.getScalarizationOverhead( 7132 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 7133 false); 7134 7135 // Some targets keep addresses scalar. 7136 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7137 return Cost; 7138 7139 // Some targets support efficient element stores. 7140 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7141 return Cost; 7142 7143 // Collect operands to consider. 7144 CallInst *CI = dyn_cast<CallInst>(I); 7145 Instruction::op_range Ops = CI ? 
CI->args() : I->operands(); 7146 7147 // Skip operands that do not require extraction/scalarization and do not incur 7148 // any overhead. 7149 SmallVector<Type *> Tys; 7150 for (auto *V : filterExtractingOperands(Ops, VF)) 7151 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7152 return Cost + TTI.getOperandsScalarizationOverhead( 7153 filterExtractingOperands(Ops, VF), Tys); 7154 } 7155 7156 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7157 if (VF.isScalar()) 7158 return; 7159 NumPredStores = 0; 7160 for (BasicBlock *BB : TheLoop->blocks()) { 7161 // For each instruction in the old loop. 7162 for (Instruction &I : *BB) { 7163 Value *Ptr = getLoadStorePointerOperand(&I); 7164 if (!Ptr) 7165 continue; 7166 7167 // TODO: We should generate better code and update the cost model for 7168 // predicated uniform stores. Today they are treated as any other 7169 // predicated store (see added test cases in 7170 // invariant-store-vectorization.ll). 7171 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7172 NumPredStores++; 7173 7174 if (Legal->isUniformMemOp(I)) { 7175 // TODO: Avoid replicating loads and stores instead of 7176 // relying on instcombine to remove them. 7177 // Load: Scalar load + broadcast 7178 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7179 InstructionCost Cost; 7180 if (isa<StoreInst>(&I) && VF.isScalable() && 7181 isLegalGatherOrScatter(&I)) { 7182 Cost = getGatherScatterCost(&I, VF); 7183 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 7184 } else { 7185 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 7186 "Cannot yet scalarize uniform stores"); 7187 Cost = getUniformMemOpCost(&I, VF); 7188 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7189 } 7190 continue; 7191 } 7192 7193 // We assume that widening is the best solution when possible. 7194 if (memoryInstructionCanBeWidened(&I, VF)) { 7195 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7196 int ConsecutiveStride = Legal->isConsecutivePtr( 7197 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 7198 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7199 "Expected consecutive stride."); 7200 InstWidening Decision = 7201 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7202 setWideningDecision(&I, VF, Decision, Cost); 7203 continue; 7204 } 7205 7206 // Choose between Interleaving, Gather/Scatter or Scalarization. 7207 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7208 unsigned NumAccesses = 1; 7209 if (isAccessInterleaved(&I)) { 7210 auto Group = getInterleavedAccessGroup(&I); 7211 assert(Group && "Fail to get an interleaved access group."); 7212 7213 // Make one decision for the whole group. 7214 if (getWideningDecision(&I, VF) != CM_Unknown) 7215 continue; 7216 7217 NumAccesses = Group->getNumMembers(); 7218 if (interleavedAccessCanBeWidened(&I, VF)) 7219 InterleaveCost = getInterleaveGroupCost(&I, VF); 7220 } 7221 7222 InstructionCost GatherScatterCost = 7223 isLegalGatherOrScatter(&I) 7224 ? getGatherScatterCost(&I, VF) * NumAccesses 7225 : InstructionCost::getInvalid(); 7226 7227 InstructionCost ScalarizationCost = 7228 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7229 7230 // Choose better solution for the current VF, 7231 // write down this decision and use it during vectorization. 
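      // For instance (illustrative), with InterleaveCost = 4,
      // GatherScatterCost = 8 and ScalarizationCost = 12 the access is
      // emitted as part of its interleave group with cost 4; an Invalid cost
      // compares as more expensive than any valid one, so an infeasible
      // strategy does not win these comparisons.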
7232 InstructionCost Cost; 7233 InstWidening Decision; 7234 if (InterleaveCost <= GatherScatterCost && 7235 InterleaveCost < ScalarizationCost) { 7236 Decision = CM_Interleave; 7237 Cost = InterleaveCost; 7238 } else if (GatherScatterCost < ScalarizationCost) { 7239 Decision = CM_GatherScatter; 7240 Cost = GatherScatterCost; 7241 } else { 7242 Decision = CM_Scalarize; 7243 Cost = ScalarizationCost; 7244 } 7245 // If the instructions belongs to an interleave group, the whole group 7246 // receives the same decision. The whole group receives the cost, but 7247 // the cost will actually be assigned to one instruction. 7248 if (auto Group = getInterleavedAccessGroup(&I)) 7249 setWideningDecision(Group, VF, Decision, Cost); 7250 else 7251 setWideningDecision(&I, VF, Decision, Cost); 7252 } 7253 } 7254 7255 // Make sure that any load of address and any other address computation 7256 // remains scalar unless there is gather/scatter support. This avoids 7257 // inevitable extracts into address registers, and also has the benefit of 7258 // activating LSR more, since that pass can't optimize vectorized 7259 // addresses. 7260 if (TTI.prefersVectorizedAddressing()) 7261 return; 7262 7263 // Start with all scalar pointer uses. 7264 SmallPtrSet<Instruction *, 8> AddrDefs; 7265 for (BasicBlock *BB : TheLoop->blocks()) 7266 for (Instruction &I : *BB) { 7267 Instruction *PtrDef = 7268 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7269 if (PtrDef && TheLoop->contains(PtrDef) && 7270 getWideningDecision(&I, VF) != CM_GatherScatter) 7271 AddrDefs.insert(PtrDef); 7272 } 7273 7274 // Add all instructions used to generate the addresses. 7275 SmallVector<Instruction *, 4> Worklist; 7276 append_range(Worklist, AddrDefs); 7277 while (!Worklist.empty()) { 7278 Instruction *I = Worklist.pop_back_val(); 7279 for (auto &Op : I->operands()) 7280 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7281 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7282 AddrDefs.insert(InstOp).second) 7283 Worklist.push_back(InstOp); 7284 } 7285 7286 for (auto *I : AddrDefs) { 7287 if (isa<LoadInst>(I)) { 7288 // Setting the desired widening decision should ideally be handled in 7289 // by cost functions, but since this involves the task of finding out 7290 // if the loaded register is involved in an address computation, it is 7291 // instead changed here when we know this is the case. 7292 InstWidening Decision = getWideningDecision(I, VF); 7293 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7294 // Scalarize a widened load of address. 7295 setWideningDecision( 7296 I, VF, CM_Scalarize, 7297 (VF.getKnownMinValue() * 7298 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7299 else if (auto Group = getInterleavedAccessGroup(I)) { 7300 // Scalarize an interleave group of address loads. 7301 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7302 if (Instruction *Member = Group->getMember(I)) 7303 setWideningDecision( 7304 Member, VF, CM_Scalarize, 7305 (VF.getKnownMinValue() * 7306 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7307 } 7308 } 7309 } else 7310 // Make sure I gets scalarized and a cost estimate without 7311 // scalarization overhead. 
      ForcedScalars[VF].insert(I);
  }
}

InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
                                               Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  auto SE = PSE.getSE();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
                                                ElementCount VF) -> bool {
    if (VF.isScalar())
      return true;

    auto Scalarized = InstsToScalarize.find(VF);
    assert(Scalarized != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return !Scalarized->second.count(I) &&
           llvm::all_of(I->users(), [&](User *U) {
             auto *UI = cast<Instruction>(U);
             return !Scalarized->second.count(UI);
           });
  };
  (void) hasSingleCopyAfterVectorization;

  if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // it means we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           hasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = ToVectorTy(RetTy, VF);

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF.isVector() && BI->isConditional() &&
        (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
         PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // It is not possible to scalarize a scalable vector with predicated
      // instructions.
      if (VF.isScalable())
        return InstructionCost::getInvalid();
      // Return cost for branches around scalarized and predicated blocks.
      auto *Vec_i1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (
          TTI.getScalarizationOverhead(
              Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
          (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br, CostKind);
    else
      // This branch will be eliminated by if-conversion.
7391 return 0; 7392 // Note: We currently assume zero cost for an unconditional branch inside 7393 // a predicated block since it will become a fall-through, although we 7394 // may decide in the future to call TTI for all branches. 7395 } 7396 case Instruction::PHI: { 7397 auto *Phi = cast<PHINode>(I); 7398 7399 // First-order recurrences are replaced by vector shuffles inside the loop. 7400 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7401 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7402 return TTI.getShuffleCost( 7403 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7404 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7405 7406 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7407 // converted into select instructions. We require N - 1 selects per phi 7408 // node, where N is the number of incoming values. 7409 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7410 return (Phi->getNumIncomingValues() - 1) * 7411 TTI.getCmpSelInstrCost( 7412 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7413 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7414 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7415 7416 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7417 } 7418 case Instruction::UDiv: 7419 case Instruction::SDiv: 7420 case Instruction::URem: 7421 case Instruction::SRem: 7422 // If we have a predicated instruction, it may not be executed for each 7423 // vector lane. Get the scalarization cost and scale this amount by the 7424 // probability of executing the predicated block. If the instruction is not 7425 // predicated, we fall through to the next case. 7426 if (VF.isVector() && isScalarWithPredication(I)) { 7427 InstructionCost Cost = 0; 7428 7429 // These instructions have a non-void type, so account for the phi nodes 7430 // that we will create. This cost is likely to be zero. The phi node 7431 // cost, if any, should be scaled by the block probability because it 7432 // models a copy at the end of each predicated block. 7433 Cost += VF.getKnownMinValue() * 7434 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7435 7436 // The cost of the non-predicated instruction. 7437 Cost += VF.getKnownMinValue() * 7438 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7439 7440 // The cost of insertelement and extractelement instructions needed for 7441 // scalarization. 7442 Cost += getScalarizationOverhead(I, VF); 7443 7444 // Scale the cost by the probability of executing the predicated blocks. 7445 // This assumes the predicated block for each vector lane is equally 7446 // likely. 7447 return Cost / getReciprocalPredBlockProb(); 7448 } 7449 LLVM_FALLTHROUGH; 7450 case Instruction::Add: 7451 case Instruction::FAdd: 7452 case Instruction::Sub: 7453 case Instruction::FSub: 7454 case Instruction::Mul: 7455 case Instruction::FMul: 7456 case Instruction::FDiv: 7457 case Instruction::FRem: 7458 case Instruction::Shl: 7459 case Instruction::LShr: 7460 case Instruction::AShr: 7461 case Instruction::And: 7462 case Instruction::Or: 7463 case Instruction::Xor: { 7464 // Since we will replace the stride by 1 the multiplication should go away. 
7465 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7466 return 0; 7467 7468 // Detect reduction patterns 7469 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7470 return *RedCost; 7471 7472 // Certain instructions can be cheaper to vectorize if they have a constant 7473 // second vector operand. One example of this are shifts on x86. 7474 Value *Op2 = I->getOperand(1); 7475 TargetTransformInfo::OperandValueProperties Op2VP; 7476 TargetTransformInfo::OperandValueKind Op2VK = 7477 TTI.getOperandInfo(Op2, Op2VP); 7478 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7479 Op2VK = TargetTransformInfo::OK_UniformValue; 7480 7481 SmallVector<const Value *, 4> Operands(I->operand_values()); 7482 return TTI.getArithmeticInstrCost( 7483 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7484 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7485 } 7486 case Instruction::FNeg: { 7487 return TTI.getArithmeticInstrCost( 7488 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7489 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7490 TargetTransformInfo::OP_None, I->getOperand(0), I); 7491 } 7492 case Instruction::Select: { 7493 SelectInst *SI = cast<SelectInst>(I); 7494 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7495 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7496 7497 const Value *Op0, *Op1; 7498 using namespace llvm::PatternMatch; 7499 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7500 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7501 // select x, y, false --> x & y 7502 // select x, true, y --> x | y 7503 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7504 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7505 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7506 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7507 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7508 Op1->getType()->getScalarSizeInBits() == 1); 7509 7510 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7511 return TTI.getArithmeticInstrCost( 7512 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7513 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7514 } 7515 7516 Type *CondTy = SI->getCondition()->getType(); 7517 if (!ScalarCond) 7518 CondTy = VectorType::get(CondTy, VF); 7519 7520 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7521 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7522 Pred = Cmp->getPredicate(); 7523 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7524 CostKind, I); 7525 } 7526 case Instruction::ICmp: 7527 case Instruction::FCmp: { 7528 Type *ValTy = I->getOperand(0)->getType(); 7529 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7530 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7531 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7532 VectorTy = ToVectorTy(ValTy, VF); 7533 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7534 cast<CmpInst>(I)->getPredicate(), CostKind, 7535 I); 7536 } 7537 case Instruction::Store: 7538 case Instruction::Load: { 7539 ElementCount Width = VF; 7540 if (Width.isVector()) { 7541 InstWidening Decision = getWideningDecision(I, Width); 7542 assert(Decision != CM_Unknown && 7543 "CM decision should be taken at this point"); 7544 if (Decision == CM_Scalarize) 7545 Width = ElementCount::getFixed(1); 7546 } 7547 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7548 return getMemoryInstructionCost(I, VF); 7549 } 7550 case Instruction::BitCast: 7551 if (I->getType()->isPointerTy()) 7552 return 0; 7553 LLVM_FALLTHROUGH; 7554 case Instruction::ZExt: 7555 case Instruction::SExt: 7556 case Instruction::FPToUI: 7557 case Instruction::FPToSI: 7558 case Instruction::FPExt: 7559 case Instruction::PtrToInt: 7560 case Instruction::IntToPtr: 7561 case Instruction::SIToFP: 7562 case Instruction::UIToFP: 7563 case Instruction::Trunc: 7564 case Instruction::FPTrunc: { 7565 // Computes the CastContextHint from a Load/Store instruction. 7566 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7567 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7568 "Expected a load or a store!"); 7569 7570 if (VF.isScalar() || !TheLoop->contains(I)) 7571 return TTI::CastContextHint::Normal; 7572 7573 switch (getWideningDecision(I, VF)) { 7574 case LoopVectorizationCostModel::CM_GatherScatter: 7575 return TTI::CastContextHint::GatherScatter; 7576 case LoopVectorizationCostModel::CM_Interleave: 7577 return TTI::CastContextHint::Interleave; 7578 case LoopVectorizationCostModel::CM_Scalarize: 7579 case LoopVectorizationCostModel::CM_Widen: 7580 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7581 : TTI::CastContextHint::Normal; 7582 case LoopVectorizationCostModel::CM_Widen_Reverse: 7583 return TTI::CastContextHint::Reversed; 7584 case LoopVectorizationCostModel::CM_Unknown: 7585 llvm_unreachable("Instr did not go through cost modelling?"); 7586 } 7587 7588 llvm_unreachable("Unhandled case!"); 7589 }; 7590 7591 unsigned Opcode = I->getOpcode(); 7592 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7593 // For Trunc, the context is the only user, which must be a StoreInst. 7594 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7595 if (I->hasOneUse()) 7596 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7597 CCH = ComputeCCH(Store); 7598 } 7599 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7600 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7601 Opcode == Instruction::FPExt) { 7602 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7603 CCH = ComputeCCH(Load); 7604 } 7605 7606 // We optimize the truncation of induction variables having constant 7607 // integer steps. The cost of these truncations is the same as the scalar 7608 // operation. 7609 if (isOptimizableIVTruncate(I, VF)) { 7610 auto *Trunc = cast<TruncInst>(I); 7611 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7612 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7613 } 7614 7615 // Detect reduction patterns 7616 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7617 return *RedCost; 7618 7619 Type *SrcScalarTy = I->getOperand(0)->getType(); 7620 Type *SrcVecTy = 7621 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7622 if (canTruncateToMinimalBitwidth(I, VF)) { 7623 // This cast is going to be shrunk. This may remove the cast or it might 7624 // turn it into slightly different cast. For example, if MinBW == 16, 7625 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7626 // 7627 // Calculate the modified src and dest types. 7628 Type *MinVecTy = VectorTy; 7629 if (Opcode == Instruction::Trunc) { 7630 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7631 VectorTy = 7632 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7633 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7634 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7635 VectorTy = 7636 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7637 } 7638 } 7639 7640 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7641 } 7642 case Instruction::Call: { 7643 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7644 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7645 return *RedCost; 7646 bool NeedToScalarize; 7647 CallInst *CI = cast<CallInst>(I); 7648 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7649 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7650 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7651 return std::min(CallCost, IntrinsicCost); 7652 } 7653 return CallCost; 7654 } 7655 case Instruction::ExtractValue: 7656 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7657 case Instruction::Alloca: 7658 // We cannot easily widen alloca to a scalable alloca, as 7659 // the result would need to be a vector of pointers. 7660 if (VF.isScalable()) 7661 return InstructionCost::getInvalid(); 7662 LLVM_FALLTHROUGH; 7663 default: 7664 // This opcode is unknown. Assume that it is the same as 'mul'. 7665 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7666 } // end of switch. 
7667 } 7668 7669 char LoopVectorize::ID = 0; 7670 7671 static const char lv_name[] = "Loop Vectorization"; 7672 7673 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7674 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7675 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7676 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7677 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7678 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7679 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7680 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7681 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7682 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7683 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7684 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7685 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7686 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7687 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7688 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7689 7690 namespace llvm { 7691 7692 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7693 7694 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7695 bool VectorizeOnlyWhenForced) { 7696 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7697 } 7698 7699 } // end namespace llvm 7700 7701 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7702 // Check if the pointer operand of a load or store instruction is 7703 // consecutive. 7704 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7705 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7706 return false; 7707 } 7708 7709 void LoopVectorizationCostModel::collectValuesToIgnore() { 7710 // Ignore ephemeral values. 7711 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7712 7713 // Ignore type-promoting instructions we identified during reduction 7714 // detection. 7715 for (auto &Reduction : Legal->getReductionVars()) { 7716 const RecurrenceDescriptor &RedDes = Reduction.second; 7717 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7718 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7719 } 7720 // Ignore type-casting instructions we identified during induction 7721 // detection. 7722 for (auto &Induction : Legal->getInductionVars()) { 7723 const InductionDescriptor &IndDes = Induction.second; 7724 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7725 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7726 } 7727 } 7728 7729 void LoopVectorizationCostModel::collectInLoopReductions() { 7730 for (auto &Reduction : Legal->getReductionVars()) { 7731 PHINode *Phi = Reduction.first; 7732 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7733 7734 // We don't collect reductions that are type promoted (yet). 7735 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7736 continue; 7737 7738 // If the target would prefer this reduction to happen "in-loop", then we 7739 // want to record it as such. 7740 unsigned Opcode = RdxDesc.getOpcode(); 7741 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7742 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7743 TargetTransformInfo::ReductionFlags())) 7744 continue; 7745 7746 // Check that we can correctly put the reductions into the loop, by 7747 // finding the chain of operations that leads from the phi to the loop 7748 // exit value. 
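    // For a simple sum such as `s += a[i]` (illustrative) the chain is just
    // the single add; if no usable chain is found, the list below comes back
    // empty and the reduction stays out-of-loop.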
7749 SmallVector<Instruction *, 4> ReductionOperations = 7750 RdxDesc.getReductionOpChain(Phi, TheLoop); 7751 bool InLoop = !ReductionOperations.empty(); 7752 if (InLoop) { 7753 InLoopReductionChains[Phi] = ReductionOperations; 7754 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7755 Instruction *LastChain = Phi; 7756 for (auto *I : ReductionOperations) { 7757 InLoopReductionImmediateChains[I] = LastChain; 7758 LastChain = I; 7759 } 7760 } 7761 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7762 << " reduction for phi: " << *Phi << "\n"); 7763 } 7764 } 7765 7766 // TODO: we could return a pair of values that specify the max VF and 7767 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7768 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7769 // doesn't have a cost model that can choose which plan to execute if 7770 // more than one is generated. 7771 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7772 LoopVectorizationCostModel &CM) { 7773 unsigned WidestType; 7774 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7775 return WidestVectorRegBits / WidestType; 7776 } 7777 7778 VectorizationFactor 7779 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7780 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7781 ElementCount VF = UserVF; 7782 // Outer loop handling: They may require CFG and instruction level 7783 // transformations before even evaluating whether vectorization is profitable. 7784 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7785 // the vectorization pipeline. 7786 if (!OrigLoop->isInnermost()) { 7787 // If the user doesn't provide a vectorization factor, determine a 7788 // reasonable one. 7789 if (UserVF.isZero()) { 7790 VF = ElementCount::getFixed(determineVPlanVF( 7791 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7792 .getFixedSize(), 7793 CM)); 7794 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7795 7796 // Make sure we have a VF > 1 for stress testing. 7797 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7798 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7799 << "overriding computed VF.\n"); 7800 VF = ElementCount::getFixed(4); 7801 } 7802 } 7803 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7804 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7805 "VF needs to be a power of two"); 7806 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7807 << "VF " << VF << " to build VPlans.\n"); 7808 buildVPlans(VF, VF); 7809 7810 // For VPlan build stress testing, we bail out after VPlan construction. 7811 if (VPlanBuildStressTest) 7812 return VectorizationFactor::Disabled(); 7813 7814 return {VF, 0 /*Cost*/}; 7815 } 7816 7817 LLVM_DEBUG( 7818 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7819 "VPlan-native path.\n"); 7820 return VectorizationFactor::Disabled(); 7821 } 7822 7823 Optional<VectorizationFactor> 7824 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7825 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7826 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7827 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7828 return None; 7829 7830 // Invalidate interleave groups if all blocks of loop will be predicated. 
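  // When the header itself needs predication (e.g. when folding the tail by
  // masking), every member of an interleave group would have to be emitted as
  // a masked interleaved access, so without target support for
  // masked-interleaved accesses the groups cannot be kept.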
7831 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7832 !useMaskedInterleavedAccesses(*TTI)) { 7833 LLVM_DEBUG( 7834 dbgs() 7835 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7836 "which requires masked-interleaved support.\n"); 7837 if (CM.InterleaveInfo.invalidateGroups()) 7838 // Invalidating interleave groups also requires invalidating all decisions 7839 // based on them, which includes widening decisions and uniform and scalar 7840 // values. 7841 CM.invalidateCostModelingDecisions(); 7842 } 7843 7844 ElementCount MaxUserVF = 7845 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7846 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7847 if (!UserVF.isZero() && UserVFIsLegal) { 7848 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7849 "VF needs to be a power of two"); 7850 // Collect the instructions (and their associated costs) that will be more 7851 // profitable to scalarize. 7852 if (CM.selectUserVectorizationFactor(UserVF)) { 7853 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7854 CM.collectInLoopReductions(); 7855 buildVPlansWithVPRecipes(UserVF, UserVF); 7856 LLVM_DEBUG(printPlans(dbgs())); 7857 return {{UserVF, 0}}; 7858 } else 7859 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7860 "InvalidCost", ORE, OrigLoop); 7861 } 7862 7863 // Populate the set of Vectorization Factor Candidates. 7864 ElementCountSet VFCandidates; 7865 for (auto VF = ElementCount::getFixed(1); 7866 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7867 VFCandidates.insert(VF); 7868 for (auto VF = ElementCount::getScalable(1); 7869 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7870 VFCandidates.insert(VF); 7871 7872 for (const auto &VF : VFCandidates) { 7873 // Collect Uniform and Scalar instructions after vectorization with VF. 7874 CM.collectUniformsAndScalars(VF); 7875 7876 // Collect the instructions (and their associated costs) that will be more 7877 // profitable to scalarize. 7878 if (VF.isVector()) 7879 CM.collectInstsToScalarize(VF); 7880 } 7881 7882 CM.collectInLoopReductions(); 7883 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7884 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7885 7886 LLVM_DEBUG(printPlans(dbgs())); 7887 if (!MaxFactors.hasVector()) 7888 return VectorizationFactor::Disabled(); 7889 7890 // Select the optimal vectorization factor. 7891 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 7892 7893 // Check if it is profitable to vectorize with runtime checks. 
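  // Emitting too many runtime pointer checks can outweigh the benefit of
  // vectorizing, so bail out (with a remark) once the thresholds checked
  // below are exceeded.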
7894 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7895 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 7896 bool PragmaThresholdReached = 7897 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7898 bool ThresholdReached = 7899 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 7900 if ((ThresholdReached && !Hints.allowReordering()) || 7901 PragmaThresholdReached) { 7902 ORE->emit([&]() { 7903 return OptimizationRemarkAnalysisAliasing( 7904 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 7905 OrigLoop->getHeader()) 7906 << "loop not vectorized: cannot prove it is safe to reorder " 7907 "memory operations"; 7908 }); 7909 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 7910 Hints.emitRemarkWithHints(); 7911 return VectorizationFactor::Disabled(); 7912 } 7913 } 7914 return SelectedVF; 7915 } 7916 7917 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7918 assert(count_if(VPlans, 7919 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7920 1 && 7921 "Best VF has not a single VPlan."); 7922 7923 for (const VPlanPtr &Plan : VPlans) { 7924 if (Plan->hasVF(VF)) 7925 return *Plan.get(); 7926 } 7927 llvm_unreachable("No plan found!"); 7928 } 7929 7930 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7931 VPlan &BestVPlan, 7932 InnerLoopVectorizer &ILV, 7933 DominatorTree *DT) { 7934 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7935 << '\n'); 7936 7937 // Perform the actual loop transformation. 7938 7939 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7940 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7941 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7942 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7943 State.CanonicalIV = ILV.Induction; 7944 ILV.collectPoisonGeneratingRecipes(State); 7945 7946 ILV.printDebugTracesAtStart(); 7947 7948 //===------------------------------------------------===// 7949 // 7950 // Notice: any optimization or new instruction that go 7951 // into the code below should also be implemented in 7952 // the cost-model. 7953 // 7954 //===------------------------------------------------===// 7955 7956 // 2. Copy and widen instructions from the old loop into the new loop. 7957 BestVPlan.execute(&State); 7958 7959 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7960 // predication, updating analyses. 
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
  for (const auto &Plan : VPlans)
    if (PrintVPlansInDotFormat)
      Plan->printDOT(O);
    else
      Plan->print(O);
}
#endif

void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {

  // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if they are only used by the
  // terminator.
  SmallVector<BasicBlock*> ExitingBlocks;
  OrigLoop->getExitingBlocks(ExitingBlocks);
  for (auto *BB : ExitingBlocks) {
    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
    if (!Cmp || !Cmp->hasOneUse())
      continue;

    // TODO: we should introduce a getUniqueExitingBlocks on Loop
    if (!DeadInstructions.insert(Cmp).second)
      continue;

    // The operands of the icmp are often dead truncs, used by IndUpdate.
    // TODO: can recurse through operands in general
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
    }
  }

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  auto *Latch = OrigLoop->getLoopLatch();
  for (auto &Induction : Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
    if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
      continue;

    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
        }))
      DeadInstructions.insert(IndUpdate);
  }
}

Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx,
                                        Value *Step,
                                        Instruction::BinaryOps BinOp) {
  // When unrolling and the VF is 1, we only need to add a simple scalar.
  Type *Ty = Val->getType();
  assert(!Ty->isVectorTy() && "Val must be a scalar");

  if (Ty->isFloatingPointTy()) {
    // Floating-point operations inherit FMF via the builder's flags.
    Value *MulOp = Builder.CreateFMul(StartIdx, Step);
    return Builder.CreateBinOp(BinOp, Val, MulOp);
  }
  return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction");
}

static void AddRuntimeUnrollDisableMetaData(Loop *L) {
  SmallVector<Metadata *, 4> MDs;
  // Reserve first location for self reference to the LoopID metadata node.
  MDs.push_back(nullptr);
  bool IsUnrollMetadata = false;
  MDNode *LoopID = L->getLoopID();
  if (LoopID) {
    // First find existing loop unrolling disable metadata.
8048 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8049 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8050 if (MD) { 8051 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8052 IsUnrollMetadata = 8053 S && S->getString().startswith("llvm.loop.unroll.disable"); 8054 } 8055 MDs.push_back(LoopID->getOperand(i)); 8056 } 8057 } 8058 8059 if (!IsUnrollMetadata) { 8060 // Add runtime unroll disable metadata. 8061 LLVMContext &Context = L->getHeader()->getContext(); 8062 SmallVector<Metadata *, 1> DisableOperands; 8063 DisableOperands.push_back( 8064 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8065 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8066 MDs.push_back(DisableNode); 8067 MDNode *NewLoopID = MDNode::get(Context, MDs); 8068 // Set operand 0 to refer to the loop id itself. 8069 NewLoopID->replaceOperandWith(0, NewLoopID); 8070 L->setLoopID(NewLoopID); 8071 } 8072 } 8073 8074 //===--------------------------------------------------------------------===// 8075 // EpilogueVectorizerMainLoop 8076 //===--------------------------------------------------------------------===// 8077 8078 /// This function is partially responsible for generating the control flow 8079 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8080 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8081 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8082 Loop *Lp = createVectorLoopSkeleton(""); 8083 8084 // Generate the code to check the minimum iteration count of the vector 8085 // epilogue (see below). 8086 EPI.EpilogueIterationCountCheck = 8087 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8088 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8089 8090 // Generate the code to check any assumptions that we've made for SCEV 8091 // expressions. 8092 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8093 8094 // Generate the code that checks at runtime if arrays overlap. We put the 8095 // checks into a separate block to make the more common case of few elements 8096 // faster. 8097 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8098 8099 // Generate the iteration count check for the main loop, *after* the check 8100 // for the epilogue loop, so that the path-length is shorter for the case 8101 // that goes directly through the vector epilogue. The longer-path length for 8102 // the main loop is compensated for, by the gain from vectorizing the larger 8103 // trip count. Note: the branch will get updated later on when we vectorize 8104 // the epilogue. 8105 EPI.MainLoopIterationCountCheck = 8106 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8107 8108 // Generate the induction variable. 8109 OldInduction = Legal->getPrimaryInduction(); 8110 Type *IdxTy = Legal->getWidestInductionType(); 8111 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8112 8113 IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt()); 8114 Value *Step = getRuntimeVF(B, IdxTy, VF * UF); 8115 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8116 EPI.VectorTripCount = CountRoundDown; 8117 Induction = 8118 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8119 getDebugLocFromInstOrOperands(OldInduction)); 8120 8121 // Skip induction resume value creation here because they will be created in 8122 // the second pass. 
If we created them here, they wouldn't be used anyway, 8123 // because the vplan in the second pass still contains the inductions from the 8124 // original loop. 8125 8126 return completeLoopSkeleton(Lp, OrigLoopID); 8127 } 8128 8129 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8130 LLVM_DEBUG({ 8131 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8132 << "Main Loop VF:" << EPI.MainLoopVF 8133 << ", Main Loop UF:" << EPI.MainLoopUF 8134 << ", Epilogue Loop VF:" << EPI.EpilogueVF 8135 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8136 }); 8137 } 8138 8139 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8140 DEBUG_WITH_TYPE(VerboseDebug, { 8141 dbgs() << "intermediate fn:\n" 8142 << *OrigLoop->getHeader()->getParent() << "\n"; 8143 }); 8144 } 8145 8146 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8147 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8148 assert(L && "Expected valid Loop."); 8149 assert(Bypass && "Expected valid bypass basic block."); 8150 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 8151 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8152 Value *Count = getOrCreateTripCount(L); 8153 // Reuse existing vector loop preheader for TC checks. 8154 // Note that new preheader block is generated for vector loop. 8155 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8156 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8157 8158 // Generate code to check if the loop's trip count is less than VF * UF of the 8159 // main vector loop. 8160 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8161 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8162 8163 Value *CheckMinIters = Builder.CreateICmp( 8164 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 8165 "min.iters.check"); 8166 8167 if (!ForEpilogue) 8168 TCCheckBlock->setName("vector.main.loop.iter.check"); 8169 8170 // Create new preheader for vector loop. 8171 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8172 DT, LI, nullptr, "vector.ph"); 8173 8174 if (ForEpilogue) { 8175 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8176 DT->getNode(Bypass)->getIDom()) && 8177 "TC check is expected to dominate Bypass"); 8178 8179 // Update dominator for Bypass & LoopExit. 8180 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8181 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8182 // For loops with multiple exits, there's no edge from the middle block 8183 // to exit blocks (as the epilogue must run) and thus no need to update 8184 // the immediate dominator of the exit blocks. 8185 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8186 8187 LoopBypassBlocks.push_back(TCCheckBlock); 8188 8189 // Save the trip count so we don't have to regenerate it in the 8190 // vec.epilog.iter.check. This is safe to do because the trip count 8191 // generated here dominates the vector epilog iter check. 
8192 EPI.TripCount = Count; 8193 } 8194 8195 ReplaceInstWithInst( 8196 TCCheckBlock->getTerminator(), 8197 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8198 8199 return TCCheckBlock; 8200 } 8201 8202 //===--------------------------------------------------------------------===// 8203 // EpilogueVectorizerEpilogueLoop 8204 //===--------------------------------------------------------------------===// 8205 8206 /// This function is partially responsible for generating the control flow 8207 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8208 BasicBlock * 8209 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8210 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8211 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8212 8213 // Now, compare the remaining count and if there aren't enough iterations to 8214 // execute the vectorized epilogue skip to the scalar part. 8215 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8216 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8217 LoopVectorPreHeader = 8218 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8219 LI, nullptr, "vec.epilog.ph"); 8220 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8221 VecEpilogueIterationCountCheck); 8222 8223 // Adjust the control flow taking the state info from the main loop 8224 // vectorization into account. 8225 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8226 "expected this to be saved from the previous pass."); 8227 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8228 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8229 8230 DT->changeImmediateDominator(LoopVectorPreHeader, 8231 EPI.MainLoopIterationCountCheck); 8232 8233 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8234 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8235 8236 if (EPI.SCEVSafetyCheck) 8237 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8238 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8239 if (EPI.MemSafetyCheck) 8240 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8241 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8242 8243 DT->changeImmediateDominator( 8244 VecEpilogueIterationCountCheck, 8245 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8246 8247 DT->changeImmediateDominator(LoopScalarPreHeader, 8248 EPI.EpilogueIterationCountCheck); 8249 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8250 // If there is an epilogue which must run, there's no edge from the 8251 // middle block to exit blocks and thus no need to update the immediate 8252 // dominator of the exit blocks. 8253 DT->changeImmediateDominator(LoopExitBlock, 8254 EPI.EpilogueIterationCountCheck); 8255 8256 // Keep track of bypass blocks, as they feed start values to the induction 8257 // phis in the scalar loop preheader. 
  if (EPI.SCEVSafetyCheck)
    LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
  if (EPI.MemSafetyCheck)
    LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
  LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate the induction variable.
  OldInduction = Legal->getPrimaryInduction();
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
  Value *StartIdx = EPResumeVal;
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to the iteration
  // count check, the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues(Lp, CountRoundDown,
                              {VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  AddRuntimeUnrollDisableMetaData(Lp);
  return completeLoopSkeleton(Lp, OrigLoopID);
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of the
  // vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8314 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8315 8316 Value *CheckMinIters = 8317 Builder.CreateICmp(P, Count, 8318 createStepForVF(Builder, Count->getType(), 8319 EPI.EpilogueVF, EPI.EpilogueUF), 8320 "min.epilog.iters.check"); 8321 8322 ReplaceInstWithInst( 8323 Insert->getTerminator(), 8324 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8325 8326 LoopBypassBlocks.push_back(Insert); 8327 return Insert; 8328 } 8329 8330 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8331 LLVM_DEBUG({ 8332 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8333 << "Epilogue Loop VF:" << EPI.EpilogueVF 8334 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8335 }); 8336 } 8337 8338 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8339 DEBUG_WITH_TYPE(VerboseDebug, { 8340 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8341 }); 8342 } 8343 8344 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8345 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8346 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8347 bool PredicateAtRangeStart = Predicate(Range.Start); 8348 8349 for (ElementCount TmpVF = Range.Start * 2; 8350 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8351 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8352 Range.End = TmpVF; 8353 break; 8354 } 8355 8356 return PredicateAtRangeStart; 8357 } 8358 8359 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8360 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8361 /// of VF's starting at a given VF and extending it as much as possible. Each 8362 /// vectorization decision can potentially shorten this sub-range during 8363 /// buildVPlan(). 8364 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8365 ElementCount MaxVF) { 8366 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8367 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8368 VFRange SubRange = {VF, MaxVFPlusOne}; 8369 VPlans.push_back(buildVPlan(SubRange)); 8370 VF = SubRange.End; 8371 } 8372 } 8373 8374 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8375 VPlanPtr &Plan) { 8376 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8377 8378 // Look for cached value. 8379 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8380 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8381 if (ECEntryIt != EdgeMaskCache.end()) 8382 return ECEntryIt->second; 8383 8384 VPValue *SrcMask = createBlockInMask(Src, Plan); 8385 8386 // The terminator has to be a branch inst! 8387 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8388 assert(BI && "Unexpected terminator found"); 8389 8390 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8391 return EdgeMaskCache[Edge] = SrcMask; 8392 8393 // If source is an exiting block, we know the exit edge is dynamically dead 8394 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8395 // adding uses of an otherwise potentially dead instruction. 8396 if (OrigLoop->isLoopExiting(Src)) 8397 return EdgeMaskCache[Edge] = SrcMask; 8398 8399 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8400 assert(EdgeMask && "No Edge Mask found for condition"); 8401 8402 if (BI->getSuccessor(0) != Dst) 8403 EdgeMask = Builder.createNot(EdgeMask); 8404 8405 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 
8406 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8407 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8408 // The select version does not introduce new UB if SrcMask is false and 8409 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8410 VPValue *False = Plan->getOrAddVPValue( 8411 ConstantInt::getFalse(BI->getCondition()->getType())); 8412 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8413 } 8414 8415 return EdgeMaskCache[Edge] = EdgeMask; 8416 } 8417 8418 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8419 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8420 8421 // Look for cached value. 8422 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8423 if (BCEntryIt != BlockMaskCache.end()) 8424 return BCEntryIt->second; 8425 8426 // All-one mask is modelled as no-mask following the convention for masked 8427 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8428 VPValue *BlockMask = nullptr; 8429 8430 if (OrigLoop->getHeader() == BB) { 8431 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8432 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8433 8434 // Create the block in mask as the first non-phi instruction in the block. 8435 VPBuilder::InsertPointGuard Guard(Builder); 8436 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8437 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8438 8439 // Introduce the early-exit compare IV <= BTC to form header block mask. 8440 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8441 // Start by constructing the desired canonical IV. 8442 VPValue *IV = nullptr; 8443 if (Legal->getPrimaryInduction()) 8444 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8445 else { 8446 auto *IVRecipe = new VPWidenCanonicalIVRecipe(); 8447 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8448 IV = IVRecipe; 8449 } 8450 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8451 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8452 8453 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8454 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8455 // as a second argument, we only pass the IV here and extract the 8456 // tripcount from the transform state where codegen of the VP instructions 8457 // happen. 8458 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8459 } else { 8460 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8461 } 8462 return BlockMaskCache[BB] = BlockMask; 8463 } 8464 8465 // This is the block mask. We OR all incoming edges. 8466 for (auto *Predecessor : predecessors(BB)) { 8467 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8468 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8469 return BlockMaskCache[BB] = EdgeMask; 8470 8471 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8472 BlockMask = EdgeMask; 8473 continue; 8474 } 8475 8476 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8477 } 8478 8479 return BlockMaskCache[BB] = BlockMask; 8480 } 8481 8482 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8483 ArrayRef<VPValue *> Operands, 8484 VFRange &Range, 8485 VPlanPtr &Plan) { 8486 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8487 "Must be called with either a load or store"); 8488 8489 auto willWiden = [&](ElementCount VF) -> bool { 8490 if (VF.isScalar()) 8491 return false; 8492 LoopVectorizationCostModel::InstWidening Decision = 8493 CM.getWideningDecision(I, VF); 8494 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8495 "CM decision should be taken at this point."); 8496 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8497 return true; 8498 if (CM.isScalarAfterVectorization(I, VF) || 8499 CM.isProfitableToScalarize(I, VF)) 8500 return false; 8501 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8502 }; 8503 8504 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8505 return nullptr; 8506 8507 VPValue *Mask = nullptr; 8508 if (Legal->isMaskRequired(I)) 8509 Mask = createBlockInMask(I->getParent(), Plan); 8510 8511 // Determine if the pointer operand of the access is either consecutive or 8512 // reverse consecutive. 8513 LoopVectorizationCostModel::InstWidening Decision = 8514 CM.getWideningDecision(I, Range.Start); 8515 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8516 bool Consecutive = 8517 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8518 8519 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8520 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8521 Consecutive, Reverse); 8522 8523 StoreInst *Store = cast<StoreInst>(I); 8524 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8525 Mask, Consecutive, Reverse); 8526 } 8527 8528 VPWidenIntOrFpInductionRecipe * 8529 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8530 ArrayRef<VPValue *> Operands) const { 8531 // Check if this is an integer or fp induction. If so, build the recipe that 8532 // produces its scalar and vector values. 8533 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) { 8534 assert(II->getStartValue() == 8535 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8536 return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II); 8537 } 8538 8539 return nullptr; 8540 } 8541 8542 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8543 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8544 VPlan &Plan) const { 8545 // Optimize the special case where the source is a constant integer 8546 // induction variable. Notice that we can only optimize the 'trunc' case 8547 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8548 // (c) other casts depend on pointer size. 8549 8550 // Determine whether \p K is a truncation based on an induction variable that 8551 // can be optimized. 
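  // The returned predicate is evaluated per VF by getDecisionAndClampRange,
  // which clamps Range.End to the first VF where the answer differs from the
  // answer at Range.Start, so one recipe covers the whole remaining sub-range.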
8552 auto isOptimizableIVTruncate = 8553 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8554 return [=](ElementCount VF) -> bool { 8555 return CM.isOptimizableIVTruncate(K, VF); 8556 }; 8557 }; 8558 8559 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8560 isOptimizableIVTruncate(I), Range)) { 8561 8562 auto *Phi = cast<PHINode>(I->getOperand(0)); 8563 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8564 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8565 return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I); 8566 } 8567 return nullptr; 8568 } 8569 8570 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8571 ArrayRef<VPValue *> Operands, 8572 VPlanPtr &Plan) { 8573 // If all incoming values are equal, the incoming VPValue can be used directly 8574 // instead of creating a new VPBlendRecipe. 8575 VPValue *FirstIncoming = Operands[0]; 8576 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8577 return FirstIncoming == Inc; 8578 })) { 8579 return Operands[0]; 8580 } 8581 8582 // We know that all PHIs in non-header blocks are converted into selects, so 8583 // we don't have to worry about the insertion order and we can just use the 8584 // builder. At this point we generate the predication tree. There may be 8585 // duplications since this is a simple recursive scan, but future 8586 // optimizations will clean it up. 8587 SmallVector<VPValue *, 2> OperandsWithMask; 8588 unsigned NumIncoming = Phi->getNumIncomingValues(); 8589 8590 for (unsigned In = 0; In < NumIncoming; In++) { 8591 VPValue *EdgeMask = 8592 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8593 assert((EdgeMask || NumIncoming == 1) && 8594 "Multiple predecessors with one having a full mask"); 8595 OperandsWithMask.push_back(Operands[In]); 8596 if (EdgeMask) 8597 OperandsWithMask.push_back(EdgeMask); 8598 } 8599 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8600 } 8601 8602 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8603 ArrayRef<VPValue *> Operands, 8604 VFRange &Range) const { 8605 8606 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8607 [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, 8608 Range); 8609 8610 if (IsPredicated) 8611 return nullptr; 8612 8613 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8614 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8615 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8616 ID == Intrinsic::pseudoprobe || 8617 ID == Intrinsic::experimental_noalias_scope_decl)) 8618 return nullptr; 8619 8620 auto willWiden = [&](ElementCount VF) -> bool { 8621 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8622 // The following case may be scalarized depending on the VF. 8623 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8624 // version of the instruction. 8625 // Is it beneficial to perform intrinsic call compared to lib call? 8626 bool NeedToScalarize = false; 8627 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8628 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8629 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8630 return UseVectorIntrinsic || !NeedToScalarize; 8631 }; 8632 8633 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8634 return nullptr; 8635 8636 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8637 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8638 } 8639 8640 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8641 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8642 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8643 // Instruction should be widened, unless it is scalar after vectorization, 8644 // scalarization is profitable or it is predicated. 8645 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8646 return CM.isScalarAfterVectorization(I, VF) || 8647 CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); 8648 }; 8649 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8650 Range); 8651 } 8652 8653 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8654 ArrayRef<VPValue *> Operands) const { 8655 auto IsVectorizableOpcode = [](unsigned Opcode) { 8656 switch (Opcode) { 8657 case Instruction::Add: 8658 case Instruction::And: 8659 case Instruction::AShr: 8660 case Instruction::BitCast: 8661 case Instruction::FAdd: 8662 case Instruction::FCmp: 8663 case Instruction::FDiv: 8664 case Instruction::FMul: 8665 case Instruction::FNeg: 8666 case Instruction::FPExt: 8667 case Instruction::FPToSI: 8668 case Instruction::FPToUI: 8669 case Instruction::FPTrunc: 8670 case Instruction::FRem: 8671 case Instruction::FSub: 8672 case Instruction::ICmp: 8673 case Instruction::IntToPtr: 8674 case Instruction::LShr: 8675 case Instruction::Mul: 8676 case Instruction::Or: 8677 case Instruction::PtrToInt: 8678 case Instruction::SDiv: 8679 case Instruction::Select: 8680 case Instruction::SExt: 8681 case Instruction::Shl: 8682 case Instruction::SIToFP: 8683 case Instruction::SRem: 8684 case Instruction::Sub: 8685 case Instruction::Trunc: 8686 case Instruction::UDiv: 8687 case Instruction::UIToFP: 8688 case Instruction::URem: 8689 case Instruction::Xor: 8690 case Instruction::ZExt: 8691 return true; 8692 } 8693 return false; 8694 }; 8695 8696 if (!IsVectorizableOpcode(I->getOpcode())) 8697 return nullptr; 8698 8699 // Success: widen this instruction. 
  return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
}

void VPRecipeBuilder::fixHeaderPhis() {
  BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
  for (VPWidenPHIRecipe *R : PhisToFix) {
    auto *PN = cast<PHINode>(R->getUnderlyingValue());
    VPRecipeBase *IncR =
        getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
    R->addOperand(IncR->getVPSingleValue());
  }
}

VPBasicBlock *VPRecipeBuilder::handleReplication(
    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
    VPlanPtr &Plan) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); },
      Range);

  // Even if the instruction is not marked as uniform, there are certain
  // intrinsic calls that can be effectively treated as such, so we check for
  // them here. Conservatively, we only do this for scalable vectors, since
  // for fixed-width VFs we can always fall back on full scalarization.
  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
    switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
    case Intrinsic::assume:
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // For scalable vectors if one of the operands is variant then we still
      // want to mark as uniform, which will generate one instruction for just
      // the first lane of the vector. We can't scalarize the call in the same
      // way as for fixed-width vectors because we don't know how many lanes
      // there are.
      //
      // The reasons for doing it this way for scalable vectors are:
      // 1. For the assume intrinsic generating the instruction for the first
      //    lane is still better than not generating any at all. For
      //    example, the input may be a splat across all lanes.
      // 2. For the lifetime start/end intrinsics the pointer operand only
      //    does anything useful when the input comes from a stack object,
      //    which suggests it should always be uniform. For non-stack objects
      //    the effect is to poison the object, which still allows us to
      //    remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }

  auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                       IsUniform, IsPredicated);
  setRecipe(I, Recipe);
  Plan->addVPValue(I, Recipe);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (VPValue *Op : Recipe->operands()) {
    auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
    if (!PredR)
      continue;
    auto *RepR =
        cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
    assert(RepR->isPredicated() &&
           "expected Replicate recipe to be predicated");
    RepR->setAlsoPack(false);
  }

  // Finalize the recipe for Instr, first if it is not predicated.
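  // Predicated instructions are instead wrapped below in a replicate region
  // (pred.*.entry / pred.*.if / pred.*.continue) and the caller continues
  // emitting recipes into the fresh successor block that is returned.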
8775 if (!IsPredicated) { 8776 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8777 VPBB->appendRecipe(Recipe); 8778 return VPBB; 8779 } 8780 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8781 assert(VPBB->getSuccessors().empty() && 8782 "VPBB has successors when handling predicated replication."); 8783 // Record predicated instructions for above packing optimizations. 8784 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8785 VPBlockUtils::insertBlockAfter(Region, VPBB); 8786 auto *RegSucc = new VPBasicBlock(); 8787 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8788 return RegSucc; 8789 } 8790 8791 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8792 VPRecipeBase *PredRecipe, 8793 VPlanPtr &Plan) { 8794 // Instructions marked for predication are replicated and placed under an 8795 // if-then construct to prevent side-effects. 8796 8797 // Generate recipes to compute the block mask for this region. 8798 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8799 8800 // Build the triangular if-then region. 8801 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8802 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8803 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8804 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8805 auto *PHIRecipe = Instr->getType()->isVoidTy() 8806 ? nullptr 8807 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8808 if (PHIRecipe) { 8809 Plan->removeVPValueFor(Instr); 8810 Plan->addVPValue(Instr, PHIRecipe); 8811 } 8812 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8813 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8814 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8815 8816 // Note: first set Entry as region entry and then connect successors starting 8817 // from it in order, to propagate the "parent" of each VPBasicBlock. 8818 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8819 VPBlockUtils::connectBlocks(Pred, Exit); 8820 8821 return Region; 8822 } 8823 8824 VPRecipeOrVPValueTy 8825 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8826 ArrayRef<VPValue *> Operands, 8827 VFRange &Range, VPlanPtr &Plan) { 8828 // First, check for specific widening recipes that deal with calls, memory 8829 // operations, inductions and Phi nodes. 
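  // The checks below are ordered: calls, then loads/stores, then phis (blends
  // for non-header phis; induction/reduction/recurrence recipes for header
  // phis), then truncates of inductions, and finally generic widening of GEPs,
  // selects and plain unary/binary ops. Anything left over is replicated by
  // the caller.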
8830 if (auto *CI = dyn_cast<CallInst>(Instr)) 8831 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8832 8833 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8834 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8835 8836 VPRecipeBase *Recipe; 8837 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8838 if (Phi->getParent() != OrigLoop->getHeader()) 8839 return tryToBlend(Phi, Operands, Plan); 8840 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 8841 return toVPRecipeResult(Recipe); 8842 8843 VPWidenPHIRecipe *PhiRecipe = nullptr; 8844 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 8845 VPValue *StartV = Operands[0]; 8846 if (Legal->isReductionVariable(Phi)) { 8847 const RecurrenceDescriptor &RdxDesc = 8848 Legal->getReductionVars().find(Phi)->second; 8849 assert(RdxDesc.getRecurrenceStartValue() == 8850 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8851 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8852 CM.isInLoopReduction(Phi), 8853 CM.useOrderedReductions(RdxDesc)); 8854 } else { 8855 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8856 } 8857 8858 // Record the incoming value from the backedge, so we can add the incoming 8859 // value from the backedge after all recipes have been created. 8860 recordRecipeOf(cast<Instruction>( 8861 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8862 PhisToFix.push_back(PhiRecipe); 8863 } else { 8864 // TODO: record start and backedge value for remaining pointer induction 8865 // phis. 8866 assert(Phi->getType()->isPointerTy() && 8867 "only pointer phis should be handled here"); 8868 PhiRecipe = new VPWidenPHIRecipe(Phi); 8869 } 8870 8871 return toVPRecipeResult(PhiRecipe); 8872 } 8873 8874 if (isa<TruncInst>(Instr) && 8875 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8876 Range, *Plan))) 8877 return toVPRecipeResult(Recipe); 8878 8879 if (!shouldWiden(Instr, Range)) 8880 return nullptr; 8881 8882 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8883 return toVPRecipeResult(new VPWidenGEPRecipe( 8884 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 8885 8886 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8887 bool InvariantCond = 8888 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8889 return toVPRecipeResult(new VPWidenSelectRecipe( 8890 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 8891 } 8892 8893 return toVPRecipeResult(tryToWiden(Instr, Operands)); 8894 } 8895 8896 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8897 ElementCount MaxVF) { 8898 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8899 8900 // Collect instructions from the original loop that will become trivially dead 8901 // in the vectorized loop. We don't need to vectorize these instructions. For 8902 // example, original induction update instructions can become dead because we 8903 // separately emit induction "steps" when generating code for the new loop. 8904 // Similarly, we create a new latch condition when setting up the structure 8905 // of the new loop, so the old one can become dead. 8906 SmallPtrSet<Instruction *, 4> DeadInstructions; 8907 collectTriviallyDeadInstructions(DeadInstructions); 8908 8909 // Add assume instructions we need to drop to DeadInstructions, to prevent 8910 // them from being added to the VPlan. 8911 // TODO: We only need to drop assumes in blocks that get flattend. 
  // If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  // Cannot sink instructions after dead instructions (there won't be any
  // recipes for them). Instead, find the first non-dead previous instruction.
  for (auto &P : Legal->getSinkAfter()) {
    Instruction *SinkTarget = P.second;
    Instruction *FirstInst = &*SinkTarget->getParent()->begin();
    (void)FirstInst;
    while (DeadInstructions.contains(SinkTarget)) {
      assert(
          SinkTarget != FirstInst &&
          "Must find a live instruction (at least the one feeding the "
          "first-order recurrence PHI) before reaching beginning of the block");
      SinkTarget = SinkTarget->getPrevNode();
      assert(SinkTarget != P.first &&
             "sink source equals target, no sinking required");
    }
    P.second = SinkTarget;
  }

  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const MapVector<Instruction *, Instruction *> &SinkAfter) {

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurKind Kind =
        Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
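      // E.g. a umin reduction step "c = icmp ult a, b; m = select c, a, b"
      // records both the select and the feeding icmp, so the compare recipe
      // can be erased once the select is turned into a reduction recipe.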
8978 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 8979 "Only min/max recurrences allowed for inloop reductions"); 8980 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 8981 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8982 } 8983 } 8984 8985 // For each interleave group which is relevant for this (possibly trimmed) 8986 // Range, add it to the set of groups to be later applied to the VPlan and add 8987 // placeholders for its members' Recipes which we'll be replacing with a 8988 // single VPInterleaveRecipe. 8989 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8990 auto applyIG = [IG, this](ElementCount VF) -> bool { 8991 return (VF.isVector() && // Query is illegal for VF == 1 8992 CM.getWideningDecision(IG->getInsertPos(), VF) == 8993 LoopVectorizationCostModel::CM_Interleave); 8994 }; 8995 if (!getDecisionAndClampRange(applyIG, Range)) 8996 continue; 8997 InterleaveGroups.insert(IG); 8998 for (unsigned i = 0; i < IG->getFactor(); i++) 8999 if (Instruction *Member = IG->getMember(i)) 9000 RecipeBuilder.recordRecipeOf(Member); 9001 }; 9002 9003 // --------------------------------------------------------------------------- 9004 // Build initial VPlan: Scan the body of the loop in a topological order to 9005 // visit each basic block after having visited its predecessor basic blocks. 9006 // --------------------------------------------------------------------------- 9007 9008 auto Plan = std::make_unique<VPlan>(); 9009 9010 // Scan the body of the loop in a topological order to visit each basic block 9011 // after having visited its predecessor basic blocks. 9012 LoopBlocksDFS DFS(OrigLoop); 9013 DFS.perform(LI); 9014 9015 VPBasicBlock *VPBB = nullptr; 9016 VPBasicBlock *HeaderVPBB = nullptr; 9017 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 9018 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9019 // Relevant instructions from basic block BB will be grouped into VPRecipe 9020 // ingredients and fill a new VPBasicBlock. 9021 unsigned VPBBsForBB = 0; 9022 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 9023 if (VPBB) 9024 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 9025 else { 9026 auto *TopRegion = new VPRegionBlock("vector loop"); 9027 TopRegion->setEntry(FirstVPBBForBB); 9028 Plan->setEntry(TopRegion); 9029 HeaderVPBB = FirstVPBBForBB; 9030 } 9031 VPBB = FirstVPBBForBB; 9032 Builder.setInsertPoint(VPBB); 9033 9034 // Introduce each ingredient into VPlan. 9035 // TODO: Model and preserve debug instrinsics in VPlan. 9036 for (Instruction &I : BB->instructionsWithoutDebug()) { 9037 Instruction *Instr = &I; 9038 9039 // First filter out irrelevant instructions, to ensure no recipes are 9040 // built for them. 9041 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9042 continue; 9043 9044 SmallVector<VPValue *, 4> Operands; 9045 auto *Phi = dyn_cast<PHINode>(Instr); 9046 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9047 Operands.push_back(Plan->getOrAddVPValue( 9048 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9049 } else { 9050 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9051 Operands = {OpRange.begin(), OpRange.end()}; 9052 } 9053 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9054 Instr, Operands, Range, Plan)) { 9055 // If Instr can be simplified to an existing VPValue, use it. 
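        // (E.g. tryToBlend returns the single incoming VPValue directly when
        // all incoming values of a non-header phi are identical.)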
9056 if (RecipeOrValue.is<VPValue *>()) { 9057 auto *VPV = RecipeOrValue.get<VPValue *>(); 9058 Plan->addVPValue(Instr, VPV); 9059 // If the re-used value is a recipe, register the recipe for the 9060 // instruction, in case the recipe for Instr needs to be recorded. 9061 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9062 RecipeBuilder.setRecipe(Instr, R); 9063 continue; 9064 } 9065 // Otherwise, add the new recipe. 9066 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9067 for (auto *Def : Recipe->definedValues()) { 9068 auto *UV = Def->getUnderlyingValue(); 9069 Plan->addVPValue(UV, Def); 9070 } 9071 9072 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 9073 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 9074 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 9075 // of the header block. That can happen for truncates of induction 9076 // variables. Those recipes are moved to the phi section of the header 9077 // block after applying SinkAfter, which relies on the original 9078 // position of the trunc. 9079 assert(isa<TruncInst>(Instr)); 9080 InductionsToMove.push_back( 9081 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 9082 } 9083 RecipeBuilder.setRecipe(Instr, Recipe); 9084 VPBB->appendRecipe(Recipe); 9085 continue; 9086 } 9087 9088 // Otherwise, if all widening options failed, Instruction is to be 9089 // replicated. This may create a successor for VPBB. 9090 VPBasicBlock *NextVPBB = 9091 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9092 if (NextVPBB != VPBB) { 9093 VPBB = NextVPBB; 9094 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9095 : ""); 9096 } 9097 } 9098 } 9099 9100 assert(isa<VPRegionBlock>(Plan->getEntry()) && 9101 !Plan->getEntry()->getEntryBasicBlock()->empty() && 9102 "entry block must be set to a VPRegionBlock having a non-empty entry " 9103 "VPBasicBlock"); 9104 RecipeBuilder.fixHeaderPhis(); 9105 9106 // --------------------------------------------------------------------------- 9107 // Transform initial VPlan: Apply previously taken decisions, in order, to 9108 // bring the VPlan to its final state. 9109 // --------------------------------------------------------------------------- 9110 9111 // Apply Sink-After legal constraints. 9112 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9113 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9114 if (Region && Region->isReplicator()) { 9115 assert(Region->getNumSuccessors() == 1 && 9116 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9117 assert(R->getParent()->size() == 1 && 9118 "A recipe in an original replicator region must be the only " 9119 "recipe in its block"); 9120 return Region; 9121 } 9122 return nullptr; 9123 }; 9124 for (auto &Entry : SinkAfter) { 9125 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9126 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9127 9128 auto *TargetRegion = GetReplicateRegion(Target); 9129 auto *SinkRegion = GetReplicateRegion(Sink); 9130 if (!SinkRegion) { 9131 // If the sink source is not a replicate region, sink the recipe directly. 9132 if (TargetRegion) { 9133 // The target is in a replication region, make sure to move Sink to 9134 // the block after it, not into the replication region itself. 
9135 VPBasicBlock *NextBlock = 9136 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9137 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9138 } else 9139 Sink->moveAfter(Target); 9140 continue; 9141 } 9142 9143 // The sink source is in a replicate region. Unhook the region from the CFG. 9144 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9145 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9146 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9147 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9148 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9149 9150 if (TargetRegion) { 9151 // The target recipe is also in a replicate region, move the sink region 9152 // after the target region. 9153 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9154 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9155 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9156 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9157 } else { 9158 // The sink source is in a replicate region, we need to move the whole 9159 // replicate region, which should only contain a single recipe in the 9160 // main block. 9161 auto *SplitBlock = 9162 Target->getParent()->splitAt(std::next(Target->getIterator())); 9163 9164 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9165 9166 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9167 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9168 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9169 if (VPBB == SplitPred) 9170 VPBB = SplitBlock; 9171 } 9172 } 9173 9174 cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB); 9175 9176 VPlanTransforms::removeRedundantInductionCasts(*Plan); 9177 9178 // Now that sink-after is done, move induction recipes for optimized truncates 9179 // to the phi section of the header block. 9180 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 9181 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9182 9183 // Adjust the recipes for any inloop reductions. 9184 adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start); 9185 9186 // Introduce a recipe to combine the incoming and previous values of a 9187 // first-order recurrence. 9188 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9189 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9190 if (!RecurPhi) 9191 continue; 9192 9193 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9194 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 9195 auto *Region = GetReplicateRegion(PrevRecipe); 9196 if (Region) 9197 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9198 if (Region || PrevRecipe->isPhi()) 9199 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 9200 else 9201 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9202 9203 auto *RecurSplice = cast<VPInstruction>( 9204 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9205 {RecurPhi, RecurPhi->getBackedgeValue()})); 9206 9207 RecurPhi->replaceAllUsesWith(RecurSplice); 9208 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9209 // all users. 9210 RecurSplice->setOperand(0, RecurPhi); 9211 } 9212 9213 // Interleave memory: for each Interleave Group we marked earlier as relevant 9214 // for this VPlan, replace the Recipes widening its memory instructions with a 9215 // single VPInterleaveRecipe at its insertion point. 
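  // The stored values of the group's store members become operands of the new
  // recipe, loads are rewired to the recipe's results, and the original
  // per-member memory recipes are erased.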
9216 for (auto IG : InterleaveGroups) { 9217 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9218 RecipeBuilder.getRecipe(IG->getInsertPos())); 9219 SmallVector<VPValue *, 4> StoredValues; 9220 for (unsigned i = 0; i < IG->getFactor(); ++i) 9221 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9222 auto *StoreR = 9223 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9224 StoredValues.push_back(StoreR->getStoredValue()); 9225 } 9226 9227 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9228 Recipe->getMask()); 9229 VPIG->insertBefore(Recipe); 9230 unsigned J = 0; 9231 for (unsigned i = 0; i < IG->getFactor(); ++i) 9232 if (Instruction *Member = IG->getMember(i)) { 9233 if (!Member->getType()->isVoidTy()) { 9234 VPValue *OriginalV = Plan->getVPValue(Member); 9235 Plan->removeVPValueFor(Member); 9236 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9237 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9238 J++; 9239 } 9240 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9241 } 9242 } 9243 9244 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9245 // in ways that accessing values using original IR values is incorrect. 9246 Plan->disableValue2VPValue(); 9247 9248 VPlanTransforms::sinkScalarOperands(*Plan); 9249 VPlanTransforms::mergeReplicateRegions(*Plan); 9250 9251 std::string PlanName; 9252 raw_string_ostream RSO(PlanName); 9253 ElementCount VF = Range.Start; 9254 Plan->addVF(VF); 9255 RSO << "Initial VPlan for VF={" << VF; 9256 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9257 Plan->addVF(VF); 9258 RSO << "," << VF; 9259 } 9260 RSO << "},UF>=1"; 9261 RSO.flush(); 9262 Plan->setName(PlanName); 9263 9264 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 9265 return Plan; 9266 } 9267 9268 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9269 // Outer loop handling: They may require CFG and instruction level 9270 // transformations before even evaluating whether vectorization is profitable. 9271 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9272 // the vectorization pipeline. 9273 assert(!OrigLoop->isInnermost()); 9274 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9275 9276 // Create new empty VPlan 9277 auto Plan = std::make_unique<VPlan>(); 9278 9279 // Build hierarchical CFG 9280 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9281 HCFGBuilder.buildHierarchicalCFG(); 9282 9283 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9284 VF *= 2) 9285 Plan->addVF(VF); 9286 9287 if (EnableVPlanPredication) { 9288 VPlanPredicator VPP(*Plan); 9289 VPP.predicate(); 9290 9291 // Avoid running transformation to recipes until masked code generation in 9292 // VPlan-native path is in place. 9293 return Plan; 9294 } 9295 9296 SmallPtrSet<Instruction *, 1> DeadInstructions; 9297 VPlanTransforms::VPInstructionsToVPRecipes( 9298 OrigLoop, Plan, 9299 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 9300 DeadInstructions, *PSE.getSE()); 9301 return Plan; 9302 } 9303 9304 // Adjust the recipes for reductions. For in-loop reductions the chain of 9305 // instructions leading from the loop exit instr to the phi need to be converted 9306 // to reductions, with one operand being vector and the other being the scalar 9307 // reduction chain. For other reductions, a select is introduced between the phi 9308 // and live-out recipes when folding the tail. 
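// For example, an in-loop chain "phi -> fadd -> fadd" is rewritten so that each
// fadd becomes a VPReductionRecipe whose chain operand is the previous scalar
// result in the chain and whose vector operand is the widened other input.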
void LoopVectorizationPlanner::adjustRecipesForReductions(
    VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
    ElementCount MinVF) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    const RecurrenceDescriptor &RdxDesc =
        Legal->getReductionVars().find(Phi)->second;
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
      continue;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
             "Only min/max recurrences allowed for inloop reductions");
      // Recognize a call to the llvm.fmuladd intrinsic.
      bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
      assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
             "Expected instruction to be a call to the llvm.fmuladd intrinsic");
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSelectSC");
        FirstOpId = 1;
      } else {
        assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
                (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
               "Expected to replace a VPWidenSC");
        FirstOpId = 0;
      }
      unsigned VecOpId =
          R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
      VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));

      auto *CondOp = CM.foldTailByMasking()
                         ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
                         : nullptr;

      if (IsFMulAdd) {
        // If the instruction is a call to the llvm.fmuladd intrinsic then we
        // need to create an fmul recipe to use as the vector operand for the
        // fadd reduction.
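        // That is, a call to llvm.fmuladd(a, b, sum) is handled as an fmul of
        // a and b, followed by an fadd reduction of the product into the
        // chain value sum.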
9360 VPInstruction *FMulRecipe = new VPInstruction( 9361 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9362 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9363 WidenRecipe->getParent()->insert(FMulRecipe, 9364 WidenRecipe->getIterator()); 9365 VecOp = FMulRecipe; 9366 } 9367 VPReductionRecipe *RedRecipe = 9368 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9369 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9370 Plan->removeVPValueFor(R); 9371 Plan->addVPValue(R, RedRecipe); 9372 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9373 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9374 WidenRecipe->eraseFromParent(); 9375 9376 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9377 VPRecipeBase *CompareRecipe = 9378 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9379 assert(isa<VPWidenRecipe>(CompareRecipe) && 9380 "Expected to replace a VPWidenSC"); 9381 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9382 "Expected no remaining users"); 9383 CompareRecipe->eraseFromParent(); 9384 } 9385 Chain = R; 9386 } 9387 } 9388 9389 // If tail is folded by masking, introduce selects between the phi 9390 // and the live-out instruction of each reduction, at the end of the latch. 9391 if (CM.foldTailByMasking()) { 9392 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9393 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9394 if (!PhiR || PhiR->isInLoop()) 9395 continue; 9396 Builder.setInsertPoint(LatchVPBB); 9397 VPValue *Cond = 9398 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9399 VPValue *Red = PhiR->getBackedgeValue(); 9400 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9401 } 9402 } 9403 } 9404 9405 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9406 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9407 VPSlotTracker &SlotTracker) const { 9408 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9409 IG->getInsertPos()->printAsOperand(O, false); 9410 O << ", "; 9411 getAddr()->printAsOperand(O, SlotTracker); 9412 VPValue *Mask = getMask(); 9413 if (Mask) { 9414 O << ", "; 9415 Mask->printAsOperand(O, SlotTracker); 9416 } 9417 9418 unsigned OpIdx = 0; 9419 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9420 if (!IG->getMember(i)) 9421 continue; 9422 if (getNumStoreOperands() > 0) { 9423 O << "\n" << Indent << " store "; 9424 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9425 O << " to index " << i; 9426 } else { 9427 O << "\n" << Indent << " "; 9428 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9429 O << " = load from index " << i; 9430 } 9431 ++OpIdx; 9432 } 9433 } 9434 #endif 9435 9436 void VPWidenCallRecipe::execute(VPTransformState &State) { 9437 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9438 *this, State); 9439 } 9440 9441 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9442 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9443 State.ILV->setDebugLocFromInst(&I); 9444 9445 // The condition can be loop invariant but still defined inside the 9446 // loop. This means that we can't just use the original 'cond' value. 9447 // We have to take the 'vectorized' value and pick the first lane. 9448 // Instcombine will make this a no-op. 9449 auto *InvarCond = 9450 InvariantCond ? 
State.get(getOperand(0), VPIteration(0, 0)) : nullptr; 9451 9452 for (unsigned Part = 0; Part < State.UF; ++Part) { 9453 Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part); 9454 Value *Op0 = State.get(getOperand(1), Part); 9455 Value *Op1 = State.get(getOperand(2), Part); 9456 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 9457 State.set(this, Sel, Part); 9458 State.ILV->addMetadata(Sel, &I); 9459 } 9460 } 9461 9462 void VPWidenRecipe::execute(VPTransformState &State) { 9463 auto &I = *cast<Instruction>(getUnderlyingValue()); 9464 auto &Builder = State.Builder; 9465 switch (I.getOpcode()) { 9466 case Instruction::Call: 9467 case Instruction::Br: 9468 case Instruction::PHI: 9469 case Instruction::GetElementPtr: 9470 case Instruction::Select: 9471 llvm_unreachable("This instruction is handled by a different recipe."); 9472 case Instruction::UDiv: 9473 case Instruction::SDiv: 9474 case Instruction::SRem: 9475 case Instruction::URem: 9476 case Instruction::Add: 9477 case Instruction::FAdd: 9478 case Instruction::Sub: 9479 case Instruction::FSub: 9480 case Instruction::FNeg: 9481 case Instruction::Mul: 9482 case Instruction::FMul: 9483 case Instruction::FDiv: 9484 case Instruction::FRem: 9485 case Instruction::Shl: 9486 case Instruction::LShr: 9487 case Instruction::AShr: 9488 case Instruction::And: 9489 case Instruction::Or: 9490 case Instruction::Xor: { 9491 // Just widen unops and binops. 9492 State.ILV->setDebugLocFromInst(&I); 9493 9494 for (unsigned Part = 0; Part < State.UF; ++Part) { 9495 SmallVector<Value *, 2> Ops; 9496 for (VPValue *VPOp : operands()) 9497 Ops.push_back(State.get(VPOp, Part)); 9498 9499 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 9500 9501 if (auto *VecOp = dyn_cast<Instruction>(V)) { 9502 VecOp->copyIRFlags(&I); 9503 9504 // If the instruction is vectorized and was in a basic block that needed 9505 // predication, we can't propagate poison-generating flags (nuw/nsw, 9506 // exact, etc.). The control flow has been linearized and the 9507 // instruction is no longer guarded by the predicate, which could make 9508 // the flag properties to no longer hold. 9509 if (State.MayGeneratePoisonRecipes.count(this) > 0) 9510 VecOp->dropPoisonGeneratingFlags(); 9511 } 9512 9513 // Use this vector value for all users of the original instruction. 9514 State.set(this, V, Part); 9515 State.ILV->addMetadata(V, &I); 9516 } 9517 9518 break; 9519 } 9520 case Instruction::ICmp: 9521 case Instruction::FCmp: { 9522 // Widen compares. Generate vector compares. 9523 bool FCmp = (I.getOpcode() == Instruction::FCmp); 9524 auto *Cmp = cast<CmpInst>(&I); 9525 State.ILV->setDebugLocFromInst(Cmp); 9526 for (unsigned Part = 0; Part < State.UF; ++Part) { 9527 Value *A = State.get(getOperand(0), Part); 9528 Value *B = State.get(getOperand(1), Part); 9529 Value *C = nullptr; 9530 if (FCmp) { 9531 // Propagate fast math flags. 
9532 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9533 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9534 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9535 } else { 9536 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9537 } 9538 State.set(this, C, Part); 9539 State.ILV->addMetadata(C, &I); 9540 } 9541 9542 break; 9543 } 9544 9545 case Instruction::ZExt: 9546 case Instruction::SExt: 9547 case Instruction::FPToUI: 9548 case Instruction::FPToSI: 9549 case Instruction::FPExt: 9550 case Instruction::PtrToInt: 9551 case Instruction::IntToPtr: 9552 case Instruction::SIToFP: 9553 case Instruction::UIToFP: 9554 case Instruction::Trunc: 9555 case Instruction::FPTrunc: 9556 case Instruction::BitCast: { 9557 auto *CI = cast<CastInst>(&I); 9558 State.ILV->setDebugLocFromInst(CI); 9559 9560 /// Vectorize casts. 9561 Type *DestTy = (State.VF.isScalar()) 9562 ? CI->getType() 9563 : VectorType::get(CI->getType(), State.VF); 9564 9565 for (unsigned Part = 0; Part < State.UF; ++Part) { 9566 Value *A = State.get(getOperand(0), Part); 9567 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9568 State.set(this, Cast, Part); 9569 State.ILV->addMetadata(Cast, &I); 9570 } 9571 break; 9572 } 9573 default: 9574 // This instruction is not vectorized by simple widening. 9575 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9576 llvm_unreachable("Unhandled instruction!"); 9577 } // end of switch. 9578 } 9579 9580 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9581 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9582 // Construct a vector GEP by widening the operands of the scalar GEP as 9583 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9584 // results in a vector of pointers when at least one operand of the GEP 9585 // is vector-typed. Thus, to keep the representation compact, we only use 9586 // vector-typed operands for loop-varying values. 9587 9588 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9589 // If we are vectorizing, but the GEP has only loop-invariant operands, 9590 // the GEP we build (by only using vector-typed operands for 9591 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9592 // produce a vector of pointers, we need to either arbitrarily pick an 9593 // operand to broadcast, or broadcast a clone of the original GEP. 9594 // Here, we broadcast a clone of the original. 9595 // 9596 // TODO: If at some point we decide to scalarize instructions having 9597 // loop-invariant operands, this special case will no longer be 9598 // required. We would add the scalarization decision to 9599 // collectLoopScalars() and teach getVectorValue() to broadcast 9600 // the lane-zero scalar value. 9601 auto *Clone = State.Builder.Insert(GEP->clone()); 9602 for (unsigned Part = 0; Part < State.UF; ++Part) { 9603 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9604 State.set(this, EntryPart, Part); 9605 State.ILV->addMetadata(EntryPart, GEP); 9606 } 9607 } else { 9608 // If the GEP has at least one loop-varying operand, we are sure to 9609 // produce a vector of pointers. But if we are only unrolling, we want 9610 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9611 // produce with the code below will be scalar (if VF == 1) or vector 9612 // (otherwise). Note that for the unroll-only case, we still maintain 9613 // values in the vector mapping with initVector, as we do for other 9614 // instructions. 
9615 for (unsigned Part = 0; Part < State.UF; ++Part) { 9616 // The pointer operand of the new GEP. If it's loop-invariant, we 9617 // won't broadcast it. 9618 auto *Ptr = IsPtrLoopInvariant 9619 ? State.get(getOperand(0), VPIteration(0, 0)) 9620 : State.get(getOperand(0), Part); 9621 9622 // Collect all the indices for the new GEP. If any index is 9623 // loop-invariant, we won't broadcast it. 9624 SmallVector<Value *, 4> Indices; 9625 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 9626 VPValue *Operand = getOperand(I); 9627 if (IsIndexLoopInvariant[I - 1]) 9628 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 9629 else 9630 Indices.push_back(State.get(Operand, Part)); 9631 } 9632 9633 // If the GEP instruction is vectorized and was in a basic block that 9634 // needed predication, we can't propagate the poison-generating 'inbounds' 9635 // flag. The control flow has been linearized and the GEP is no longer 9636 // guarded by the predicate, which could make the 'inbounds' properties to 9637 // no longer hold. 9638 bool IsInBounds = 9639 GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; 9640 9641 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 9642 // but it should be a vector, otherwise. 9643 auto *NewGEP = IsInBounds 9644 ? State.Builder.CreateInBoundsGEP( 9645 GEP->getSourceElementType(), Ptr, Indices) 9646 : State.Builder.CreateGEP(GEP->getSourceElementType(), 9647 Ptr, Indices); 9648 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 9649 "NewGEP is not a pointer vector"); 9650 State.set(this, NewGEP, Part); 9651 State.ILV->addMetadata(NewGEP, GEP); 9652 } 9653 } 9654 } 9655 9656 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9657 assert(!State.Instance && "Int or FP induction being replicated."); 9658 State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(), 9659 getStartValue()->getLiveInIRValue(), 9660 getTruncInst(), getVPValue(0), State); 9661 } 9662 9663 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9664 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9665 State); 9666 } 9667 9668 void VPBlendRecipe::execute(VPTransformState &State) { 9669 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9670 // We know that all PHIs in non-header blocks are converted into 9671 // selects, so we don't have to worry about the insertion order and we 9672 // can just use the builder. 9673 // At this point we generate the predication tree. There may be 9674 // duplications since this is a simple recursive scan, but future 9675 // optimizations will clean it up. 9676 9677 unsigned NumIncoming = getNumIncomingValues(); 9678 9679 // Generate a sequence of selects of the form: 9680 // SELECT(Mask3, In3, 9681 // SELECT(Mask2, In2, 9682 // SELECT(Mask1, In1, 9683 // In0))) 9684 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9685 // are essentially undef are taken from In0. 9686 InnerLoopVectorizer::VectorParts Entry(State.UF); 9687 for (unsigned In = 0; In < NumIncoming; ++In) { 9688 for (unsigned Part = 0; Part < State.UF; ++Part) { 9689 // We might have single edge PHIs (blocks) - use an identity 9690 // 'select' for the first PHI operand. 9691 Value *In0 = State.get(getIncomingValue(In), Part); 9692 if (In == 0) 9693 Entry[Part] = In0; // Initialize with the first incoming value. 9694 else { 9695 // Select between the current value and the previous incoming edge 9696 // based on the incoming mask. 
9697 Value *Cond = State.get(getMask(In), Part); 9698 Entry[Part] = 9699 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9700 } 9701 } 9702 } 9703 for (unsigned Part = 0; Part < State.UF; ++Part) 9704 State.set(this, Entry[Part], Part); 9705 } 9706 9707 void VPInterleaveRecipe::execute(VPTransformState &State) { 9708 assert(!State.Instance && "Interleave group being replicated."); 9709 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9710 getStoredValues(), getMask()); 9711 } 9712 9713 void VPReductionRecipe::execute(VPTransformState &State) { 9714 assert(!State.Instance && "Reduction being replicated."); 9715 Value *PrevInChain = State.get(getChainOp(), 0); 9716 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9717 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9718 // Propagate the fast-math flags carried by the underlying instruction. 9719 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9720 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9721 for (unsigned Part = 0; Part < State.UF; ++Part) { 9722 Value *NewVecOp = State.get(getVecOp(), Part); 9723 if (VPValue *Cond = getCondOp()) { 9724 Value *NewCond = State.get(Cond, Part); 9725 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9726 Value *Iden = RdxDesc->getRecurrenceIdentity( 9727 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9728 Value *IdenVec = 9729 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9730 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9731 NewVecOp = Select; 9732 } 9733 Value *NewRed; 9734 Value *NextInChain; 9735 if (IsOrdered) { 9736 if (State.VF.isVector()) 9737 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9738 PrevInChain); 9739 else 9740 NewRed = State.Builder.CreateBinOp( 9741 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9742 NewVecOp); 9743 PrevInChain = NewRed; 9744 } else { 9745 PrevInChain = State.get(getChainOp(), Part); 9746 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9747 } 9748 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9749 NextInChain = 9750 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9751 NewRed, PrevInChain); 9752 } else if (IsOrdered) 9753 NextInChain = NewRed; 9754 else 9755 NextInChain = State.Builder.CreateBinOp( 9756 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9757 PrevInChain); 9758 State.set(this, NextInChain, Part); 9759 } 9760 } 9761 9762 void VPReplicateRecipe::execute(VPTransformState &State) { 9763 if (State.Instance) { // Generate a single instance. 9764 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9765 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance, 9766 IsPredicated, State); 9767 // Insert scalar instance packing it into a vector. 9768 if (AlsoPack && State.VF.isVector()) { 9769 // If we're constructing lane 0, initialize to start from poison. 
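// Illustrative sketch (VF = 4, names invented): lane 0 first records
//   <4 x i32> poison
// for this part, and packScalarIntoVectorValue then inserts each scalar
// instance into it, e.g.
//   %packed = insertelement <4 x i32> poison, i32 %scalar.lane0, i32 0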
9770 if (State.Instance->Lane.isFirstLane()) {
9771 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9772 Value *Poison = PoisonValue::get(
9773 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9774 State.set(this, Poison, State.Instance->Part);
9775 }
9776 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9777 }
9778 return;
9779 }
9780
9781 // Generate scalar instances for all VF lanes of all UF parts, unless the
9782 // instruction is uniform, in which case generate only the first lane for each
9783 // of the UF parts.
9784 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9785 assert((!State.VF.isScalable() || IsUniform) &&
9786 "Can't scalarize a scalable vector");
9787 for (unsigned Part = 0; Part < State.UF; ++Part)
9788 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9789 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9790 VPIteration(Part, Lane), IsPredicated,
9791 State);
9792 }
9793
9794 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9795 assert(State.Instance && "Branch on Mask works only on single instance.");
9796
9797 unsigned Part = State.Instance->Part;
9798 unsigned Lane = State.Instance->Lane.getKnownLane();
9799
9800 Value *ConditionBit = nullptr;
9801 VPValue *BlockInMask = getMask();
9802 if (BlockInMask) {
9803 ConditionBit = State.get(BlockInMask, Part);
9804 if (ConditionBit->getType()->isVectorTy())
9805 ConditionBit = State.Builder.CreateExtractElement(
9806 ConditionBit, State.Builder.getInt32(Lane));
9807 } else // Block in mask is all-one.
9808 ConditionBit = State.Builder.getTrue();
9809
9810 // Replace the temporary unreachable terminator with a new conditional branch,
9811 // whose two destinations will be set later when they are created.
9812 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9813 assert(isa<UnreachableInst>(CurrentTerminator) &&
9814 "Expected to replace unreachable terminator with conditional branch.");
9815 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9816 CondBr->setSuccessor(0, nullptr);
9817 ReplaceInstWithInst(CurrentTerminator, CondBr);
9818 }
9819
9820 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9821 assert(State.Instance && "Predicated instruction PHI works per instance.");
9822 Instruction *ScalarPredInst =
9823 cast<Instruction>(State.get(getOperand(0), *State.Instance));
9824 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9825 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9826 assert(PredicatingBB && "Predicated block has no single predecessor.");
9827 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9828 "operand must be VPReplicateRecipe");
9829
9830 // By the current pack/unpack logic, we need to generate only a single phi node: if
9831 // a vector value for the predicated instruction exists at this point it means
9832 // the instruction has vector users only, and a phi for the vector value is
9833 // needed. In this case the recipe of the predicated instruction is marked to
9834 // also do that packing, thereby "hoisting" the insert-element sequence.
9835 // Otherwise, a phi node for the scalar value is needed.
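// Illustrative sketch of the two cases handled below (block and value names
// invented):
//   vector value present:
//     %vphi = phi <4 x i32> [ %not.yet.inserted, %predicating.bb ],
//                           [ %insert.elt, %predicated.bb ]
//   scalar value only:
//     %sphi = phi i32 [ poison, %predicating.bb ],
//                     [ %scalar.inst, %predicated.bb ]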
9836 unsigned Part = State.Instance->Part; 9837 if (State.hasVectorValue(getOperand(0), Part)) { 9838 Value *VectorValue = State.get(getOperand(0), Part); 9839 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9840 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9841 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9842 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9843 if (State.hasVectorValue(this, Part)) 9844 State.reset(this, VPhi, Part); 9845 else 9846 State.set(this, VPhi, Part); 9847 // NOTE: Currently we need to update the value of the operand, so the next 9848 // predicated iteration inserts its generated value in the correct vector. 9849 State.reset(getOperand(0), VPhi, Part); 9850 } else { 9851 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9852 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9853 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9854 PredicatingBB); 9855 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9856 if (State.hasScalarValue(this, *State.Instance)) 9857 State.reset(this, Phi, *State.Instance); 9858 else 9859 State.set(this, Phi, *State.Instance); 9860 // NOTE: Currently we need to update the value of the operand, so the next 9861 // predicated iteration inserts its generated value in the correct vector. 9862 State.reset(getOperand(0), Phi, *State.Instance); 9863 } 9864 } 9865 9866 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9867 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9868 9869 // Attempt to issue a wide load. 9870 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9871 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9872 9873 assert((LI || SI) && "Invalid Load/Store instruction"); 9874 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9875 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9876 9877 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9878 9879 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9880 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9881 bool CreateGatherScatter = !Consecutive; 9882 9883 auto &Builder = State.Builder; 9884 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9885 bool isMaskRequired = getMask(); 9886 if (isMaskRequired) 9887 for (unsigned Part = 0; Part < State.UF; ++Part) 9888 BlockInMaskParts[Part] = State.get(getMask(), Part); 9889 9890 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9891 // Calculate the pointer for the specific unroll-part. 9892 GetElementPtrInst *PartPtr = nullptr; 9893 9894 bool InBounds = false; 9895 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9896 InBounds = gep->isInBounds(); 9897 if (Reverse) { 9898 // If the address is consecutive but reversed, then the 9899 // wide store needs to start at the last vector element. 
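// Worked example for the computation below, assuming a fixed-width VF of 4
// (so RunTimeVF = 4): Part 0 uses NumElt = 0 and LastLane = -3, so the wide
// access covers elements [-3, 0] of the scalar pointer; Part 1 uses
// NumElt = -4 and LastLane = -3, covering elements [-7, -4]. The mask and the
// loaded/stored data are reversed below to restore the original lane order.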
9900 // RunTimeVF = VScale * VF.getKnownMinValue() 9901 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9902 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 9903 // NumElt = -Part * RunTimeVF 9904 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 9905 // LastLane = 1 - RunTimeVF 9906 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 9907 PartPtr = 9908 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 9909 PartPtr->setIsInBounds(InBounds); 9910 PartPtr = cast<GetElementPtrInst>( 9911 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 9912 PartPtr->setIsInBounds(InBounds); 9913 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 9914 BlockInMaskParts[Part] = 9915 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9916 } else { 9917 Value *Increment = 9918 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 9919 PartPtr = cast<GetElementPtrInst>( 9920 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 9921 PartPtr->setIsInBounds(InBounds); 9922 } 9923 9924 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 9925 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 9926 }; 9927 9928 // Handle Stores: 9929 if (SI) { 9930 State.ILV->setDebugLocFromInst(SI); 9931 9932 for (unsigned Part = 0; Part < State.UF; ++Part) { 9933 Instruction *NewSI = nullptr; 9934 Value *StoredVal = State.get(StoredValue, Part); 9935 if (CreateGatherScatter) { 9936 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9937 Value *VectorGep = State.get(getAddr(), Part); 9938 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9939 MaskPart); 9940 } else { 9941 if (Reverse) { 9942 // If we store to reverse consecutive memory locations, then we need 9943 // to reverse the order of elements in the stored value. 9944 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9945 // We don't want to update the value in the map as it might be used in 9946 // another expression. So don't call resetVectorValue(StoredVal). 9947 } 9948 auto *VecPtr = 9949 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9950 if (isMaskRequired) 9951 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9952 BlockInMaskParts[Part]); 9953 else 9954 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9955 } 9956 State.ILV->addMetadata(NewSI, SI); 9957 } 9958 return; 9959 } 9960 9961 // Handle loads. 9962 assert(LI && "Must have a load instruction"); 9963 State.ILV->setDebugLocFromInst(LI); 9964 for (unsigned Part = 0; Part < State.UF; ++Part) { 9965 Value *NewLI; 9966 if (CreateGatherScatter) { 9967 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9968 Value *VectorGep = State.get(getAddr(), Part); 9969 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 9970 nullptr, "wide.masked.gather"); 9971 State.ILV->addMetadata(NewLI, LI); 9972 } else { 9973 auto *VecPtr = 9974 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9975 if (isMaskRequired) 9976 NewLI = Builder.CreateMaskedLoad( 9977 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 9978 PoisonValue::get(DataTy), "wide.masked.load"); 9979 else 9980 NewLI = 9981 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 9982 9983 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
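// Illustrative IR for the reversed, unmasked case with VF = 4 (names
// invented):
//   %wide.load = load <4 x float>, <4 x float>* %vec.ptr, align 4
//   %reverse = shufflevector <4 x float> %wide.load, <4 x float> poison,
//                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// Only the reversed value is recorded for this recipe; the metadata is
// attached to the load itself.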
9984 State.ILV->addMetadata(NewLI, LI); 9985 if (Reverse) 9986 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 9987 } 9988 9989 State.set(getVPSingleValue(), NewLI, Part); 9990 } 9991 } 9992 9993 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9994 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9995 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9996 // for predication. 9997 static ScalarEpilogueLowering getScalarEpilogueLowering( 9998 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9999 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10000 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 10001 LoopVectorizationLegality &LVL) { 10002 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10003 // don't look at hints or options, and don't request a scalar epilogue. 10004 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10005 // LoopAccessInfo (due to code dependency and not being able to reliably get 10006 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10007 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10008 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10009 // back to the old way and vectorize with versioning when forced. See D81345.) 10010 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10011 PGSOQueryType::IRPass) && 10012 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10013 return CM_ScalarEpilogueNotAllowedOptSize; 10014 10015 // 2) If set, obey the directives 10016 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10017 switch (PreferPredicateOverEpilogue) { 10018 case PreferPredicateTy::ScalarEpilogue: 10019 return CM_ScalarEpilogueAllowed; 10020 case PreferPredicateTy::PredicateElseScalarEpilogue: 10021 return CM_ScalarEpilogueNotNeededUsePredicate; 10022 case PreferPredicateTy::PredicateOrDontVectorize: 10023 return CM_ScalarEpilogueNotAllowedUsePredicate; 10024 }; 10025 } 10026 10027 // 3) If set, obey the hints 10028 switch (Hints.getPredicate()) { 10029 case LoopVectorizeHints::FK_Enabled: 10030 return CM_ScalarEpilogueNotNeededUsePredicate; 10031 case LoopVectorizeHints::FK_Disabled: 10032 return CM_ScalarEpilogueAllowed; 10033 }; 10034 10035 // 4) if the TTI hook indicates this is profitable, request predication. 10036 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10037 LVL.getLAI())) 10038 return CM_ScalarEpilogueNotNeededUsePredicate; 10039 10040 return CM_ScalarEpilogueAllowed; 10041 } 10042 10043 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10044 // If Values have been set for this Def return the one relevant for \p Part. 10045 if (hasVectorValue(Def, Part)) 10046 return Data.PerPartOutput[Def][Part]; 10047 10048 if (!hasScalarValue(Def, {Part, 0})) { 10049 Value *IRV = Def->getLiveInIRValue(); 10050 Value *B = ILV->getBroadcastInstrs(IRV); 10051 set(Def, B, Part); 10052 return B; 10053 } 10054 10055 Value *ScalarValue = get(Def, {Part, 0}); 10056 // If we aren't vectorizing, we can just copy the scalar map values over 10057 // to the vector map. 10058 if (VF.isScalar()) { 10059 set(Def, ScalarValue, Part); 10060 return ScalarValue; 10061 } 10062 10063 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10064 bool IsUniform = RepR && RepR->isUniform(); 10065 10066 unsigned LastLane = IsUniform ? 
0 : VF.getKnownMinValue() - 1; 10067 // Check if there is a scalar value for the selected lane. 10068 if (!hasScalarValue(Def, {Part, LastLane})) { 10069 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 10070 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 10071 "unexpected recipe found to be invariant"); 10072 IsUniform = true; 10073 LastLane = 0; 10074 } 10075 10076 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10077 // Set the insert point after the last scalarized instruction or after the 10078 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10079 // will directly follow the scalar definitions. 10080 auto OldIP = Builder.saveIP(); 10081 auto NewIP = 10082 isa<PHINode>(LastInst) 10083 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10084 : std::next(BasicBlock::iterator(LastInst)); 10085 Builder.SetInsertPoint(&*NewIP); 10086 10087 // However, if we are vectorizing, we need to construct the vector values. 10088 // If the value is known to be uniform after vectorization, we can just 10089 // broadcast the scalar value corresponding to lane zero for each unroll 10090 // iteration. Otherwise, we construct the vector values using 10091 // insertelement instructions. Since the resulting vectors are stored in 10092 // State, we will only generate the insertelements once. 10093 Value *VectorValue = nullptr; 10094 if (IsUniform) { 10095 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10096 set(Def, VectorValue, Part); 10097 } else { 10098 // Initialize packing with insertelements to start from undef. 10099 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10100 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10101 set(Def, Undef, Part); 10102 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10103 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10104 VectorValue = get(Def, Part); 10105 } 10106 Builder.restoreIP(OldIP); 10107 return VectorValue; 10108 } 10109 10110 // Process the loop in the VPlan-native vectorization path. This path builds 10111 // VPlan upfront in the vectorization pipeline, which allows to apply 10112 // VPlan-to-VPlan transformations from the very beginning without modifying the 10113 // input LLVM IR. 10114 static bool processLoopInVPlanNativePath( 10115 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10116 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10117 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10118 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10119 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10120 LoopVectorizationRequirements &Requirements) { 10121 10122 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10123 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10124 return false; 10125 } 10126 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10127 Function *F = L->getHeader()->getParent(); 10128 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10129 10130 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10131 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10132 10133 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10134 &Hints, IAI); 10135 // Use the planner for outer loop vectorization. 10136 // TODO: CM is not used at this point inside the planner. 
Turn CM into an 10137 // optional argument if we don't need it in the future. 10138 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10139 Requirements, ORE); 10140 10141 // Get user vectorization factor. 10142 ElementCount UserVF = Hints.getWidth(); 10143 10144 CM.collectElementTypesForWidening(); 10145 10146 // Plan how to best vectorize, return the best VF and its cost. 10147 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10148 10149 // If we are stress testing VPlan builds, do not attempt to generate vector 10150 // code. Masked vector code generation support will follow soon. 10151 // Also, do not attempt to vectorize if no vector code will be produced. 10152 if (VPlanBuildStressTest || EnableVPlanPredication || 10153 VectorizationFactor::Disabled() == VF) 10154 return false; 10155 10156 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10157 10158 { 10159 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10160 F->getParent()->getDataLayout()); 10161 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10162 &CM, BFI, PSI, Checks); 10163 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10164 << L->getHeader()->getParent()->getName() << "\"\n"); 10165 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); 10166 } 10167 10168 // Mark the loop as already vectorized to avoid vectorizing again. 10169 Hints.setAlreadyVectorized(); 10170 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10171 return true; 10172 } 10173 10174 // Emit a remark if there are stores to floats that required a floating point 10175 // extension. If the vectorized loop was generated with floating point there 10176 // will be a performance penalty from the conversion overhead and the change in 10177 // the vector width. 10178 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10179 SmallVector<Instruction *, 4> Worklist; 10180 for (BasicBlock *BB : L->getBlocks()) { 10181 for (Instruction &Inst : *BB) { 10182 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10183 if (S->getValueOperand()->getType()->isFloatTy()) 10184 Worklist.push_back(S); 10185 } 10186 } 10187 } 10188 10189 // Traverse the floating point stores upwards searching, for floating point 10190 // conversions. 10191 SmallPtrSet<const Instruction *, 4> Visited; 10192 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10193 while (!Worklist.empty()) { 10194 auto *I = Worklist.pop_back_val(); 10195 if (!L->contains(I)) 10196 continue; 10197 if (!Visited.insert(I).second) 10198 continue; 10199 10200 // Emit a remark if the floating point store required a floating 10201 // point conversion. 10202 // TODO: More work could be done to identify the root cause such as a 10203 // constant or a function return type and point the user to it. 10204 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10205 ORE->emit([&]() { 10206 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10207 I->getDebugLoc(), L->getHeader()) 10208 << "floating point conversion changes vector width. 
" 10209 << "Mixed floating point precision requires an up/down " 10210 << "cast that will negatively impact performance."; 10211 }); 10212 10213 for (Use &Op : I->operands()) 10214 if (auto *OpI = dyn_cast<Instruction>(Op)) 10215 Worklist.push_back(OpI); 10216 } 10217 } 10218 10219 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10220 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10221 !EnableLoopInterleaving), 10222 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10223 !EnableLoopVectorization) {} 10224 10225 bool LoopVectorizePass::processLoop(Loop *L) { 10226 assert((EnableVPlanNativePath || L->isInnermost()) && 10227 "VPlan-native path is not enabled. Only process inner loops."); 10228 10229 #ifndef NDEBUG 10230 const std::string DebugLocStr = getDebugLocString(L); 10231 #endif /* NDEBUG */ 10232 10233 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 10234 << L->getHeader()->getParent()->getName() << "\" from " 10235 << DebugLocStr << "\n"); 10236 10237 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 10238 10239 LLVM_DEBUG( 10240 dbgs() << "LV: Loop hints:" 10241 << " force=" 10242 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10243 ? "disabled" 10244 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10245 ? "enabled" 10246 : "?")) 10247 << " width=" << Hints.getWidth() 10248 << " interleave=" << Hints.getInterleave() << "\n"); 10249 10250 // Function containing loop 10251 Function *F = L->getHeader()->getParent(); 10252 10253 // Looking at the diagnostic output is the only way to determine if a loop 10254 // was vectorized (other than looking at the IR or machine code), so it 10255 // is important to generate an optimization remark for each loop. Most of 10256 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10257 // generated as OptimizationRemark and OptimizationRemarkMissed are 10258 // less verbose reporting vectorized loops and unvectorized loops that may 10259 // benefit from vectorization, respectively. 10260 10261 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10262 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10263 return false; 10264 } 10265 10266 PredicatedScalarEvolution PSE(*SE, *L); 10267 10268 // Check if it is legal to vectorize the loop. 10269 LoopVectorizationRequirements Requirements; 10270 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10271 &Requirements, &Hints, DB, AC, BFI, PSI); 10272 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10273 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10274 Hints.emitRemarkWithHints(); 10275 return false; 10276 } 10277 10278 // Check the function attributes and profiles to find out if this function 10279 // should be optimized for size. 10280 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10281 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10282 10283 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10284 // here. They may require CFG and instruction level transformations before 10285 // even evaluating whether vectorization is profitable. Since we cannot modify 10286 // the incoming IR, we need to build VPlan upfront in the vectorization 10287 // pipeline. 
10288 if (!L->isInnermost()) 10289 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10290 ORE, BFI, PSI, Hints, Requirements); 10291 10292 assert(L->isInnermost() && "Inner loop expected."); 10293 10294 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10295 // count by optimizing for size, to minimize overheads. 10296 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10297 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10298 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10299 << "This loop is worth vectorizing only if no scalar " 10300 << "iteration overheads are incurred."); 10301 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10302 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10303 else { 10304 LLVM_DEBUG(dbgs() << "\n"); 10305 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10306 } 10307 } 10308 10309 // Check the function attributes to see if implicit floats are allowed. 10310 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10311 // an integer loop and the vector instructions selected are purely integer 10312 // vector instructions? 10313 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10314 reportVectorizationFailure( 10315 "Can't vectorize when the NoImplicitFloat attribute is used", 10316 "loop not vectorized due to NoImplicitFloat attribute", 10317 "NoImplicitFloat", ORE, L); 10318 Hints.emitRemarkWithHints(); 10319 return false; 10320 } 10321 10322 // Check if the target supports potentially unsafe FP vectorization. 10323 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10324 // for the target we're vectorizing for, to make sure none of the 10325 // additional fp-math flags can help. 10326 if (Hints.isPotentiallyUnsafe() && 10327 TTI->isFPVectorizationPotentiallyUnsafe()) { 10328 reportVectorizationFailure( 10329 "Potentially unsafe FP op prevents vectorization", 10330 "loop not vectorized due to unsafe FP support.", 10331 "UnsafeFP", ORE, L); 10332 Hints.emitRemarkWithHints(); 10333 return false; 10334 } 10335 10336 bool AllowOrderedReductions; 10337 // If the flag is set, use that instead and override the TTI behaviour. 10338 if (ForceOrderedReductions.getNumOccurrences() > 0) 10339 AllowOrderedReductions = ForceOrderedReductions; 10340 else 10341 AllowOrderedReductions = TTI->enableOrderedReductions(); 10342 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10343 ORE->emit([&]() { 10344 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10345 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10346 ExactFPMathInst->getDebugLoc(), 10347 ExactFPMathInst->getParent()) 10348 << "loop not vectorized: cannot prove it is safe to reorder " 10349 "floating-point operations"; 10350 }); 10351 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10352 "reorder floating-point operations\n"); 10353 Hints.emitRemarkWithHints(); 10354 return false; 10355 } 10356 10357 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10358 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10359 10360 // If an override option has been passed in for interleaved accesses, use it. 10361 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10362 UseInterleaved = EnableInterleavedMemAccesses; 10363 10364 // Analyze interleaved memory accesses. 
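// Illustrative example of what the interleaving analysis looks for: a loop
// that accesses A[2*i] and A[2*i+1] forms an interleave group with factor 2,
// which can later be lowered to a single wide load plus shufflevectors that
// de-interleave the even and odd elements.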
10365 if (UseInterleaved) {
10366 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10367 }
10368
10369 // Use the cost model.
10370 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10371 F, &Hints, IAI);
10372 CM.collectValuesToIgnore();
10373 CM.collectElementTypesForWidening();
10374
10375 // Use the planner for vectorization.
10376 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10377 Requirements, ORE);
10378
10379 // Get user vectorization factor and interleave count.
10380 ElementCount UserVF = Hints.getWidth();
10381 unsigned UserIC = Hints.getInterleave();
10382
10383 // Plan how to best vectorize, return the best VF and its cost.
10384 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10385
10386 VectorizationFactor VF = VectorizationFactor::Disabled();
10387 unsigned IC = 1;
10388
10389 if (MaybeVF) {
10390 VF = *MaybeVF;
10391 // Select the interleave count.
10392 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10393 }
10394
10395 // Identify the diagnostic messages that should be produced.
10396 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10397 bool VectorizeLoop = true, InterleaveLoop = true;
10398 if (VF.Width.isScalar()) {
10399 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10400 VecDiagMsg = std::make_pair(
10401 "VectorizationNotBeneficial",
10402 "the cost-model indicates that vectorization is not beneficial");
10403 VectorizeLoop = false;
10404 }
10405
10406 if (!MaybeVF && UserIC > 1) {
10407 // Tell the user interleaving was avoided up-front, despite being explicitly
10408 // requested.
10409 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10410 "interleaving should be avoided up front\n");
10411 IntDiagMsg = std::make_pair(
10412 "InterleavingAvoided",
10413 "Ignoring UserIC, because interleaving was avoided up front");
10414 InterleaveLoop = false;
10415 } else if (IC == 1 && UserIC <= 1) {
10416 // Tell the user interleaving is not beneficial.
10417 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10418 IntDiagMsg = std::make_pair(
10419 "InterleavingNotBeneficial",
10420 "the cost-model indicates that interleaving is not beneficial");
10421 InterleaveLoop = false;
10422 if (UserIC == 1) {
10423 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10424 IntDiagMsg.second +=
10425 " and is explicitly disabled or interleave count is set to 1";
10426 }
10427 } else if (IC > 1 && UserIC == 1) {
10428 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10429 LLVM_DEBUG(
10430 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10431 IntDiagMsg = std::make_pair(
10432 "InterleavingBeneficialButDisabled",
10433 "the cost-model indicates that interleaving is beneficial "
10434 "but is explicitly disabled or interleave count is set to 1");
10435 InterleaveLoop = false;
10436 }
10437
10438 // Override IC if user provided an interleave count.
10439 IC = UserIC > 0 ? UserIC : IC;
10440
10441 // Emit diagnostic messages, if any.
10442 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10443 if (!VectorizeLoop && !InterleaveLoop) {
10444 // Neither vectorize nor interleave the loop.
10445 ORE->emit([&]() { 10446 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10447 L->getStartLoc(), L->getHeader()) 10448 << VecDiagMsg.second; 10449 }); 10450 ORE->emit([&]() { 10451 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10452 L->getStartLoc(), L->getHeader()) 10453 << IntDiagMsg.second; 10454 }); 10455 return false; 10456 } else if (!VectorizeLoop && InterleaveLoop) { 10457 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10458 ORE->emit([&]() { 10459 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10460 L->getStartLoc(), L->getHeader()) 10461 << VecDiagMsg.second; 10462 }); 10463 } else if (VectorizeLoop && !InterleaveLoop) { 10464 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10465 << ") in " << DebugLocStr << '\n'); 10466 ORE->emit([&]() { 10467 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10468 L->getStartLoc(), L->getHeader()) 10469 << IntDiagMsg.second; 10470 }); 10471 } else if (VectorizeLoop && InterleaveLoop) { 10472 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10473 << ") in " << DebugLocStr << '\n'); 10474 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10475 } 10476 10477 bool DisableRuntimeUnroll = false; 10478 MDNode *OrigLoopID = L->getLoopID(); 10479 { 10480 // Optimistically generate runtime checks. Drop them if they turn out to not 10481 // be profitable. Limit the scope of Checks, so the cleanup happens 10482 // immediately after vector codegeneration is done. 10483 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10484 F->getParent()->getDataLayout()); 10485 if (!VF.Width.isScalar() || IC > 1) 10486 Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); 10487 10488 using namespace ore; 10489 if (!VectorizeLoop) { 10490 assert(IC > 1 && "interleave count should not be 1 or 0"); 10491 // If we decided that it is not legal to vectorize the loop, then 10492 // interleave it. 10493 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10494 &CM, BFI, PSI, Checks); 10495 10496 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10497 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); 10498 10499 ORE->emit([&]() { 10500 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10501 L->getHeader()) 10502 << "interleaved loop (interleaved count: " 10503 << NV("InterleaveCount", IC) << ")"; 10504 }); 10505 } else { 10506 // If we decided that it is *legal* to vectorize the loop, then do it. 10507 10508 // Consider vectorizing the epilogue too if it's profitable. 10509 VectorizationFactor EpilogueVF = 10510 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10511 if (EpilogueVF.Width.isVector()) { 10512 10513 // The first pass vectorizes the main loop and creates a scalar epilogue 10514 // to be vectorized by executing the plan (potentially with a different 10515 // factor) again shortly afterwards. 10516 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10517 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10518 EPI, &LVL, &CM, BFI, PSI, Checks); 10519 10520 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10521 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10522 DT); 10523 ++LoopsVectorized; 10524 10525 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10526 formLCSSARecursively(*L, *DT, LI, SE); 10527 10528 // Second pass vectorizes the epilogue and adjusts the control flow 10529 // edges from the first pass. 
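// For example (illustrative): if the main loop was vectorized with VF = 8 and
// the epilogue factor chosen above is VF = 4, the remaining iterations are
// vectorized again at the narrower width, and only the final tail (fewer than
// 4 iterations) runs in the scalar remainder loop.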
10530 EPI.MainLoopVF = EPI.EpilogueVF; 10531 EPI.MainLoopUF = EPI.EpilogueUF; 10532 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10533 ORE, EPI, &LVL, &CM, BFI, PSI, 10534 Checks); 10535 10536 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10537 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10538 DT); 10539 ++LoopsEpilogueVectorized; 10540 10541 if (!MainILV.areSafetyChecksAdded()) 10542 DisableRuntimeUnroll = true; 10543 } else { 10544 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10545 &LVL, &CM, BFI, PSI, Checks); 10546 10547 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10548 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10549 ++LoopsVectorized; 10550 10551 // Add metadata to disable runtime unrolling a scalar loop when there 10552 // are no runtime checks about strides and memory. A scalar loop that is 10553 // rarely used is not worth unrolling. 10554 if (!LB.areSafetyChecksAdded()) 10555 DisableRuntimeUnroll = true; 10556 } 10557 // Report the vectorization decision. 10558 ORE->emit([&]() { 10559 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10560 L->getHeader()) 10561 << "vectorized loop (vectorization width: " 10562 << NV("VectorizationFactor", VF.Width) 10563 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10564 }); 10565 } 10566 10567 if (ORE->allowExtraAnalysis(LV_NAME)) 10568 checkMixedPrecision(L, ORE); 10569 } 10570 10571 Optional<MDNode *> RemainderLoopID = 10572 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10573 LLVMLoopVectorizeFollowupEpilogue}); 10574 if (RemainderLoopID.hasValue()) { 10575 L->setLoopID(RemainderLoopID.getValue()); 10576 } else { 10577 if (DisableRuntimeUnroll) 10578 AddRuntimeUnrollDisableMetaData(L); 10579 10580 // Mark the loop as already vectorized to avoid vectorizing again. 10581 Hints.setAlreadyVectorized(); 10582 } 10583 10584 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10585 return true; 10586 } 10587 10588 LoopVectorizeResult LoopVectorizePass::runImpl( 10589 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10590 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10591 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10592 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10593 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10594 SE = &SE_; 10595 LI = &LI_; 10596 TTI = &TTI_; 10597 DT = &DT_; 10598 BFI = &BFI_; 10599 TLI = TLI_; 10600 AA = &AA_; 10601 AC = &AC_; 10602 GetLAA = &GetLAA_; 10603 DB = &DB_; 10604 ORE = &ORE_; 10605 PSI = PSI_; 10606 10607 // Don't attempt if 10608 // 1. the target claims to have no vector registers, and 10609 // 2. interleaving won't help ILP. 10610 // 10611 // The second condition is necessary because, even if the target has no 10612 // vector registers, loop vectorization may still enable scalar 10613 // interleaving. 10614 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10615 TTI->getMaxInterleaveFactor(1) < 2) 10616 return LoopVectorizeResult(false, false); 10617 10618 bool Changed = false, CFGChanged = false; 10619 10620 // The vectorizer requires loops to be in simplified form. 10621 // Since simplification may add new inner loops, it has to run before the 10622 // legality and profitability checks. This means running the loop vectorizer 10623 // will simplify all loops, regardless of whether anything end up being 10624 // vectorized. 
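// Simplified (loop-simplify) form guarantees a preheader, a single backedge
// and dedicated exit blocks, which the legality checks and code generation
// below rely on.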
10625 for (auto &L : *LI)
10626 Changed |= CFGChanged |=
10627 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10628
10629 // Build up a worklist of inner-loops to vectorize. This is necessary as
10630 // the act of vectorizing or partially unrolling a loop creates new loops
10631 // and can invalidate iterators across the loops.
10632 SmallVector<Loop *, 8> Worklist;
10633
10634 for (Loop *L : *LI)
10635 collectSupportedLoops(*L, LI, ORE, Worklist);
10636
10637 LoopsAnalyzed += Worklist.size();
10638
10639 // Now walk the identified inner loops.
10640 while (!Worklist.empty()) {
10641 Loop *L = Worklist.pop_back_val();
10642
10643 // For the inner loops we actually process, form LCSSA to simplify the
10644 // transform.
10645 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10646
10647 Changed |= CFGChanged |= processLoop(L);
10648 }
10649
10650 // Process each loop nest in the function.
10651 return LoopVectorizeResult(Changed, CFGChanged);
10652 }
10653
10654 PreservedAnalyses LoopVectorizePass::run(Function &F,
10655 FunctionAnalysisManager &AM) {
10656 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10657 auto &LI = AM.getResult<LoopAnalysis>(F);
10658 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10659 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10660 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10661 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10662 auto &AA = AM.getResult<AAManager>(F);
10663 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10664 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10665 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10666
10667 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10668 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10669 [&](Loop &L) -> const LoopAccessInfo & {
10670 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
10671 TLI, TTI, nullptr, nullptr, nullptr};
10672 return LAM.getResult<LoopAccessAnalysis>(L, AR);
10673 };
10674 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10675 ProfileSummaryInfo *PSI =
10676 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10677 LoopVectorizeResult Result =
10678 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10679 if (!Result.MadeAnyChange)
10680 return PreservedAnalyses::all();
10681 PreservedAnalyses PA;
10682
10683 // We currently do not preserve LoopInfo/dominator analyses with outer loop
10684 // vectorization. Until this is addressed, mark these analyses as preserved
10685 // only for the non-VPlan-native path.
10686 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10687 if (!EnableVPlanNativePath) {
10688 PA.preserve<LoopAnalysis>();
10689 PA.preserve<DominatorTreeAnalysis>();
10690 }
10691
10692 if (Result.MadeCFGChange) {
10693 // Making CFG changes likely means a loop got vectorized. Indicate that
10694 // extra simplification passes should be run.
10695 // TODO: MadeCFGChanges is not a perfect proxy. Extra passes should only
10696 // be run if runtime checks have been added.
10697 AM.getResult<ShouldRunExtraVectorPasses>(F); 10698 PA.preserve<ShouldRunExtraVectorPasses>(); 10699 } else { 10700 PA.preserveSet<CFGAnalyses>(); 10701 } 10702 return PA; 10703 } 10704 10705 void LoopVectorizePass::printPipeline( 10706 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { 10707 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( 10708 OS, MapClassName2PassName); 10709 10710 OS << "<"; 10711 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;"; 10712 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;"; 10713 OS << ">"; 10714 } 10715