1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops 10 // and generates target-independent LLVM-IR. 11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs 12 // of instructions in order to estimate the profitability of vectorization. 13 // 14 // The loop vectorizer combines consecutive loop iterations into a single 15 // 'wide' iteration. After this transformation the index is incremented 16 // by the SIMD vector width, and not by one. 17 // 18 // This pass has three parts: 19 // 1. The main loop pass that drives the different parts. 20 // 2. LoopVectorizationLegality - A unit that checks for the legality 21 // of the vectorization. 22 // 3. InnerLoopVectorizer - A unit that performs the actual 23 // widening of instructions. 24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability 25 // of vectorization. It decides on the optimal vector width, which 26 // can be one, if vectorization is not profitable. 27 // 28 // There is a development effort going on to migrate loop vectorizer to the 29 // VPlan infrastructure and to introduce outer loop vectorization support (see 30 // docs/Proposal/VectorizationPlan.rst and 31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this 32 // purpose, we temporarily introduced the VPlan-native vectorization path: an 33 // alternative vectorization path that is natively implemented on top of the 34 // VPlan infrastructure. See EnableVPlanNativePath for enabling. 
35 // 36 //===----------------------------------------------------------------------===// 37 // 38 // The reduction-variable vectorization is based on the paper: 39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. 40 // 41 // Variable uniformity checks are inspired by: 42 // Karrenberg, R. and Hack, S. Whole Function Vectorization. 43 // 44 // The interleaved access vectorization is based on the paper: 45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved 46 // Data for SIMD 47 // 48 // Other ideas/concepts are from: 49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. 50 // 51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of 52 // Vectorizing Compilers. 53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanHCFGBuilder.h" 61 #include "VPlanPredicator.h" 62 #include "VPlanTransforms.h" 63 #include "llvm/ADT/APInt.h" 64 #include "llvm/ADT/ArrayRef.h" 65 #include "llvm/ADT/DenseMap.h" 66 #include "llvm/ADT/DenseMapInfo.h" 67 #include "llvm/ADT/Hashing.h" 68 #include "llvm/ADT/MapVector.h" 69 #include "llvm/ADT/None.h" 70 #include "llvm/ADT/Optional.h" 71 #include "llvm/ADT/STLExtras.h" 72 #include "llvm/ADT/SmallPtrSet.h" 73 #include "llvm/ADT/SmallSet.h" 74 #include "llvm/ADT/SmallVector.h" 75 #include "llvm/ADT/Statistic.h" 76 #include "llvm/ADT/StringRef.h" 77 #include "llvm/ADT/Twine.h" 78 #include "llvm/ADT/iterator_range.h" 79 #include "llvm/Analysis/AssumptionCache.h" 80 #include "llvm/Analysis/BasicAliasAnalysis.h" 81 #include "llvm/Analysis/BlockFrequencyInfo.h" 82 #include "llvm/Analysis/CFG.h" 83 #include "llvm/Analysis/CodeMetrics.h" 84 #include "llvm/Analysis/DemandedBits.h" 85 #include "llvm/Analysis/GlobalsModRef.h" 86 #include "llvm/Analysis/LoopAccessAnalysis.h" 
87 #include "llvm/Analysis/LoopAnalysisManager.h" 88 #include "llvm/Analysis/LoopInfo.h" 89 #include "llvm/Analysis/LoopIterator.h" 90 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 91 #include "llvm/Analysis/ProfileSummaryInfo.h" 92 #include "llvm/Analysis/ScalarEvolution.h" 93 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 94 #include "llvm/Analysis/TargetLibraryInfo.h" 95 #include "llvm/Analysis/TargetTransformInfo.h" 96 #include "llvm/Analysis/VectorUtils.h" 97 #include "llvm/IR/Attributes.h" 98 #include "llvm/IR/BasicBlock.h" 99 #include "llvm/IR/CFG.h" 100 #include "llvm/IR/Constant.h" 101 #include "llvm/IR/Constants.h" 102 #include "llvm/IR/DataLayout.h" 103 #include "llvm/IR/DebugInfoMetadata.h" 104 #include "llvm/IR/DebugLoc.h" 105 #include "llvm/IR/DerivedTypes.h" 106 #include "llvm/IR/DiagnosticInfo.h" 107 #include "llvm/IR/Dominators.h" 108 #include "llvm/IR/Function.h" 109 #include "llvm/IR/IRBuilder.h" 110 #include "llvm/IR/InstrTypes.h" 111 #include "llvm/IR/Instruction.h" 112 #include "llvm/IR/Instructions.h" 113 #include "llvm/IR/IntrinsicInst.h" 114 #include "llvm/IR/Intrinsics.h" 115 #include "llvm/IR/LLVMContext.h" 116 #include "llvm/IR/Metadata.h" 117 #include "llvm/IR/Module.h" 118 #include "llvm/IR/Operator.h" 119 #include "llvm/IR/PatternMatch.h" 120 #include "llvm/IR/Type.h" 121 #include "llvm/IR/Use.h" 122 #include "llvm/IR/User.h" 123 #include "llvm/IR/Value.h" 124 #include "llvm/IR/ValueHandle.h" 125 #include "llvm/IR/Verifier.h" 126 #include "llvm/InitializePasses.h" 127 #include "llvm/Pass.h" 128 #include "llvm/Support/Casting.h" 129 #include "llvm/Support/CommandLine.h" 130 #include "llvm/Support/Compiler.h" 131 #include "llvm/Support/Debug.h" 132 #include "llvm/Support/ErrorHandling.h" 133 #include "llvm/Support/InstructionCost.h" 134 #include "llvm/Support/MathExtras.h" 135 #include "llvm/Support/raw_ostream.h" 136 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 137 #include 
"llvm/Transforms/Utils/InjectTLIMappings.h" 138 #include "llvm/Transforms/Utils/LoopSimplify.h" 139 #include "llvm/Transforms/Utils/LoopUtils.h" 140 #include "llvm/Transforms/Utils/LoopVersioning.h" 141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 142 #include "llvm/Transforms/Utils/SizeOpts.h" 143 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 
cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks with a " 204 "vectorize(enable) pragma.")); 205 206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 207 // that predication is preferred, and this lists all options. I.e., the 208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 209 // and predicate the instructions accordingly. 
If tail-folding fails, there are 210 // different fallback strategies depending on these values: 211 namespace PreferPredicateTy { 212 enum Option { 213 ScalarEpilogue = 0, 214 PredicateElseScalarEpilogue, 215 PredicateOrDontVectorize 216 }; 217 } // namespace PreferPredicateTy 218 219 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( 220 "prefer-predicate-over-epilogue", 221 cl::init(PreferPredicateTy::ScalarEpilogue), 222 cl::Hidden, 223 cl::desc("Tail-folding and predication preferences over creating a scalar " 224 "epilogue loop."), 225 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, 226 "scalar-epilogue", 227 "Don't tail-predicate loops, create scalar epilogue"), 228 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, 229 "predicate-else-scalar-epilogue", 230 "prefer tail-folding, create scalar epilogue if tail " 231 "folding fails."), 232 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, 233 "predicate-dont-vectorize", 234 "prefers tail-folding, don't attempt vectorization if " 235 "tail-folding fails."))); 236 237 static cl::opt<bool> MaximizeBandwidth( 238 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, 239 cl::desc("Maximize bandwidth when selecting vectorization factor which " 240 "will be determined by the smallest type in loop.")); 241 242 static cl::opt<bool> EnableInterleavedMemAccesses( 243 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, 244 cl::desc("Enable vectorization on interleaved memory accesses in a loop")); 245 246 /// An interleave-group may need masking if it resides in a block that needs 247 /// predication, or in order to mask away gaps. 
248 static cl::opt<bool> EnableMaskedInterleavedMemAccesses( 249 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, 250 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); 251 252 static cl::opt<unsigned> TinyTripCountInterleaveThreshold( 253 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, 254 cl::desc("We don't interleave loops with a estimated constant trip count " 255 "below this number")); 256 257 static cl::opt<unsigned> ForceTargetNumScalarRegs( 258 "force-target-num-scalar-regs", cl::init(0), cl::Hidden, 259 cl::desc("A flag that overrides the target's number of scalar registers.")); 260 261 static cl::opt<unsigned> ForceTargetNumVectorRegs( 262 "force-target-num-vector-regs", cl::init(0), cl::Hidden, 263 cl::desc("A flag that overrides the target's number of vector registers.")); 264 265 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( 266 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, 267 cl::desc("A flag that overrides the target's max interleave factor for " 268 "scalar loops.")); 269 270 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( 271 "force-target-max-vector-interleave", cl::init(0), cl::Hidden, 272 cl::desc("A flag that overrides the target's max interleave factor for " 273 "vectorized loops.")); 274 275 static cl::opt<unsigned> ForceTargetInstructionCost( 276 "force-target-instruction-cost", cl::init(0), cl::Hidden, 277 cl::desc("A flag that overrides the target's expected cost for " 278 "an instruction to a single constant value. Mostly " 279 "useful for getting consistent testing.")); 280 281 static cl::opt<bool> ForceTargetSupportsScalableVectors( 282 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 283 cl::desc( 284 "Pretend that scalable vectors are supported, even if the target does " 285 "not support them. 
This flag should only be used for testing.")); 286 287 static cl::opt<unsigned> SmallLoopCost( 288 "small-loop-cost", cl::init(20), cl::Hidden, 289 cl::desc( 290 "The cost of a loop that is considered 'small' by the interleaver.")); 291 292 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 293 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 294 cl::desc("Enable the use of the block frequency analysis to access PGO " 295 "heuristics minimizing code growth in cold regions and being more " 296 "aggressive in hot regions.")); 297 298 // Runtime interleave loops for load/store throughput. 299 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 300 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 301 cl::desc( 302 "Enable runtime interleaving until load/store ports are saturated")); 303 304 /// Interleave small loops with scalar reductions. 305 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 306 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 307 cl::desc("Enable interleaving for loops with small iteration counts that " 308 "contain scalar reductions to expose ILP.")); 309 310 /// The number of stores in a loop that are allowed to need predication. 
311 static cl::opt<unsigned> NumberOfStoresToPredicate( 312 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 313 cl::desc("Max number of stores to be predicated behind an if.")); 314 315 static cl::opt<bool> EnableIndVarRegisterHeur( 316 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 317 cl::desc("Count the induction variable only once when interleaving")); 318 319 static cl::opt<bool> EnableCondStoresVectorization( 320 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 321 cl::desc("Enable if predication of stores during vectorization.")); 322 323 static cl::opt<unsigned> MaxNestedScalarReductionIC( 324 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 325 cl::desc("The maximum interleave count to use when interleaving a scalar " 326 "reduction in a nested loop.")); 327 328 static cl::opt<bool> 329 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 330 cl::Hidden, 331 cl::desc("Prefer in-loop vector reductions, " 332 "overriding the targets preference.")); 333 334 static cl::opt<bool> ForceOrderedReductions( 335 "force-ordered-reductions", cl::init(false), cl::Hidden, 336 cl::desc("Enable the vectorisation of loops with in-order (strict) " 337 "FP reductions")); 338 339 static cl::opt<bool> PreferPredicatedReductionSelect( 340 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 341 cl::desc( 342 "Prefer predicating a reduction operation over an after loop select.")); 343 344 cl::opt<bool> EnableVPlanNativePath( 345 "enable-vplan-native-path", cl::init(false), cl::Hidden, 346 cl::desc("Enable VPlan-native vectorization path with " 347 "support for outer loop vectorization.")); 348 349 // FIXME: Remove this switch once we have divergence analysis. Currently we 350 // assume divergent non-backedge branches when this switch is true. 
351 cl::opt<bool> EnableVPlanPredication( 352 "enable-vplan-predication", cl::init(false), cl::Hidden, 353 cl::desc("Enable VPlan-native vectorization path predicator with " 354 "support for outer loop vectorization.")); 355 356 // This flag enables the stress testing of the VPlan H-CFG construction in the 357 // VPlan-native vectorization path. It must be used in conjuction with 358 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the 359 // verification of the H-CFGs built. 360 static cl::opt<bool> VPlanBuildStressTest( 361 "vplan-build-stress-test", cl::init(false), cl::Hidden, 362 cl::desc( 363 "Build VPlan for every supported loop nest in the function and bail " 364 "out right after the build (stress test the VPlan H-CFG construction " 365 "in the VPlan-native vectorization path).")); 366 367 cl::opt<bool> llvm::EnableLoopInterleaving( 368 "interleave-loops", cl::init(true), cl::Hidden, 369 cl::desc("Enable loop interleaving in Loop vectorization passes")); 370 cl::opt<bool> llvm::EnableLoopVectorization( 371 "vectorize-loops", cl::init(true), cl::Hidden, 372 cl::desc("Run the Loop vectorization passes")); 373 374 cl::opt<bool> PrintVPlansInDotFormat( 375 "vplan-print-in-dot-format", cl::init(false), cl::Hidden, 376 cl::desc("Use dot format instead of plain text when dumping VPlans")); 377 378 /// A helper function that returns true if the given type is irregular. The 379 /// type is irregular if its allocated size doesn't equal the store size of an 380 /// element of the corresponding vector type. 381 static bool hasIrregularType(Type *Ty, const DataLayout &DL) { 382 // Determine if an array of N elements of type Ty is "bitcast compatible" 383 // with a <N x Ty> vector. 384 // This is only true if there is no padding between the array elements. 385 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); 386 } 387 388 /// A helper function that returns the reciprocal of the block probability of 389 /// predicated blocks. 
If we return X, we are assuming the predicated block 390 /// will execute once for every X iterations of the loop header. 391 /// 392 /// TODO: We should use actual block probability here, if available. Currently, 393 /// we always assume predicated blocks have a 50% chance of executing. 394 static unsigned getReciprocalPredBlockProb() { return 2; } 395 396 /// A helper function that returns an integer or floating-point constant with 397 /// value C. 398 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { 399 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) 400 : ConstantFP::get(Ty, C); 401 } 402 403 /// Returns "best known" trip count for the specified loop \p L as defined by 404 /// the following procedure: 405 /// 1) Returns exact trip count if it is known. 406 /// 2) Returns expected trip count according to profile data if any. 407 /// 3) Returns upper bound estimate if it is known. 408 /// 4) Returns None if all of the above failed. 409 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { 410 // Check if exact trip count is known. 411 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) 412 return ExpectedTC; 413 414 // Check if there is an expected trip count available from profile data. 415 if (LoopVectorizeWithBlockFrequency) 416 if (auto EstimatedTC = getLoopEstimatedTripCount(L)) 417 return EstimatedTC; 418 419 // Check if upper bound estimate is known. 420 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) 421 return ExpectedTC; 422 423 return None; 424 } 425 426 // Forward declare GeneratedRTChecks. 427 class GeneratedRTChecks; 428 429 namespace llvm { 430 431 AnalysisKey ShouldRunExtraVectorPasses::Key; 432 433 /// InnerLoopVectorizer vectorizes loops which contain only one basic 434 /// block to a specified vectorization factor (VF). 435 /// This class performs the widening of scalars into vectors, or multiple 436 /// scalars. 
This class also implements the following features: 437 /// * It inserts an epilogue loop for handling loops that don't have iteration 438 /// counts that are known to be a multiple of the vectorization factor. 439 /// * It handles the code generation for reduction variables. 440 /// * Scalarization (implementation using scalars) of un-vectorizable 441 /// instructions. 442 /// InnerLoopVectorizer does not perform any vectorization-legality 443 /// checks, and relies on the caller to check for the different legality 444 /// aspects. The InnerLoopVectorizer relies on the 445 /// LoopVectorizationLegality class to provide information about the induction 446 /// and reduction variables that were found to a given vectorization factor. 447 class InnerLoopVectorizer { 448 public: 449 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 450 LoopInfo *LI, DominatorTree *DT, 451 const TargetLibraryInfo *TLI, 452 const TargetTransformInfo *TTI, AssumptionCache *AC, 453 OptimizationRemarkEmitter *ORE, ElementCount VecWidth, 454 unsigned UnrollFactor, LoopVectorizationLegality *LVL, 455 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 456 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) 457 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), 458 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), 459 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), 460 PSI(PSI), RTChecks(RTChecks) { 461 // Query this against the original loop and save it here because the profile 462 // of the original loop header may change as the transformation happens. 463 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( 464 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); 465 } 466 467 virtual ~InnerLoopVectorizer() = default; 468 469 /// Create a new empty loop that will contain vectorized instructions later 470 /// on, while the old loop will be used as the scalar remainder. 
Control flow 471 /// is generated around the vectorized (and scalar epilogue) loops consisting 472 /// of various checks and bypasses. Return the pre-header block of the new 473 /// loop. 474 /// In the case of epilogue vectorization, this function is overriden to 475 /// handle the more complex control flow around the loops. 476 virtual BasicBlock *createVectorizedLoopSkeleton(); 477 478 /// Widen a single call instruction within the innermost loop. 479 void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, 480 VPTransformState &State); 481 482 /// Fix the vectorized code, taking care of header phi's, live-outs, and more. 483 void fixVectorizedLoop(VPTransformState &State); 484 485 // Return true if any runtime check is added. 486 bool areSafetyChecksAdded() { return AddedSafetyChecks; } 487 488 /// A type for vectorized values in the new loop. Each value from the 489 /// original loop, when vectorized, is represented by UF vector values in the 490 /// new unrolled loop, where UF is the unroll factor. 491 using VectorParts = SmallVector<Value *, 2>; 492 493 /// Vectorize a single first-order recurrence or pointer induction PHINode in 494 /// a block. This method handles the induction variable canonicalization. It 495 /// supports both VF = 1 for unrolled loops and arbitrary length vectors. 496 void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR, 497 VPTransformState &State); 498 499 /// A helper function to scalarize a single Instruction in the innermost loop. 500 /// Generates a sequence of scalar instances for each lane between \p MinLane 501 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 502 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p 503 /// Instr's operands. 
504 void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe, 505 const VPIteration &Instance, bool IfPredicateInstr, 506 VPTransformState &State); 507 508 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc 509 /// is provided, the integer induction variable will first be truncated to 510 /// the corresponding type. 511 void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID, 512 Value *Start, TruncInst *Trunc, VPValue *Def, 513 VPTransformState &State); 514 515 /// Construct the vector value of a scalarized value \p V one lane at a time. 516 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, 517 VPTransformState &State); 518 519 /// Try to vectorize interleaved access group \p Group with the base address 520 /// given in \p Addr, optionally masking the vector operations if \p 521 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 522 /// values in the vectorized loop. 523 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 524 ArrayRef<VPValue *> VPDefs, 525 VPTransformState &State, VPValue *Addr, 526 ArrayRef<VPValue *> StoredValues, 527 VPValue *BlockInMask = nullptr); 528 529 /// Set the debug location in the builder \p Ptr using the debug location in 530 /// \p V. If \p Ptr is None then it uses the class member's Builder. 531 void setDebugLocFromInst(const Value *V, 532 Optional<IRBuilder<> *> CustomBuilder = None); 533 534 /// Fix the non-induction PHIs in the OrigPHIsToFix vector. 535 void fixNonInductionPHIs(VPTransformState &State); 536 537 /// Returns true if the reordering of FP operations is not allowed, but we are 538 /// able to vectorize with strict in-order reductions for the given RdxDesc. 539 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); 540 541 /// Create a broadcast instruction. This method generates a broadcast 542 /// instruction (shuffle) for loop invariant values and for the induction 543 /// value. 
If this is the induction variable then we extend it to N, N+1, ... 544 /// this is needed because each iteration in the loop corresponds to a SIMD 545 /// element. 546 virtual Value *getBroadcastInstrs(Value *V); 547 548 /// Add metadata from one instruction to another. 549 /// 550 /// This includes both the original MDs from \p From and additional ones (\see 551 /// addNewMetadata). Use this for *newly created* instructions in the vector 552 /// loop. 553 void addMetadata(Instruction *To, Instruction *From); 554 555 /// Similar to the previous function but it adds the metadata to a 556 /// vector of instructions. 557 void addMetadata(ArrayRef<Value *> To, Instruction *From); 558 559 protected: 560 friend class LoopVectorizationPlanner; 561 562 /// A small list of PHINodes. 563 using PhiVector = SmallVector<PHINode *, 4>; 564 565 /// A type for scalarized values in the new loop. Each value from the 566 /// original loop, when scalarized, is represented by UF x VF scalar values 567 /// in the new unrolled loop, where UF is the unroll factor and VF is the 568 /// vectorization factor. 569 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 570 571 /// Set up the values of the IVs correctly when exiting the vector loop. 572 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 573 Value *CountRoundDown, Value *EndValue, 574 BasicBlock *MiddleBlock); 575 576 /// Create a new induction variable inside L. 577 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, 578 Value *Step, Instruction *DL); 579 580 /// Handle all cross-iteration phis in the header. 581 void fixCrossIterationPHIs(VPTransformState &State); 582 583 /// Create the exit value of first order recurrences in the middle block and 584 /// update their users. 585 void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State); 586 587 /// Create code for the loop exit value of the reduction. 
588 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); 589 590 /// Clear NSW/NUW flags from reduction instructions if necessary. 591 void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 592 VPTransformState &State); 593 594 /// Fixup the LCSSA phi nodes in the unique exit block. This simply 595 /// means we need to add the appropriate incoming value from the middle 596 /// block as exiting edges from the scalar epilogue loop (if present) are 597 /// already in place, and we exit the vector loop exclusively to the middle 598 /// block. 599 void fixLCSSAPHIs(VPTransformState &State); 600 601 /// Iteratively sink the scalarized operands of a predicated instruction into 602 /// the block that was created for it. 603 void sinkScalarOperands(Instruction *PredInst); 604 605 /// Shrinks vector element sizes to the smallest bitwidth they can be legally 606 /// represented as. 607 void truncateToMinimalBitwidths(VPTransformState &State); 608 609 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 610 /// variable on which to base the steps, \p Step is the size of the step, and 611 /// \p EntryVal is the value from the original loop that maps to the steps. 612 /// Note that \p EntryVal doesn't have to be an induction variable - it 613 /// can also be a truncate instruction. 614 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, 615 const InductionDescriptor &ID, VPValue *Def, 616 VPTransformState &State); 617 618 /// Create a vector induction phi node based on an existing scalar one. \p 619 /// EntryVal is the value from the original loop that maps to the vector phi 620 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a 621 /// truncate instruction, instead of widening the original IV, we widen a 622 /// version of the IV truncated to \p EntryVal's type. 
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID,
                              BasicBlock *VectorHeader) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (eg. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Collect poison-generating recipes that may generate a poison value that is
  /// used after vectorization, even when their operands are not poison. Those
  /// recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

/// A degenerate vectorizer with a fixed vectorization factor of one
/// (ElementCount::getFixed(1)), i.e. it only unrolls the scalar loop by
/// \p UnrollFactor without widening any instructions.
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  /// VF and UF chosen for the main vector loop.
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  /// VF and UF chosen for the epilogue vector loop (EpilogueUF is asserted to
  /// be 1 in the constructor).
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  /// Blocks holding the iteration-count and safety checks, filled in while the
  /// skeleton is built so the second (epilogue) pass can reuse them.
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  /// Trip counts shared between the two vectorization passes.
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to setup the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      // The base vectorizer runs with the *main* loop's VF/UF; the epilogue
      // pass is driven separately through the EPI state below.
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (ie the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (ie the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  // A default-constructed DebugLoc compares equal to "no location".
  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  // Fall back to the first operand instruction that carries a location.
  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  // No location found anywhere; return the original instruction.
  return I;
}

/// Set the builder's current debug location from \p V (if it is an
/// Instruction with a location), scaling the discriminator by UF * VF when
/// profiling-driven debug info is in use. \p CustomBuilder selects an
/// alternative builder; by default the member Builder is used.
void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
  IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When a FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    // Not an instruction: clear the builder's location.
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

/// Return a value for Step multiplied by VF. For scalable VFs the constant
/// product is further multiplied by the runtime vscale.
static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
                              int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

/// Return the runtime value for VF: the known minimum element count,
/// multiplied by vscale when \p VF is scalable.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

/// Same as getRuntimeVF, but converted (uitofp) to the floating point type
/// \p FTy.
static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  // Use an integer type of the same bit width for the intermediate VF value.
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

/// Report a vectorization failure: print \p DebugMsg to the debug stream and
/// emit an optimization-remark analysis with \p OREMsg, tagged \p ORETag,
/// attached to \p I if non-null, otherwise to \p TheLoop.
void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

/// Report an informational vectorization message \p Msg, both to the debug
/// stream and as an optimization remark tagged \p ORETag.
void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      // Visited is shared across all slices so a recipe reachable from two
      // roots is only processed once.
      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        // Only consecutive (non-gather/scatter) accesses whose underlying
        // instruction sits in a block that needs predication are of interest.
        if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
            Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            // Members may be null for gaps in the group.
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}

/// Copy metadata from \p From to \p To, including the memcheck-based no-alias
/// annotations (addNewMetadata).
void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

/// Same as above, applied to every Instruction in \p To (non-instruction
/// values are skipped).
void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
1251 struct ElementCountComparator { 1252 bool operator()(const ElementCount &LHS, const ElementCount &RHS) const { 1253 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < 1254 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); 1255 } 1256 }; 1257 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>; 1258 1259 /// LoopVectorizationCostModel - estimates the expected speedups due to 1260 /// vectorization. 1261 /// In many cases vectorization is not profitable. This can happen because of 1262 /// a number of reasons. In this class we mainly attempt to predict the 1263 /// expected speedup/slowdowns due to the supported instruction set. We use the 1264 /// TargetTransformInfo to query the different backends for the cost of 1265 /// different operations. 1266 class LoopVectorizationCostModel { 1267 public: 1268 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 1269 PredicatedScalarEvolution &PSE, LoopInfo *LI, 1270 LoopVectorizationLegality *Legal, 1271 const TargetTransformInfo &TTI, 1272 const TargetLibraryInfo *TLI, DemandedBits *DB, 1273 AssumptionCache *AC, 1274 OptimizationRemarkEmitter *ORE, const Function *F, 1275 const LoopVectorizeHints *Hints, 1276 InterleavedAccessInfo &IAI) 1277 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 1278 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 1279 Hints(Hints), InterleaveInfo(IAI) {} 1280 1281 /// \return An upper bound for the vectorization factors (both fixed and 1282 /// scalable). If the factors are 0, vectorization and interleaving should be 1283 /// avoided up front. 1284 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC); 1285 1286 /// \return True if runtime checks are required for vectorization, and false 1287 /// otherwise. 1288 bool runtimeChecksRequired(); 1289 1290 /// \return The most profitable vectorization factor and the cost of that VF. 
1291 /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO 1292 /// then this vectorization factor will be selected if vectorization is 1293 /// possible. 1294 VectorizationFactor 1295 selectVectorizationFactor(const ElementCountSet &CandidateVFs); 1296 1297 VectorizationFactor 1298 selectEpilogueVectorizationFactor(const ElementCount MaxVF, 1299 const LoopVectorizationPlanner &LVP); 1300 1301 /// Setup cost-based decisions for user vectorization factor. 1302 /// \return true if the UserVF is a feasible VF to be chosen. 1303 bool selectUserVectorizationFactor(ElementCount UserVF) { 1304 collectUniformsAndScalars(UserVF); 1305 collectInstsToScalarize(UserVF); 1306 return expectedCost(UserVF).first.isValid(); 1307 } 1308 1309 /// \return The size (in bits) of the smallest and widest types in the code 1310 /// that needs to be vectorized. We ignore values that remain scalar such as 1311 /// 64 bit loop indices. 1312 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1313 1314 /// \return The desired interleave count. 1315 /// If interleave count has been specified by metadata it will be returned. 1316 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1317 /// are the selected vectorization factor and the cost of the selected VF. 1318 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); 1319 1320 /// Memory access instruction may be vectorized in more than one way. 1321 /// Form of instruction after vectorization depends on cost. 1322 /// This function takes cost-based decisions for Load/Store instructions 1323 /// and collects them in a map. This decisions map is used for building 1324 /// the lists of loop-uniform and loop-scalar instructions. 1325 /// The calculated cost is saved with widening decision in order to 1326 /// avoid redundant calculations. 
1327 void setCostBasedWideningDecision(ElementCount VF); 1328 1329 /// A struct that represents some properties of the register usage 1330 /// of a loop. 1331 struct RegisterUsage { 1332 /// Holds the number of loop invariant values that are used in the loop. 1333 /// The key is ClassID of target-provided register class. 1334 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1335 /// Holds the maximum number of concurrent live intervals in the loop. 1336 /// The key is ClassID of target-provided register class. 1337 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1338 }; 1339 1340 /// \return Returns information about the register usages of the loop for the 1341 /// given vectorization factors. 1342 SmallVector<RegisterUsage, 8> 1343 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1344 1345 /// Collect values we want to ignore in the cost model. 1346 void collectValuesToIgnore(); 1347 1348 /// Collect all element types in the loop for which widening is needed. 1349 void collectElementTypesForWidening(); 1350 1351 /// Split reductions into those that happen in the loop, and those that happen 1352 /// outside. In loop reductions are collected into InLoopReductionChains. 1353 void collectInLoopReductions(); 1354 1355 /// Returns true if we should use strict in-order reductions for the given 1356 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1357 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1358 /// of FP operations. 1359 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { 1360 return !Hints->allowReordering() && RdxDesc.isOrdered(); 1361 } 1362 1363 /// \returns The smallest bitwidth each instruction can be represented with. 1364 /// The vector equivalents of these instructions should be truncated to this 1365 /// type. 
1366 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1367 return MinBWs; 1368 } 1369 1370 /// \returns True if it is more profitable to scalarize instruction \p I for 1371 /// vectorization factor \p VF. 1372 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1373 assert(VF.isVector() && 1374 "Profitable to scalarize relevant only for VF > 1."); 1375 1376 // Cost model is not run in the VPlan-native path - return conservative 1377 // result until this changes. 1378 if (EnableVPlanNativePath) 1379 return false; 1380 1381 auto Scalars = InstsToScalarize.find(VF); 1382 assert(Scalars != InstsToScalarize.end() && 1383 "VF not yet analyzed for scalarization profitability"); 1384 return Scalars->second.find(I) != Scalars->second.end(); 1385 } 1386 1387 /// Returns true if \p I is known to be uniform after vectorization. 1388 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1389 if (VF.isScalar()) 1390 return true; 1391 1392 // Cost model is not run in the VPlan-native path - return conservative 1393 // result until this changes. 1394 if (EnableVPlanNativePath) 1395 return false; 1396 1397 auto UniformsPerVF = Uniforms.find(VF); 1398 assert(UniformsPerVF != Uniforms.end() && 1399 "VF not yet analyzed for uniformity"); 1400 return UniformsPerVF->second.count(I); 1401 } 1402 1403 /// Returns true if \p I is known to be scalar after vectorization. 1404 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1405 if (VF.isScalar()) 1406 return true; 1407 1408 // Cost model is not run in the VPlan-native path - return conservative 1409 // result until this changes. 
    // NOTE(review): this is the tail of a per-VF scalar-ness query whose head
    // lies above this chunk; it consults the Scalars map filled in by
    // collectLoopScalars. The VPlan-native path does not compute Scalars, so
    // conservatively answer false there.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF. This holds only if a minimal bitwidth was
  /// recorded for \p I (MinBWs) and \p I will actually be vectorized (neither
  /// profitably scalarized nor scalar after vectorization).
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,       // No decision taken yet.
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group, but
    // assign the cost only to the insert-position member; every other member
    // gets a zero cost so the group's cost is counted exactly once.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF. A decision must already have been recorded via
  /// setWideningDecision (asserted below).
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once per VF; a scalar VF has nothing to collect.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
    // Only consecutive accesses are candidates for a widened masked store;
    // non-consecutive accesses would need gather/scatter or scalarization.
    return Legal->isConsecutivePtr(DataType, Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
    return Legal->isConsecutivePtr(DataType, Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation. Only loads and stores are candidates.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getLoadStoreType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
           (SI && TTI.isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if the target machine supports all of the reduction
  /// variables found for the given VF.
  bool canVectorizeReductions(ElementCount VF) const {
    return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
      const RecurrenceDescriptor &RdxDesc = Reduction.second;
      return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
    }));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I) const;

  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) {
    // When we know the load is uniform and the original scalar loop was not
    // predicated we don't need to mark it as a predicated instruction. Any
    // vectorised blocks created when tail-folding are something artificial we
    // have introduced and we know there is always at least one active lane.
    // That's why we call Legal->blockNeedsPredication here because it doesn't
    // query tail-folding.
    if (IsKnownUniform && isa<LoadInst>(I) &&
        !Legal->blockNeedsPredication(I->getParent()))
      return false;
    if (!blockNeedsPredicationForAnyReason(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue(ElementCount VF) const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    // Otherwise only the interleave-access analysis can still force one, and
    // only when we actually vectorize.
    return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  /// Returns true if the instructions in this block requires predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
  bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized -
  /// i.e. either vector version isn't available, or is too expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  /// Number of predicated stores seen by the cost model.
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
                                           ElementCount UserVF,
                                           bool FoldTailByMasking);

  /// \return the maximized element count based on the targets vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
  /// D98509). The issue is currently under investigation and this workaround
  /// will be removed as soon as possible.
  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       const ElementCount &MaxSafeVF,
                                       bool FoldTailByMasking);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width. If \p Invalid is not nullptr, this function
  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
  /// each instruction that has an Invalid cost for the given VF.
  using InstructionVFPair = std::pair<Instruction *, ElementCount>;
  VectorizationCostTy
  expectedCost(ElementCount VF,
               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  Optional<InstructionCost>
  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of loop are to be masked to fold tail of scalar iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom
  ReductionChainMap InLoopReductionChains;

  /// A Map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// vplan. This was added to allow quick lookup to the inloop operations,
  /// without having to loop through InLoopReductionChains.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(ElementCount VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. collectLoopScalars should only add non-uniform nodes
  /// to the list if they are used by a load/store instruction that is marked as
  /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
  /// VF values in the vectorized loop, each corresponding to an iteration of
  /// the original scalar loop.
  void collectLoopScalars(ElementCount VF);

  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                                std::pair<InstWidening, InstructionCost>>;

  DecisionList WideningDecisions;

  /// Returns true if \p V is expected to be vectorized and it needs to be
  /// extracted.
  bool needsExtract(Value *V, ElementCount VF) const {
    Instruction *I = dyn_cast<Instruction>(V);
    if (VF.isScalar() || !I || !TheLoop->contains(I) ||
        TheLoop->isLoopInvariant(I))
      return false;

    // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen, because it is called
    // via getScalarizationOverhead from setCostBasedWideningDecision, before
    // the scalars are collected. That should be a safe assumption in most
    // cases, because we check if the operands have vectorizable types
    // beforehand in LoopVectorizationLegality.
    return Scalars.find(VF) == Scalars.end() ||
           !isScalarAfterVectorization(I, VF);
  };

  /// Returns a range containing only operands needing to be extracted.
  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
                                                   ElementCount VF) const {
    return SmallVector<Value *, 4>(make_filter_range(
        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
  }

  /// Determines if we have the infrastructure to vectorize loop \p L and its
  /// epilogue, assuming the main loop is vectorized by \p VF.
  bool isCandidateForEpilogueVectorization(const Loop &L,
                                           const ElementCount VF) const;

  /// Returns true if epilogue vectorization is considered profitable, and
  /// false otherwise.
  /// \p VF is the vectorization factor chosen for the original loop.
  bool isEpilogueVectorizationProfitable(const ElementCount VF) const;

public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The function that contains the loop under evaluation.
  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;

  /// All element types found in the loop.
  SmallPtrSet<Type *, 16> ElementTypesInLoop;

  /// Profitable vector factors.
  SmallVector<VectorizationFactor, 8> ProfitableVFs;
};
} // end namespace llvm

/// Helper struct to manage generating runtime checks for vectorization.
///
/// The runtime checks are created up-front in temporary blocks to allow better
/// estimating the cost and un-linked from the existing IR. After deciding to
/// vectorize, the checks are moved back. If deciding not to vectorize, the
/// temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr, either no SCEV checks have been generated or they have been used.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr, either no memory runtime checks have been generated or
  /// they have been used.
  Value *MemRuntimeCheckCond = nullptr;

  DominatorTree *DT;
  LoopInfo *LI;

  /// Expander used for the SCEV predicate checks.
  SCEVExpander SCEVExp;
  /// Expander used for the memory runtime checks.
  SCEVExpander MemCheckExp;

public:
  // NOTE(review): both expanders are constructed with the "scev.check" name
  // prefix; the memory-check expander does not get a distinct prefix — confirm
  // this is intentional (cosmetic only; affects generated value names).
  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                    const DataLayout &DL)
      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check") {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and is added back during vector code generation. If
  /// there is no vector code generation, the check blocks are removed
  /// completely.
  void Create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVUnionPredicate &UnionPred) {

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!UnionPred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
                                  nullptr, "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          &UnionPred, SCEVCheckBlock->getTerminator());
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      // Chain the memory-check block after the SCEV-check block if one exists.
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
                                 "vector.memcheck");

      MemRuntimeCheckCond =
          addRuntimeChecks(MemCheckBlock->getTerminator(), L,
                           RtPtrChecking.getChecks(), MemCheckExp);
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary block with the checks, update various places
    // accordingly.
    if (SCEVCheckBlock)
      SCEVCheckBlock->replaceAllUsesWith(Preheader);
    if (MemCheckBlock)
      MemCheckBlock->replaceAllUsesWith(Preheader);

    if (SCEVCheckBlock) {
      SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }
    if (MemCheckBlock) {
      MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), MemCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }

    DT->changeImmediateDominator(LoopHeader, Preheader);
    if (MemCheckBlock) {
      DT->eraseNode(MemCheckBlock);
      LI->removeBlock(MemCheckBlock);
    }
    if (SCEVCheckBlock) {
      DT->eraseNode(SCEVCheckBlock);
      LI->removeBlock(SCEVCheckBlock);
    }
  }

  /// Remove the created SCEV & memory runtime check blocks & instructions, if
  /// unused.
  ~GeneratedRTChecks() {
    SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
    SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
    // A null cond means the corresponding checks were either never generated
    // or have been consumed (emitted) by the vectorizer; in that case keep
    // the expanded values.
    if (!SCEVCheckCond)
      SCEVCleaner.markResultUsed();

    if (!MemRuntimeCheckCond)
      MemCheckCleaner.markResultUsed();

    if (MemRuntimeCheckCond) {
      auto &SE = *MemCheckExp.getSE();
      // Memory runtime check generation creates compares that use expanded
      // values. Remove them before running the SCEVExpanderCleaners.
      for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
        if (MemCheckExp.isInsertedInstruction(&I))
          continue;
        SE.forgetValue(&I);
        I.eraseFromParent();
      }
    }
    MemCheckCleaner.cleanup();
    SCEVCleaner.cleanup();

    // A still-set cond means the check block was never linked back into the
    // function; erase the whole unused block.
    if (SCEVCheckCond)
      SCEVCheckBlock->eraseFromParent();
    if (MemRuntimeCheckCond)
      MemCheckBlock->eraseFromParent();
  }

  /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
  /// adjusts the branches to branch to the vector preheader or \p Bypass,
  /// depending on the generated condition.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
                             BasicBlock *LoopVectorPreHeader,
                             BasicBlock *LoopExitBlock) {
    if (!SCEVCheckCond)
      return nullptr;
    // A constant-false condition can never fail, so there is nothing to emit.
    if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
      if (C->isZero())
        return nullptr;

    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();

    BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
    // Create new preheader for vector loop.
    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
      PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);

    SCEVCheckBlock->getTerminator()->eraseFromParent();
    SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                SCEVCheckBlock);

    DT->addNewBlock(SCEVCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);

    ReplaceInstWithInst(
        SCEVCheckBlock->getTerminator(),
        BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
    // Mark the check as used, to prevent it from being removed during cleanup.
    SCEVCheckCond = nullptr;
    return SCEVCheckBlock;
  }

  /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
  /// the branches to branch to the vector preheader or \p Bypass, depending on
  /// the generated condition.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
                                   BasicBlock *LoopVectorPreHeader) {
    // Check if we generated code that checks in runtime if arrays overlap.
    if (!MemRuntimeCheckCond)
      return nullptr;

    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                MemCheckBlock);

    DT->addNewBlock(MemCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
    MemCheckBlock->moveBefore(LoopVectorPreHeader);

    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
      PL->addBasicBlockToLoop(MemCheckBlock, *LI);

    ReplaceInstWithInst(
        MemCheckBlock->getTerminator(),
        BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
    MemCheckBlock->getTerminator()->setDebugLoc(
        Pred->getTerminator()->getDebugLoc());

    // Mark the check as used, to prevent it from being removed during cleanup.
    MemRuntimeCheckCond = nullptr;
    return MemCheckBlock;
  }
};

// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
// vector length information is not provided, vectorization is not considered
// explicit. Interleave hints are not allowed either. These limitations will be
// relaxed in the future.
// Please, note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legal checks and
// assume that vectorization is legal). However, both hints are implemented
// using the same metadata (llvm.loop.vectorize, processed by
// LoopVectorizeHints). This will be fixed in the future when the native IR
// representation for pragma 'omp simd' is introduced.
static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                   OptimizationRemarkEmitter *ORE) {
  assert(!OuterLp->isInnermost() && "This is not an outer loop");
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

  // Only outer loops with an explicit vectorization hint are supported.
  // Unannotated outer loops are ignored.
  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
    return false;

  Function *Fn = OuterLp->getHeader()->getParent();
  if (!Hints.allowVectorization(Fn, OuterLp,
                                true /*VectorizeOnlyWhenForced*/)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
    return false;
  }

  if (Hints.getInterleave() > 1) {
    // TODO: Interleave support is future work.
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  return true;
}

// Recursively gather the loops that this pass will consider, into \p V.
static void collectSupportedLoops(Loop &L, LoopInfo *LI,
                                  OptimizationRemarkEmitter *ORE,
                                  SmallVectorImpl<Loop *> &V) {
  // Collect inner loops and outer loops without irreducible control flow. For
  // now, only collect outer loops that have explicit vectorization hints. If we
  // are stress testing the VPlan H-CFG construction, we collect the outermost
  // loop of every loop nest.
  if (L.isInnermost() || VPlanBuildStressTest ||
      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
    LoopBlocksRPO RPOT(&L);
    RPOT.perform(LI);
    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
      V.push_back(&L);
      // TODO: Collect inner loops inside marked outer loops in case
      // vectorization fails for the outer loop. Do not invoke
      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
      // already known to be reducible. We can use an inherited attribute for
      // that.
      return;
    }
  }
  for (Loop *InnerL : L)
    collectSupportedLoops(*InnerL, LI, ORE, V);
}

namespace {

/// The LoopVectorize Pass.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid
  static char ID;

  /// Shared implementation of the pass; this legacy pass merely gathers the
  /// required analyses and forwards to Impl.runImpl().
  LoopVectorizePass Impl;

  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                         bool VectorizeOnlyWhenForced = false)
      : FunctionPass(ID),
        Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    // Collect all analyses the implementation needs.
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    // TargetLibraryInfo is optional; pass null when its wrapper is absent.
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    // LoopAccessInfo is computed lazily, per loop, via the legacy analysis.
    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                        GetLAA, *ORE, PSI).MadeAnyChange;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DemandedBitsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addRequired<InjectTLIMappingsLegacy>();

    // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for non-VPlan-native path.
    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
    if (!EnableVPlanNativePath) {
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
    }

    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
  }
};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//

Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Else, broadcast will be inside
  // vector loop body.
  Instruction *Instr = dyn_cast<Instruction>(V);
  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
                     (!Instr ||
                      DT->dominates(Instr->getParent(), LoopVectorPreHeader));
  // Place the code for broadcasting invariant variables in the new preheader.
  // The InsertPointGuard restores the previous insert point on scope exit.
  IRBuilder<>::InsertPointGuard Guard(Builder);
  if (SafeToHoist)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

  return Shuf;
}

/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p BinOp is relevant for FP induction variable.
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilder<> &Builder) {
  if (VF.isScalar()) {
    // When unrolling and the VF is 1, we only need to add a simple scalar.
    Type *Ty = Val->getType();
    assert(!Ty->isVectorTy() && "Val must be a scalar");

    if (Ty->isFloatingPointTy()) {
      // Floating-point operations inherit FMF via the builder's flags.
      Value *MulOp = Builder.CreateFMul(StartIdx, Step);
      return Builder.CreateBinOp(BinOp, Val, MulOp);
    }
    return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step),
                             "induction");
  }

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  VectorType *InitVecValVTy = ValVTy;
  Type *InitVecValSTy = STy;
  if (STy->isFloatingPointTy()) {
    // Lane indices are materialized as integers of the same bit width and
    // converted to FP afterwards (see CreateUIToFP below).
    InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Splat the StartIdx
  Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

  if (STy->isIntegerTy()) {
    InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}

void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
    const InductionDescriptor &II, Value *Step, Value *Start,
    Instruction *EntryVal, VPValue *Def, VPTransformState &State) {
  IRBuilder<> &Builder = State.Builder;
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    // A truncated IV: narrow both the start value and the step to match.
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = II.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  // handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    State.set(Def, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      addMetadata(LastInduction, EntryVal);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  // Move the last step to the end of the latch block. This ensures consistent
  // placement of all induction updates.
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}

/// Returns true if \p I should be scalarized rather than widened: either it
/// must remain scalar after vectorization, or the cost model considers
/// scalarizing it more profitable at the current VF.
bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
  return Cost->isScalarAfterVectorization(I, VF) ||
         Cost->isProfitableToScalarize(I, VF);
}

/// Returns true if a scalar copy of the induction \p IV is needed, i.e. when
/// the induction itself is scalarized or any of its users inside the loop is.
bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
  if (shouldScalarizeInstruction(IV))
    return true;
  auto isScalarInst = [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
  };
  return llvm::any_of(IV->users(), isScalarInst);
}

void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
                                                const InductionDescriptor &ID,
                                                Value *Start, TruncInst *Trunc,
                                                VPValue *Def,
                                                VPTransformState &State) {
  IRBuilder<> &Builder = State.Builder;
  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
         "Primary induction variable must have an integer type");
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  auto &DL = EntryVal->getModule()->getDataLayout();

  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant
  auto CreateStepValue = [&](const SCEV *Step) -> Value * {
    assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
           "Induction step should be loop invariant");
    if (PSE.getSE()->isSCEVable(IV->getType())) {
      // Expand the step SCEV into IR in the vector preheader.
      SCEVExpander Exp(*PSE.getSE(), DL, "induction");
      return Exp.expandCodeFor(Step, Step->getType(),
                               State.CFG.VectorPreHeader->getTerminator());
    }
    return cast<SCEVUnknown>(Step)->getValue();
  };

  // The scalar value to broadcast. This is derived from the canonical
  // induction variable. If a truncation type is given, truncate the canonical
  // induction variable and step. Otherwise, derive these values from the
  // induction descriptor.
  // Note: Step is taken by reference and may be replaced with a truncated
  // value.
  auto CreateScalarIV = [&](Value *&Step) -> Value * {
    Value *ScalarIV = Induction;
    if (IV != OldInduction) {
      ScalarIV = IV->getType()->isIntegerTy()
                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
                     : Builder.CreateCast(Instruction::SIToFP, Induction,
                                          IV->getType());
      ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID,
                                      State.CFG.PrevBB);
      ScalarIV->setName("offset.idx");
    }
    if (Trunc) {
      auto *TruncType = cast<IntegerType>(Trunc->getType());
      assert(Step->getType()->isIntegerTy() &&
             "Truncation requires an integer step");
      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
      Step = Builder.CreateTrunc(Step, TruncType);
    }
    return ScalarIV;
  };

  // Create the vector values from the scalar IV, in the absence of creating a
  // vector IV.
  auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
    for (unsigned Part = 0; Part < UF; ++Part) {
      assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
      // Each unroll part starts VF * Part steps further along the sequence.
      Value *StartIdx;
      if (Step->getType()->isFloatingPointTy())
        StartIdx =
            getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part);
      else
        StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part);

      Value *EntryPart =
          getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode(),
                        State.VF, State.Builder);
      State.set(Def, EntryPart, Part);
      if (Trunc)
        addMetadata(EntryPart, Trunc);
    }
  };

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with creating the step value.
  Value *Step = CreateStepValue(ID.getStep());
  if (State.VF.isZero() || State.VF.isScalar()) {
    // No widening necessary: emit the (possibly truncated) scalar IV and its
    // per-part splats.
    Value *ScalarIV = CreateScalarIV(Step);
    CreateSplatIV(ScalarIV, Step);
    return;
  }

  // Determine if we want a scalar version of the induction variable. This is
  // true if the induction variable itself is not widened, or if it has at
  // least one user in the loop that is not widened.
  auto NeedsScalarIV = needsScalarInduction(EntryVal);
  if (!NeedsScalarIV) {
    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
    return;
  }

  // Try to create a new independent vector induction variable. If we can't
  // create the phi node, we will splat the scalar induction variable in each
  // loop iteration.
  if (!shouldScalarizeInstruction(EntryVal)) {
    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
    Value *ScalarIV = CreateScalarIV(Step);
    // Create scalar steps that can be used by instructions we will later
    // scalarize. Note that the addition of the scalar steps will not increase
    // the number of instructions in the loop in the common case prior to
    // InstCombine. We will be trading one vector extract for each scalar step.
    buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
    return;
  }

  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorised IV. Except when we tail-fold, then the splat IV feeds the
  // predicate used by the masked loads/stores.
  Value *ScalarIV = CreateScalarIV(Step);
  if (!Cost->isScalarEpilogueAllowed())
    CreateSplatIV(ScalarIV, Step);
  buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
}

void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
                                           Instruction *EntryVal,
                                           const InductionDescriptor &ID,
                                           VPValue *Def,
                                           VPTransformState &State) {
  IRBuilder<> &Builder = State.Builder;
  // We shouldn't have to build scalar steps if we aren't vectorizing.
  assert(State.VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same integer type.
  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
  assert(ScalarIVTy == Step->getType() &&
         "Val and Step should have the same type");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (ScalarIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration. If EntryVal is uniform, we only need to generate the first
  // lane. Otherwise, we generate all VF values.
  bool IsUniform =
      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF);
  unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
  // Compute the scalar steps and save the results in State.
  Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
                                     ScalarIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  if (!IsUniform && State.VF.isScalable()) {
    // For scalable VFs the lane count is not a compile-time constant, so
    // pre-build whole step vectors here and also record per-lane values below.
    VecIVTy = VectorType::get(ScalarIVTy, State.VF);
    UnitStepVec =
        Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
    SplatStep = Builder.CreateVectorSplat(State.VF, Step);
    SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
  }

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);

    if (!IsUniform && State.VF.isScalable()) {
      auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
      if (ScalarIVTy->isFloatingPointTy())
        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
      State.set(Def, Add, Part);
      // It's useful to record the lane values too for the known minimum number
      // of elements so we do those below. This improves the code quality when
      // trying to extract the first element, for example.
    }

    if (ScalarIVTy->isFloatingPointTy())
      StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);

    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
      Value *StartIdx = Builder.CreateBinOp(
          AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
      auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
      State.set(Def, Add, VPIteration(Part, Lane));
    }
  }
}

/// Inserts the scalar value of \p Def for lane \p Instance into the
/// corresponding vector value recorded in \p State for that part.
void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
                                                    const VPIteration &Instance,
                                                    VPTransformState &State) {
  Value *ScalarInst = State.get(Def, Instance);
  Value *VectorValue = State.get(Def, Instance.Part);
  VectorValue = Builder.CreateInsertElement(
      VectorValue, ScalarInst,
      Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
  State.set(Def, VectorValue, Instance.Part);
}

/// Emits a vector-reverse shuffle of \p Vec.
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
  assert(Vec->getType()->isVectorTy() && "Invalid type");
  return Builder.CreateVectorReverse(Vec, "reverse");
}

// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
    return EnableMaskedInterleavedMemAccesses;

  return TTI.enableMaskedInterleavedAccessVectorization();
}

// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     R = Pic[i];             // Member of index 0
//     G = Pic[i+1];           // Member of index 1
//     B = Pic[i+2];           // Member of index 2
//     ...
//     do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
//   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>  ; R elements
//   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
//   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
//
// Or translate following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec            ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(
    const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
    VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
    VPValue *BlockInMask) {
  Instruction *Instr = Group->getInsertPos();
  const DataLayout &DL = Instr->getModule()->getDataLayout();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);

  // Prepare for the new pointers.
  SmallVector<Value *, 2> AddrParts;
  unsigned Index = Group->getIndex(Instr);

  // TODO: extend the masked interleaved-group support to reversed access.
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported.");

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  if (Group->isReverse())
    Index += (VF.getKnownMinValue() - 1) * Group->getFactor();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
    setDebugLocFromInst(AddrPart);

    // Notice current instruction could be any index. Need to adjust the address
    // to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // Current pointer is pointed to A[i+1], adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // Current pointer is pointed to A[i+2], adjust it to A[i].

    // Preserve the inbounds flag of the underlying GEP, if any, when stepping
    // back to member 0.
    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();
    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);

    // Cast to the vector pointer type.
    unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
    Type *PtrTy = VecTy->getPointerTo(AddressSpace);
    AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
  }

  setDebugLocFromInst(Instr);
  Value *PoisonVec = PoisonValue::get(VecTy);

  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < UF; Part++) {
      Instruction *NewLoad;
      if (BlockInMask || MaskForGaps) {
        assert(useMaskedInterleavedAccesses(*TTI) &&
               "masked interleaved groups are not allowed.");
        Value *GroupMask = MaskForGaps;
        if (BlockInMask) {
          Value *BlockInMaskPart = State.get(BlockInMask, Part);
          Value *ShuffledMask = Builder.CreateShuffleVector(
              BlockInMaskPart,
              createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
              "interleaved.mask");
          GroupMask = MaskForGaps
                          ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
                                                MaskForGaps)
                          : ShuffledMask;
        }
        // Masked-off lanes read as poison (PoisonVec passthrough).
        NewLoad =
            Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
                                     GroupMask, PoisonVec, "wide.masked.vec");
      }
      else
        NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
                                            Group->getAlign(), "wide.vec");
      Group->addMetadata(NewLoad);
      NewLoads.push_back(NewLoad);
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    // J indexes VPDefs, which only has entries for non-gap members.
    unsigned J = 0;
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      auto StrideMask =
          createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
      for (unsigned Part = 0; Part < UF; Part++) {
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoads[Part], StrideMask, "strided.vec");

        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          assert(!VF.isScalable() && "VF is assumed to be non scalable.");
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec = reverseVector(StridedVec);

        State.set(VPDefs[J], StridedVec, Part);
      }
      ++J;
    }
    return;
  }

  // The sub vector type for current instruction.
  auto *SubVT = VectorType::get(ScalarTy, VF);

  // Vectorize the interleaved store group.
  MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
  assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
         "masked interleaved groups are not allowed.");
  assert((!MaskForGaps || !VF.isScalable()) &&
         "masking gaps for scalable vectors is not yet supported.");
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      assert((Group->getMember(i) || MaskForGaps) &&
             "Fail to get a member from an interleaved store group");
      Instruction *Member = Group->getMember(i);

      // Skip the gaps in the group.
      if (!Member) {
        // Gap members contribute poison lanes, which the gap mask suppresses.
        Value *Undef = PoisonValue::get(SubVT);
        StoredVecs.push_back(Undef);
        continue;
      }

      Value *StoredVec = State.get(StoredValues[i], Part);

      if (Group->isReverse())
        StoredVec = reverseVector(StoredVec);

      // If this member has different type, cast it to a unified type.

      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);

      StoredVecs.push_back(StoredVec);
    }

    // Concatenate all vectors into a wide vector.
    Value *WideVec = concatenateVectors(Builder, StoredVecs);

    // Interleave the elements in the wide vector.
    Value *IVec = Builder.CreateShuffleVector(
        WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
        "interleaved.vec");

    Instruction *NewStoreInstr;
    if (BlockInMask || MaskForGaps) {
      Value *GroupMask = MaskForGaps;
      if (BlockInMask) {
        Value *BlockInMaskPart = State.get(BlockInMask, Part);
        Value *ShuffledMask = Builder.CreateShuffleVector(
            BlockInMaskPart,
            createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
            "interleaved.mask");
        GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
                                                      ShuffledMask, MaskForGaps)
                                : ShuffledMask;
      }
      NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
                                                Group->getAlign(), GroupMask);
    } else
      NewStoreInstr =
          Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());

    Group->addMetadata(NewStoreInstr);
  }
}

void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
                                               VPReplicateRecipe *RepRecipe,
                                               const VPIteration &Instance,
                                               bool IfPredicateInstr,
                                               VPTransformState &State) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");

  // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
  // the first lane and part.
2955 if (isa<NoAliasScopeDeclInst>(Instr)) 2956 if (!Instance.isFirstIteration()) 2957 return; 2958 2959 setDebugLocFromInst(Instr); 2960 2961 // Does this instruction return a value ? 2962 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2963 2964 Instruction *Cloned = Instr->clone(); 2965 if (!IsVoidRetTy) 2966 Cloned->setName(Instr->getName() + ".cloned"); 2967 2968 // If the scalarized instruction contributes to the address computation of a 2969 // widen masked load/store which was in a basic block that needed predication 2970 // and is not predicated after vectorization, we can't propagate 2971 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2972 // instruction could feed a poison value to the base address of the widen 2973 // load/store. 2974 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 2975 Cloned->dropPoisonGeneratingFlags(); 2976 2977 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 2978 Builder.GetInsertPoint()); 2979 // Replace the operands of the cloned instructions with their scalar 2980 // equivalents in the new loop. 2981 for (auto &I : enumerate(RepRecipe->operands())) { 2982 auto InputInstance = Instance; 2983 VPValue *Operand = I.value(); 2984 if (State.Plan->isUniformAfterVectorization(Operand)) 2985 InputInstance.Lane = VPLane::getFirstLane(); 2986 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2987 } 2988 addNewMetadata(Cloned, Instr); 2989 2990 // Place the cloned scalar in the new loop. 2991 Builder.Insert(Cloned); 2992 2993 State.set(RepRecipe, Cloned, Instance); 2994 2995 // If we just cloned a new assumption, add it the assumption cache. 2996 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2997 AC->registerAssumption(II); 2998 2999 // End if-block. 
3000 if (IfPredicateInstr) 3001 PredicatedInstructions.push_back(Cloned); 3002 } 3003 3004 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 3005 Value *End, Value *Step, 3006 Instruction *DL) { 3007 BasicBlock *Header = L->getHeader(); 3008 BasicBlock *Latch = L->getLoopLatch(); 3009 // As we're just creating this loop, it's possible no latch exists 3010 // yet. If so, use the header as this will be a single block loop. 3011 if (!Latch) 3012 Latch = Header; 3013 3014 IRBuilder<> B(&*Header->getFirstInsertionPt()); 3015 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3016 setDebugLocFromInst(OldInst, &B); 3017 auto *Induction = B.CreatePHI(Start->getType(), 2, "index"); 3018 3019 B.SetInsertPoint(Latch->getTerminator()); 3020 setDebugLocFromInst(OldInst, &B); 3021 3022 // Create i+1 and fill the PHINode. 3023 // 3024 // If the tail is not folded, we know that End - Start >= Step (either 3025 // statically or through the minimum iteration checks). We also know that both 3026 // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV + 3027 // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned 3028 // overflows and we can mark the induction increment as NUW. 3029 Value *Next = B.CreateAdd(Induction, Step, "index.next", 3030 /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false); 3031 Induction->addIncoming(Start, L->getLoopPreheader()); 3032 Induction->addIncoming(Next, Latch); 3033 // Create the compare. 3034 Value *ICmp = B.CreateICmpEQ(Next, End); 3035 B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3036 3037 // Now we have two terminators. Remove the old one from the block. 
  Latch->getTerminator()->eraseFromParent();

  return Induction;
}

/// Compute (and cache in TripCount) the scalar trip count of \p L, i.e. the
/// backedge-taken count plus one, widened/truncated to the widest induction
/// type and expanded as IR in the loop preheader.
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
  if (TripCount)
    return TripCount;

  assert(L && "Create Trip Count for null loop.");
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
  // Find the loop boundaries.
  ScalarEvolution *SE = PSE.getSE();
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
         "Invalid loop count");

  Type *IdxTy = Legal->getWidestInductionType();
  assert(IdxTy && "No type for induction");

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way that we get a backedge taken count is that the
  // induction variable was signed and as such will not overflow. In such a case
  // truncation is legal.
  if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
      IdxTy->getPrimitiveSizeInBits())
    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

  // Get the total trip count from the count by adding 1.
  const SCEV *ExitCount = SE->getAddExpr(
      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));

  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, DL, "induction");

  // Count holds the overall loop count (N).
  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                L->getLoopPreheader()->getTerminator());

  // A pointer-typed count can arise from pointer-based exit conditions;
  // convert it to the integer induction type.
  if (TripCount->getType()->isPointerTy())
    TripCount =
        CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
                                    L->getLoopPreheader()->getTerminator());

  return TripCount;
}

/// Compute (and cache in VectorTripCount) the number of iterations the vector
/// body executes: the scalar trip count rounded down (or, with tail folding,
/// up) to a multiple of VF * UF.
Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
  if (VectorTripCount)
    return VectorTripCount;

  Value *TC = getOrCreateTripCount(L);
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());

  Type *Ty = TC->getType();
  // This is where we can make the step a runtime constant.
  Value *Step = createStepForVF(Builder, Ty, VF, UF);

  // If the tail is to be folded by masking, round the number of iterations N
  // up to a multiple of Step instead of rounding down. This is done by first
  // adding Step-1 and then rounding down. Note that it's ok if this addition
  // overflows: the vector induction variable will eventually wrap to zero given
  // that it starts at zero and its Step is a power of two; the loop will then
  // exit, with the last early-exit vector comparison also producing all-true.
  if (Cost->foldTailByMasking()) {
    assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
           "VF*UF must be a power of 2 when folding tail by masking");
    assert(!VF.isScalable() &&
           "Tail folding not yet supported for scalable vectors");
    TC = Builder.CreateAdd(
        TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
  }

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

  // There are cases where we *must* run at least one iteration in the remainder
  // loop. See the cost model for when this can happen. If the step evenly
  // divides the trip count, we set the remainder to be equal to the step. If
  // the step does not evenly divide the trip count, no adjustment is necessary
  // since there will already be scalar iterations. Note that the minimum
  // iterations check ensures that N >= Step.
  if (Cost->requiresScalarEpilogue(VF)) {
    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
    R = Builder.CreateSelect(IsZero, Step, R);
  }

  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

  return VectorTripCount;
}

/// Cast fixed-width vector \p V to \p DstVTy, whose elements must have the
/// same bit size. Uses a direct bit-or-pointer cast when legal, otherwise
/// goes through an intermediate integer vector (Ptr <-> Int <-> Float).
Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                                   const DataLayout &DL) {
  // Verify that V is a vector type with same number of elements as DstVTy.
  auto *DstFVTy = cast<FixedVectorType>(DstVTy);
  unsigned VF = DstFVTy->getNumElements();
  auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstFVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
    return Builder.CreateBitOrPointerCast(V, DstFVTy);
  }
  // V cannot be directly casted to desired vector type.
  // May happen when V is a floating point vector but DstVTy is a vector of
  // pointers or vice-versa. Handle this using a two-step bitcast using an
  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  Type *IntTy =
      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
  auto *VecIntTy = FixedVectorType::get(IntTy, VF);
  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
  return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
}

/// Emit the "trip count too small" bypass check: if the scalar trip count is
/// below VF * UF (or equal, when a scalar epilogue must run), branch to
/// \p Bypass instead of entering the vector loop.
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                         BasicBlock *Bypass) {
  Value *Count = getOrCreateTripCount(L);
  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
                                            : ICmpInst::ICMP_ULT;

  // If tail is to be folded, vector loop takes care of all iterations.
  Value *CheckMinIters = Builder.getFalse();
  if (!Cost->foldTailByMasking()) {
    Value *Step = createStepForVF(Builder, Count->getType(), VF, UF);
    CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
  }
  // Create new preheader for vector loop.
  LoopVectorPreHeader =
      SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
                 "vector.ph");

  assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                               DT->getNode(Bypass)->getIDom()) &&
         "TC check is expected to dominate Bypass");

  // Update dominator for Bypass & LoopExit (if needed).
  DT->changeImmediateDominator(Bypass, TCCheckBlock);
  if (!Cost->requiresScalarEpilogue(VF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);

  ReplaceInstWithInst(
      TCCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
  LoopBypassBlocks.push_back(TCCheckBlock);
}

/// Emit the runtime checks for the SCEV assumptions made during analysis, if
/// any. Returns the generated check block, or nullptr if no checks are
/// needed. Failing checks branch to \p Bypass (the scalar loop).
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {

  BasicBlock *const SCEVCheckBlock =
      RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
  if (!SCEVCheckBlock)
    return nullptr;

  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
           (OptForSizeBasedOnProfile &&
            Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
         "Cannot SCEV check stride or overflow when optimizing for size");

  // Update dominator only if this is first RT check.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
    if (!Cost->requiresScalarEpilogue(VF))
      // If there is an epilogue which must run, there's no edge from the
      // middle block to exit blocks and thus no need to update the immediate
      // dominator of the exit blocks.
      DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
  }

  LoopBypassBlocks.push_back(SCEVCheckBlock);
  AddedSafetyChecks = true;
  return SCEVCheckBlock;
}

/// Emit the runtime memory checks (pointer overlap / aliasing), if any.
/// Returns the generated check block, or nullptr when no checks were needed
/// (or on the VPlan-native path, which does no runtime-check analysis).
BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
                                                      BasicBlock *Bypass) {
  // VPlan-native path does not do any analysis for runtime checks currently.
  if (EnableVPlanNativePath)
    return nullptr;

  BasicBlock *const MemCheckBlock =
      RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);

  // Check if we generated code that checks in runtime if arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
  if (!MemCheckBlock)
    return nullptr;

  // Runtime checks under -Os/-Oz are only tolerated when vectorization was
  // explicitly forced; warn the user about the size impact.
  if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
    assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
           "Cannot emit memory checks when optimizing for size, unless forced "
           "to vectorize.");
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
                                        L->getStartLoc(), L->getHeader())
             << "Code-size may be reduced by not forcing "
                "vectorization, or by source-code modifications "
                "eliminating the need for runtime checks "
                "(e.g., adding 'restrict').";
    });
  }

  LoopBypassBlocks.push_back(MemCheckBlock);

  AddedSafetyChecks = true;

  // We currently don't use LoopVersioning for the actual loop cloning but we
  // still use it to add the noalias metadata.
  LVer = std::make_unique<LoopVersioning>(
      *Legal->getLAI(),
      Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
      DT, PSE.getSE());
  LVer->prepareNoAliasMetadata();
  return MemCheckBlock;
}

/// Compute the induction value corresponding to iteration \p Index, i.e.
/// StartValue op (Index * Step) for the induction described by \p ID,
/// emitting IR via \p B. Returns nullptr for IK_NoInduction.
Value *InnerLoopVectorizer::emitTransformedIndex(
    IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
    const InductionDescriptor &ID, BasicBlock *VectorHeader) const {

  SCEVExpander Exp(*SE, DL, "induction");
  auto Step = ID.getStep();
  auto StartValue = ID.getStartValue();
  assert(Index->getType()->getScalarType() == Step->getType() &&
         "Index scalar type does not match StepValue type");

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isZero())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isZero())
        return X;
    return B.CreateAdd(X, Y);
  };

  // We allow X to be a vector type, in which case Y will potentially be
  // splatted into a vector with the same element count.
  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType()->getScalarType() == Y->getType() &&
           "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isOne())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isOne())
        return X;
    VectorType *XVTy = dyn_cast<VectorType>(X->getType());
    if (XVTy && !isa<VectorType>(Y->getType()))
      Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
    return B.CreateMul(X, Y);
  };

  // Get a suitable insert point for SCEV expansion. For blocks in the vector
  // loop, choose the end of the vector loop header (=VectorHeader), because
  // the DomTree is not kept up-to-date for additional blocks generated in the
  // vector loop. By using the header as insertion point, we guarantee that the
  // expanded instructions dominate all their uses.
  auto GetInsertPoint = [this, &B, VectorHeader]() {
    BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
    if (InsertBB != LoopVectorBody &&
        LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB))
      return VectorHeader->getTerminator();
    return &*B.GetInsertPoint();
  };

  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet");
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    // Step == -1 folds to a plain subtraction.
    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(
        Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction: {
    assert(isa<SCEVConstant>(Step) &&
           "Expected constant step for pointer induction");
    return B.CreateGEP(
        ID.getElementType(), StartValue,
        CreateMul(Index,
                  Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
                                    GetInsertPoint())));
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet");
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    auto InductionBinOp = ID.getInductionBinOp();
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
    Value *MulExp = B.CreateFMul(StepValue, Index);
    return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                         "induction");
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}

/// Split the original preheader into middle block, scalar preheader and
/// vector body, set up the middle-block terminator, and register the new
/// (still empty) vector loop with LoopInfo. Returns the new loop.
Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopScalarBody = OrigLoop->getHeader();
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  assert(LoopVectorPreHeader && "Invalid loop structure");
  LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
         "multiple exit loop without required epilogue?");

  LoopMiddleBlock =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, Twine(Prefix) + "middle.block");
  LoopScalarPreHeader =
      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                 nullptr, Twine(Prefix) + "scalar.ph");

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Set up the middle block terminator. Two cases:
  // 1) If we know that we must execute the scalar epilogue, emit an
  //    unconditional branch.
  // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case). In this case, set up a conditonal
  //    branch from the middle block to the loop scalar preheader, and the
  //    exit block. completeLoopSkeleton will update the condition to use an
  //    iteration check, if required to decide whether to execute the remainder.
  BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
    BranchInst::Create(LoopScalarPreHeader) :
    BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
                       Builder.getTrue());
  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);

  // We intentionally don't let SplitBlock to update LoopInfo since
  // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place few lines later.
  LoopVectorBody =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 nullptr, nullptr, Twine(Prefix) + "vector.body");

  // Update dominator for loop exit.
  if (!Cost->requiresScalarEpilogue(VF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

  // Create and register the new vector loop.
  Loop *Lp = LI->AllocateLoop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
  } else {
    LI->addTopLevelLoop(Lp);
  }
  Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
  return Lp;
}

/// Create, in the scalar preheader, the "bc.resume.val" phis that give every
/// induction variable of the original loop its correct starting value when
/// the scalar loop is entered: the vector-loop end value when coming from the
/// middle block, or the original start value when coming from a bypass block.
/// \p AdditionalBypass optionally names one extra (block, end-value) pair,
/// both set or both null.
void InnerLoopVectorizer::createInductionResumeValues(
    Loop *L, Value *VectorTripCount,
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  assert(VectorTripCount && L && "Expected valid arguments");
  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
         "Inconsistent information about additional bypass.");
  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.
  for (auto &InductionEntry : Legal->getInductionVars()) {
    PHINode *OrigPhi = InductionEntry.first;
    InductionDescriptor II = InductionEntry.second;

    // Create phi nodes to merge from the backedge-taken check block.
    PHINode *BCResumeVal =
        PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
                        LoopScalarPreHeader->getTerminator());
    // Copy original phi DL over to the new one.
    BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
    Value *&EndValue = IVEndValues[OrigPhi];
    Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
    if (OrigPhi == OldInduction) {
      // We know what the end value is.
      EndValue = VectorTripCount;
    } else {
      // Derive the end value by evaluating the induction at the vector trip
      // count: StartValue op (VectorTripCount * Step).
      IRBuilder<> B(L->getLoopPreheader()->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      Type *StepType = II.getStep()->getType();
      Instruction::CastOps CastOp =
          CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
      Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
      const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
      EndValue =
          emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
      EndValue->setName("ind.end");

      // Compute the end value for the additional bypass (if applicable).
      if (AdditionalBypass.first) {
        B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
        CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
                                         StepType, true);
        CRD =
            B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
        EndValueFromAdditionalBypass =
            emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
        EndValueFromAdditionalBypass->setName("ind.end");
      }
    }
    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);

    // Fix the scalar body counter (PHI node).
    // The old induction's phi node in the scalar body needs the truncated
    // value.
    for (BasicBlock *BB : LoopBypassBlocks)
      BCResumeVal->addIncoming(II.getStartValue(), BB);

    if (AdditionalBypass.first)
      BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
                                            EndValueFromAdditionalBypass);

    OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
  }
}

/// Finish the vector-loop skeleton: install the middle-block iteration check
/// (when needed), position the builder in the vector body, and transfer or
/// update loop metadata on the new loop. Returns the vector preheader.
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
                                                      MDNode *OrigLoopID) {
  assert(L && "Expected valid loop.");

  // The trip counts should be cached by now.
  Value *Count = getOrCreateTripCount(L);
  Value *VectorTripCount = getOrCreateVectorTripCount(L);

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop. Three cases:
  // 1) If we require a scalar epilogue, there is no conditional branch as
  //    we unconditionally branch to the scalar preheader. Do nothing.
  // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
  //    Thus if tail is to be folded, we know we don't need to run the
  //    remainder and we can use the previous value for the condition (true).
  // 3) Otherwise, construct a runtime check.
  if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
    Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
                                        Count, VectorTripCount, "cmp.n",
                                        LoopMiddleBlock->getTerminator());

    // Here we use the same DebugLoc as the scalar loop latch terminator instead
    // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging. Eg. if the compare has got a line number inside the loop.
    CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
    cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
  }

  // Get ready to start creating new instructions into the vectorized body.
  assert(LoopVectorPreHeader == L->getLoopPreheader() &&
         "Inconsistent vector loop preheader");
  Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());

  Optional<MDNode *> VectorizedLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupVectorized});
  if (VectorizedLoopID.hasValue()) {
    L->setLoopID(VectorizedLoopID.getValue());

    // Do not setAlreadyVectorized if loop attributes have been defined
    // explicitly.
    return LoopVectorPreHeader;
  }

  // Keep all loop hints from the original loop on the vector loop (we'll
  // replace the vectorizer-specific hints below).
  if (MDNode *LID = OrigLoop->getLoopID())
    L->setLoopID(LID);

  LoopVectorizeHints Hints(L, true, *ORE, TTI);
  Hints.setAlreadyVectorized();

#ifdef EXPENSIVE_CHECKS
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
  LI->verify(*DT);
#endif

  return LoopVectorPreHeader;
}

/// Emit the full CFG skeleton for the vectorized loop (checks, vector body,
/// middle block, scalar epilogue wiring) and return the vector preheader.
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- loop iteration number check.
    /   |
   /    v
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  |/    |
  |     v
  |    [  ] \
  |    [  ]_|   <-- vector loop.
  |     |
  |     v
  \   -[ ]   <--- middle-block.
   \/   |
   /\   v
  | ->[ ]     <--- new preheader.
  |    |
 (opt)  v      <-- edge from middle to exit iff epilogue is not required.
  |   [ ] \
  |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
   \   |
    \  v
     >[ ]     <-- exit block(s).
 ...
 */

  // Get the metadata of the original loop before it gets modified.
  MDNode *OrigLoopID = OrigLoop->getLoopID();

  // Workaround! Compute the trip count of the original loop and cache it
  // before we start modifying the CFG. This code has a systemic problem
  // wherein it tries to run analysis over partially constructed IR; this is
  // wrong, and not simply for SCEV. The trip count of the original loop
  // simply happens to be prone to hitting this in practice. In theory, we
  // can hit the same issue for any SCEV, or ValueTracking query done during
  // mutation. See PR49900.
  getOrCreateTripCount(OrigLoop);

  // Create an empty vector loop, and prepare basic blocks for the runtime
  // checks.
  Loop *Lp = createVectorLoopSkeleton("");

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(Lp, LoopScalarPreHeader);

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is c++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  //
  // We try to obtain an induction variable from the original loop as hard
  // as possible. However if we don't find one that:
  //   - is an integer
  //   - counts from zero, stepping by one
  //   - is the size of the widest induction variable type
  // then we create a new one.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
  Value *Step = createStepForVF(Builder, IdxTy, VF, UF);
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Emit phis for the new starting index of the scalar loop.
  createInductionResumeValues(Lp, CountRoundDown);

  return completeLoopSkeleton(Lp, OrigLoopID);
}

// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.
  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value need to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      // Convert the count to the step's type: SIToFP for FP inductions,
      // sext/trunc otherwise.
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape =
          emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each-other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
      PHI->addIncoming(I.second, MiddleBlock);
  }
}

namespace {

/// DenseMap traits that key instructions by opcode + operands so that
/// structurally identical (isIdenticalTo) instructions collide; used by cse()
/// below to deduplicate element-manipulation instructions.
struct CSEDenseMapInfo {
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                           I->value_op_end()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};

} // end anonymous namespace

/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple cse.
  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
  for (Instruction &In : llvm::make_early_inc_range(*BB)) {
    if (!CSEDenseMapInfo::canHandle(&In))
      continue;

    // Check if we can replace this instruction with any of the
    // visited instructions.
    if (Instruction *V = CSEMap.lookup(&In)) {
      In.replaceAllUsesWith(V);
      In.eraseFromParent();
      continue;
    }

    CSEMap[&In] = &In;
  }
}

/// Estimate the cost of scalarizing call \p CI at vectorization factor \p VF:
/// VF scalar calls plus the overhead of extracting arguments and gathering
/// results. \p NeedToScalarize reports whether scalarization is required.
InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
                                              bool &NeedToScalarize) const {
  Function *F = CI->getCalledFunction();
  Type *ScalarRetTy = CI->getType();
  SmallVector<Type *, 4> Tys, ScalarTys;
  for (auto &ArgOp : CI->args())
    ScalarTys.push_back(ArgOp->getType());

  // Estimate cost of scalarized vector call. The source operands are assumed
  // to be vectors, so we need to extract individual elements from there,
  // execute VF scalar calls, and then gather the result into the vector return
  // value.
  InstructionCost ScalarCallCost =
      TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
  if (VF.isScalar())
    return ScalarCallCost;

  // Compute corresponding vector type for return value and arguments.
  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
  for (Type *ScalarTy : ScalarTys)
    Tys.push_back(ToVectorTy(ScalarTy, VF));

  // Compute costs of unpacking argument values for the scalar calls and
  // packing the return values to a vector.
  InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);

  InstructionCost Cost =
      ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;

  // If we can't emit a vector call for this function, then the currently found
  // cost is the cost we need to return.
3833 NeedToScalarize = true; 3834 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3835 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3836 3837 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3838 return Cost; 3839 3840 // If the corresponding vector cost is cheaper, return its cost. 3841 InstructionCost VectorCallCost = 3842 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3843 if (VectorCallCost < Cost) { 3844 NeedToScalarize = false; 3845 Cost = VectorCallCost; 3846 } 3847 return Cost; 3848 } 3849 3850 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3851 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3852 return Elt; 3853 return VectorType::get(Elt, VF); 3854 } 3855 3856 InstructionCost 3857 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3858 ElementCount VF) const { 3859 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3860 assert(ID && "Expected intrinsic call!"); 3861 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3862 FastMathFlags FMF; 3863 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3864 FMF = FPMO->getFastMathFlags(); 3865 3866 SmallVector<const Value *> Arguments(CI->args()); 3867 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3868 SmallVector<Type *> ParamTys; 3869 std::transform(FTy->param_begin(), FTy->param_end(), 3870 std::back_inserter(ParamTys), 3871 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3872 3873 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3874 dyn_cast<IntrinsicInst>(CI)); 3875 return TTI.getIntrinsicInstrCost(CostAttrs, 3876 TargetTransformInfo::TCK_RecipThroughput); 3877 } 3878 3879 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3880 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3881 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3882 return I1->getBitWidth() < I2->getBitWidth() ? 
T1 : T2; 3883 } 3884 3885 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3886 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3887 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3888 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3889 } 3890 3891 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3892 // For every instruction `I` in MinBWs, truncate the operands, create a 3893 // truncated version of `I` and reextend its result. InstCombine runs 3894 // later and will remove any ext/trunc pairs. 3895 SmallPtrSet<Value *, 4> Erased; 3896 for (const auto &KV : Cost->getMinimalBitwidths()) { 3897 // If the value wasn't vectorized, we must maintain the original scalar 3898 // type. The absence of the value from State indicates that it 3899 // wasn't vectorized. 3900 // FIXME: Should not rely on getVPValue at this point. 3901 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3902 if (!State.hasAnyVectorValue(Def)) 3903 continue; 3904 for (unsigned Part = 0; Part < UF; ++Part) { 3905 Value *I = State.get(Def, Part); 3906 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3907 continue; 3908 Type *OriginalTy = I->getType(); 3909 Type *ScalarTruncatedTy = 3910 IntegerType::get(OriginalTy->getContext(), KV.second); 3911 auto *TruncatedTy = VectorType::get( 3912 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3913 if (TruncatedTy == OriginalTy) 3914 continue; 3915 3916 IRBuilder<> B(cast<Instruction>(I)); 3917 auto ShrinkOperand = [&](Value *V) -> Value * { 3918 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3919 if (ZI->getSrcTy() == TruncatedTy) 3920 return ZI->getOperand(0); 3921 return B.CreateZExtOrTrunc(V, TruncatedTy); 3922 }; 3923 3924 // The actual instruction modification depends on the instruction type, 3925 // unfortunately. 
3926 Value *NewI = nullptr; 3927 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3928 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3929 ShrinkOperand(BO->getOperand(1))); 3930 3931 // Any wrapping introduced by shrinking this operation shouldn't be 3932 // considered undefined behavior. So, we can't unconditionally copy 3933 // arithmetic wrapping flags to NewI. 3934 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3935 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3936 NewI = 3937 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3938 ShrinkOperand(CI->getOperand(1))); 3939 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3940 NewI = B.CreateSelect(SI->getCondition(), 3941 ShrinkOperand(SI->getTrueValue()), 3942 ShrinkOperand(SI->getFalseValue())); 3943 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3944 switch (CI->getOpcode()) { 3945 default: 3946 llvm_unreachable("Unhandled cast!"); 3947 case Instruction::Trunc: 3948 NewI = ShrinkOperand(CI->getOperand(0)); 3949 break; 3950 case Instruction::SExt: 3951 NewI = B.CreateSExtOrTrunc( 3952 CI->getOperand(0), 3953 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3954 break; 3955 case Instruction::ZExt: 3956 NewI = B.CreateZExtOrTrunc( 3957 CI->getOperand(0), 3958 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3959 break; 3960 } 3961 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3962 auto Elements0 = 3963 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3964 auto *O0 = B.CreateZExtOrTrunc( 3965 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3966 auto Elements1 = 3967 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3968 auto *O1 = B.CreateZExtOrTrunc( 3969 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3970 3971 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3972 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3973 // Don't do anything with the 
operands, just extend the result. 3974 continue; 3975 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3976 auto Elements = 3977 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3978 auto *O0 = B.CreateZExtOrTrunc( 3979 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3980 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3981 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3982 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3983 auto Elements = 3984 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3985 auto *O0 = B.CreateZExtOrTrunc( 3986 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3987 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3988 } else { 3989 // If we don't know what to do, be conservative and don't do anything. 3990 continue; 3991 } 3992 3993 // Lastly, extend the result. 3994 NewI->takeName(cast<Instruction>(I)); 3995 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3996 I->replaceAllUsesWith(Res); 3997 cast<Instruction>(I)->eraseFromParent(); 3998 Erased.insert(I); 3999 State.reset(Def, Res, Part); 4000 } 4001 } 4002 4003 // We'll have created a bunch of ZExts that are now parentless. Clean up. 4004 for (const auto &KV : Cost->getMinimalBitwidths()) { 4005 // If the value wasn't vectorized, we must maintain the original scalar 4006 // type. The absence of the value from State indicates that it 4007 // wasn't vectorized. 4008 // FIXME: Should not rely on getVPValue at this point. 
4009 VPValue *Def = State.Plan->getVPValue(KV.first, true); 4010 if (!State.hasAnyVectorValue(Def)) 4011 continue; 4012 for (unsigned Part = 0; Part < UF; ++Part) { 4013 Value *I = State.get(Def, Part); 4014 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 4015 if (Inst && Inst->use_empty()) { 4016 Value *NewI = Inst->getOperand(0); 4017 Inst->eraseFromParent(); 4018 State.reset(Def, NewI, Part); 4019 } 4020 } 4021 } 4022 } 4023 4024 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 4025 // Insert truncates and extends for any truncated instructions as hints to 4026 // InstCombine. 4027 if (VF.isVector()) 4028 truncateToMinimalBitwidths(State); 4029 4030 // Fix widened non-induction PHIs by setting up the PHI operands. 4031 if (OrigPHIsToFix.size()) { 4032 assert(EnableVPlanNativePath && 4033 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 4034 fixNonInductionPHIs(State); 4035 } 4036 4037 // At this point every instruction in the original loop is widened to a 4038 // vector form. Now we need to fix the recurrences in the loop. These PHI 4039 // nodes are currently empty because we did not want to introduce cycles. 4040 // This is the second stage of vectorizing recurrences. 4041 fixCrossIterationPHIs(State); 4042 4043 // Forget the original basic block. 4044 PSE.getSE()->forgetLoop(OrigLoop); 4045 4046 // If we inserted an edge from the middle block to the unique exit block, 4047 // update uses outside the loop (phis) to account for the newly inserted 4048 // edge. 4049 if (!Cost->requiresScalarEpilogue(VF)) { 4050 // Fix-up external users of the induction variables. 
4051 for (auto &Entry : Legal->getInductionVars()) 4052 fixupIVUsers(Entry.first, Entry.second, 4053 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4054 IVEndValues[Entry.first], LoopMiddleBlock); 4055 4056 fixLCSSAPHIs(State); 4057 } 4058 4059 for (Instruction *PI : PredicatedInstructions) 4060 sinkScalarOperands(&*PI); 4061 4062 // Remove redundant induction instructions. 4063 cse(LoopVectorBody); 4064 4065 // Set/update profile weights for the vector and remainder loops as original 4066 // loop iterations are now distributed among them. Note that original loop 4067 // represented by LoopScalarBody becomes remainder loop after vectorization. 4068 // 4069 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 4070 // end up getting slightly roughened result but that should be OK since 4071 // profile is not inherently precise anyway. Note also possible bypass of 4072 // vector code caused by legality checks is ignored, assigning all the weight 4073 // to the vector loop, optimistically. 4074 // 4075 // For scalable vectorization we can't know at compile time how many iterations 4076 // of the loop are handled in one vector iteration, so instead assume a pessimistic 4077 // vscale of '1'. 4078 setProfileInfoAfterUnrolling( 4079 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 4080 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 4081 } 4082 4083 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 4084 // In order to support recurrences we need to be able to vectorize Phi nodes. 4085 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4086 // stage #2: We now need to fix the recurrences by adding incoming edges to 4087 // the currently empty PHI nodes. At this point every instruction in the 4088 // original loop is widened to a vector form so we can use them to construct 4089 // the incoming edges. 
4090 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock(); 4091 for (VPRecipeBase &R : Header->phis()) { 4092 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 4093 fixReduction(ReductionPhi, State); 4094 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 4095 fixFirstOrderRecurrence(FOR, State); 4096 } 4097 } 4098 4099 void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, 4100 VPTransformState &State) { 4101 // This is the second phase of vectorizing first-order recurrences. An 4102 // overview of the transformation is described below. Suppose we have the 4103 // following loop. 4104 // 4105 // for (int i = 0; i < n; ++i) 4106 // b[i] = a[i] - a[i - 1]; 4107 // 4108 // There is a first-order recurrence on "a". For this loop, the shorthand 4109 // scalar IR looks like: 4110 // 4111 // scalar.ph: 4112 // s_init = a[-1] 4113 // br scalar.body 4114 // 4115 // scalar.body: 4116 // i = phi [0, scalar.ph], [i+1, scalar.body] 4117 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4118 // s2 = a[i] 4119 // b[i] = s2 - s1 4120 // br cond, scalar.body, ... 4121 // 4122 // In this example, s1 is a recurrence because it's value depends on the 4123 // previous iteration. In the first phase of vectorization, we created a 4124 // vector phi v1 for s1. We now complete the vectorization and produce the 4125 // shorthand vector IR shown below (for VF = 4, UF = 1). 
4126 // 4127 // vector.ph: 4128 // v_init = vector(..., ..., ..., a[-1]) 4129 // br vector.body 4130 // 4131 // vector.body 4132 // i = phi [0, vector.ph], [i+4, vector.body] 4133 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4134 // v2 = a[i, i+1, i+2, i+3]; 4135 // v3 = vector(v1(3), v2(0, 1, 2)) 4136 // b[i, i+1, i+2, i+3] = v2 - v3 4137 // br cond, vector.body, middle.block 4138 // 4139 // middle.block: 4140 // x = v2(3) 4141 // br scalar.ph 4142 // 4143 // scalar.ph: 4144 // s_init = phi [x, middle.block], [a[-1], otherwise] 4145 // br scalar.body 4146 // 4147 // After execution completes the vector loop, we extract the next value of 4148 // the recurrence (x) to use as the initial value in the scalar loop. 4149 4150 // Extract the last vector element in the middle block. This will be the 4151 // initial value for the recurrence when jumping to the scalar loop. 4152 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4153 Value *Incoming = State.get(PreviousDef, UF - 1); 4154 auto *ExtractForScalar = Incoming; 4155 auto *IdxTy = Builder.getInt32Ty(); 4156 if (VF.isVector()) { 4157 auto *One = ConstantInt::get(IdxTy, 1); 4158 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4159 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4160 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4161 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4162 "vector.recur.extract"); 4163 } 4164 // Extract the second last element in the middle block if the 4165 // Phi is used outside the loop. We need to extract the phi itself 4166 // and not the last element (the phi update in the current iteration). This 4167 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4168 // when the scalar loop is not run at all. 
4169 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4170 if (VF.isVector()) { 4171 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4172 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4173 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4174 Incoming, Idx, "vector.recur.extract.for.phi"); 4175 } else if (UF > 1) 4176 // When loop is unrolled without vectorizing, initialize 4177 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4178 // of `Incoming`. This is analogous to the vectorized case above: extracting 4179 // the second last element when VF > 1. 4180 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4181 4182 // Fix the initial value of the original recurrence in the scalar loop. 4183 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4184 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 4185 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4186 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 4187 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4188 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4189 Start->addIncoming(Incoming, BB); 4190 } 4191 4192 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4193 Phi->setName("scalar.recur"); 4194 4195 // Finally, fix users of the recurrence outside the loop. The users will need 4196 // either the last value of the scalar recurrence or the last value of the 4197 // vector recurrence we extracted in the middle block. Since the loop is in 4198 // LCSSA form, we just need to find all the phi nodes for the original scalar 4199 // recurrence in the exit block, and then add an edge for the middle block. 
4200 // Note that LCSSA does not imply single entry when the original scalar loop 4201 // had multiple exiting edges (as we always run the last iteration in the 4202 // scalar epilogue); in that case, there is no edge from middle to exit and 4203 // and thus no phis which needed updated. 4204 if (!Cost->requiresScalarEpilogue(VF)) 4205 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4206 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) 4207 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4208 } 4209 4210 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, 4211 VPTransformState &State) { 4212 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 4213 // Get it's reduction variable descriptor. 4214 assert(Legal->isReductionVariable(OrigPhi) && 4215 "Unable to find the reduction variable"); 4216 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 4217 4218 RecurKind RK = RdxDesc.getRecurrenceKind(); 4219 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4220 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4221 setDebugLocFromInst(ReductionStartValue); 4222 4223 VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); 4224 // This is the vector-clone of the value that leaves the loop. 4225 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 4226 4227 // Wrap flags are in general invalid after vectorization, clear them. 4228 clearReductionWrapFlags(RdxDesc, State); 4229 4230 // Before each round, move the insertion point right between 4231 // the PHIs and the values we are going to write. 4232 // This allows us to write both PHINodes and the extractelement 4233 // instructions. 
4234 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4235 4236 setDebugLocFromInst(LoopExitInst); 4237 4238 Type *PhiTy = OrigPhi->getType(); 4239 // If tail is folded by masking, the vector value to leave the loop should be 4240 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4241 // instead of the former. For an inloop reduction the reduction will already 4242 // be predicated, and does not need to be handled here. 4243 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 4244 for (unsigned Part = 0; Part < UF; ++Part) { 4245 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4246 Value *Sel = nullptr; 4247 for (User *U : VecLoopExitInst->users()) { 4248 if (isa<SelectInst>(U)) { 4249 assert(!Sel && "Reduction exit feeding two selects"); 4250 Sel = U; 4251 } else 4252 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4253 } 4254 assert(Sel && "Reduction exit feeds no select"); 4255 State.reset(LoopExitInstDef, Sel, Part); 4256 4257 // If the target can create a predicated operator for the reduction at no 4258 // extra cost in the loop (for example a predicated vadd), it can be 4259 // cheaper for the select to remain in the loop than be sunk out of it, 4260 // and so use the select value for the phi instead of the old 4261 // LoopExitValue. 4262 if (PreferPredicatedReductionSelect || 4263 TTI->preferPredicatedReductionSelect( 4264 RdxDesc.getOpcode(), PhiTy, 4265 TargetTransformInfo::ReductionFlags())) { 4266 auto *VecRdxPhi = 4267 cast<PHINode>(State.get(PhiR, Part)); 4268 VecRdxPhi->setIncomingValueForBlock( 4269 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4270 } 4271 } 4272 } 4273 4274 // If the vector reduction can be performed in a smaller type, we truncate 4275 // then extend the loop exit value to enable InstCombine to evaluate the 4276 // entire expression in the smaller type. 
4277 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4278 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4279 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4280 Builder.SetInsertPoint( 4281 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4282 VectorParts RdxParts(UF); 4283 for (unsigned Part = 0; Part < UF; ++Part) { 4284 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4285 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4286 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4287 : Builder.CreateZExt(Trunc, VecTy); 4288 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 4289 if (U != Trunc) { 4290 U->replaceUsesOfWith(RdxParts[Part], Extnd); 4291 RdxParts[Part] = Extnd; 4292 } 4293 } 4294 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4295 for (unsigned Part = 0; Part < UF; ++Part) { 4296 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4297 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4298 } 4299 } 4300 4301 // Reduce all of the unrolled parts into a single vector. 4302 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4303 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4304 4305 // The middle block terminator has already been assigned a DebugLoc here (the 4306 // OrigLoop's single latch terminator). We want the whole middle block to 4307 // appear to execute on this line because: (a) it is all compiler generated, 4308 // (b) these instructions are always executed after evaluating the latch 4309 // conditional branch, and (c) other passes may add new predecessors which 4310 // terminate on this line. This is the easiest way to ensure we don't 4311 // accidentally cause an extra step back into the loop while debugging. 
4312 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4313 if (PhiR->isOrdered()) 4314 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4315 else { 4316 // Floating-point operations should have some FMF to enable the reduction. 4317 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4318 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4319 for (unsigned Part = 1; Part < UF; ++Part) { 4320 Value *RdxPart = State.get(LoopExitInstDef, Part); 4321 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4322 ReducedPartRdx = Builder.CreateBinOp( 4323 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4324 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4325 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4326 ReducedPartRdx, RdxPart); 4327 else 4328 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4329 } 4330 } 4331 4332 // Create the reduction after the loop. Note that inloop reductions create the 4333 // target reduction in the loop using a Reduction recipe. 4334 if (VF.isVector() && !PhiR->isInLoop()) { 4335 ReducedPartRdx = 4336 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 4337 // If the reduction can be performed in a smaller type, we need to extend 4338 // the reduction to the wider type before we branch to the original loop. 4339 if (PhiTy != RdxDesc.getRecurrenceType()) 4340 ReducedPartRdx = RdxDesc.isSigned() 4341 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4342 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4343 } 4344 4345 // Create a phi node that merges control-flow from the backedge-taken check 4346 // block and the middle block. 
4347 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4348 LoopScalarPreHeader->getTerminator()); 4349 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4350 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4351 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4352 4353 // Now, we need to fix the users of the reduction variable 4354 // inside and outside of the scalar remainder loop. 4355 4356 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4357 // in the exit blocks. See comment on analogous loop in 4358 // fixFirstOrderRecurrence for a more complete explaination of the logic. 4359 if (!Cost->requiresScalarEpilogue(VF)) 4360 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4361 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) 4362 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4363 4364 // Fix the scalar loop reduction variable with the incoming reduction sum 4365 // from the vector body and from the backedge value. 4366 int IncomingEdgeBlockIdx = 4367 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4368 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4369 // Pick the other block. 4370 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 4371 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4372 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4373 } 4374 4375 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 4376 VPTransformState &State) { 4377 RecurKind RK = RdxDesc.getRecurrenceKind(); 4378 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4379 return; 4380 4381 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4382 assert(LoopExitInstr && "null loop exit instruction"); 4383 SmallVector<Instruction *, 8> Worklist; 4384 SmallPtrSet<Instruction *, 8> Visited; 4385 Worklist.push_back(LoopExitInstr); 4386 Visited.insert(LoopExitInstr); 4387 4388 while (!Worklist.empty()) { 4389 Instruction *Cur = Worklist.pop_back_val(); 4390 if (isa<OverflowingBinaryOperator>(Cur)) 4391 for (unsigned Part = 0; Part < UF; ++Part) { 4392 // FIXME: Should not rely on getVPValue at this point. 4393 Value *V = State.get(State.Plan->getVPValue(Cur, true), Part); 4394 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4395 } 4396 4397 for (User *U : Cur->users()) { 4398 Instruction *UI = cast<Instruction>(U); 4399 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4400 Visited.insert(UI).second) 4401 Worklist.push_back(UI); 4402 } 4403 } 4404 } 4405 4406 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4407 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4408 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4409 // Some phis were already hand updated by the reduction and recurrence 4410 // code above, leave them alone. 4411 continue; 4412 4413 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4414 // Non-instruction incoming values will have only one value. 
4415 4416 VPLane Lane = VPLane::getFirstLane(); 4417 if (isa<Instruction>(IncomingValue) && 4418 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4419 VF)) 4420 Lane = VPLane::getLastLaneForVF(VF); 4421 4422 // Can be a loop invariant incoming value or the last scalar value to be 4423 // extracted from the vectorized loop. 4424 // FIXME: Should not rely on getVPValue at this point. 4425 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4426 Value *lastIncomingValue = 4427 OrigLoop->isLoopInvariant(IncomingValue) 4428 ? IncomingValue 4429 : State.get(State.Plan->getVPValue(IncomingValue, true), 4430 VPIteration(UF - 1, Lane)); 4431 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4432 } 4433 } 4434 4435 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4436 // The basic block and loop containing the predicated instruction. 4437 auto *PredBB = PredInst->getParent(); 4438 auto *VectorLoop = LI->getLoopFor(PredBB); 4439 4440 // Initialize a worklist with the operands of the predicated instruction. 4441 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4442 4443 // Holds instructions that we need to analyze again. An instruction may be 4444 // reanalyzed if we don't yet know if we can sink it or not. 4445 SmallVector<Instruction *, 8> InstsToReanalyze; 4446 4447 // Returns true if a given use occurs in the predicated block. Phi nodes use 4448 // their operands in their corresponding predecessor blocks. 4449 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4450 auto *I = cast<Instruction>(U.getUser()); 4451 BasicBlock *BB = I->getParent(); 4452 if (auto *Phi = dyn_cast<PHINode>(I)) 4453 BB = Phi->getIncomingBlock( 4454 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4455 return BB == PredBB; 4456 }; 4457 4458 // Iteratively sink the scalarized operands of the predicated instruction 4459 // into the block we created for it. 
When an instruction is sunk, it's 4460 // operands are then added to the worklist. The algorithm ends after one pass 4461 // through the worklist doesn't sink a single instruction. 4462 bool Changed; 4463 do { 4464 // Add the instructions that need to be reanalyzed to the worklist, and 4465 // reset the changed indicator. 4466 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4467 InstsToReanalyze.clear(); 4468 Changed = false; 4469 4470 while (!Worklist.empty()) { 4471 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4472 4473 // We can't sink an instruction if it is a phi node, is not in the loop, 4474 // or may have side effects. 4475 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4476 I->mayHaveSideEffects()) 4477 continue; 4478 4479 // If the instruction is already in PredBB, check if we can sink its 4480 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4481 // sinking the scalar instruction I, hence it appears in PredBB; but it 4482 // may have failed to sink I's operands (recursively), which we try 4483 // (again) here. 4484 if (I->getParent() == PredBB) { 4485 Worklist.insert(I->op_begin(), I->op_end()); 4486 continue; 4487 } 4488 4489 // It's legal to sink the instruction if all its uses occur in the 4490 // predicated block. Otherwise, there's nothing to do yet, and we may 4491 // need to reanalyze the instruction. 4492 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4493 InstsToReanalyze.push_back(I); 4494 continue; 4495 } 4496 4497 // Move the instruction to the beginning of the predicated block, and add 4498 // it's operands to the worklist. 4499 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4500 Worklist.insert(I->op_begin(), I->op_end()); 4501 4502 // The sinking may have enabled other instructions to be sunk, so we will 4503 // need to iterate. 
4504 Changed = true; 4505 } 4506 } while (Changed); 4507 } 4508 4509 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4510 for (PHINode *OrigPhi : OrigPHIsToFix) { 4511 VPWidenPHIRecipe *VPPhi = 4512 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4513 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4514 // Make sure the builder has a valid insert point. 4515 Builder.SetInsertPoint(NewPhi); 4516 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4517 VPValue *Inc = VPPhi->getIncomingValue(i); 4518 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4519 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4520 } 4521 } 4522 } 4523 4524 bool InnerLoopVectorizer::useOrderedReductions( 4525 const RecurrenceDescriptor &RdxDesc) { 4526 return Cost->useOrderedReductions(RdxDesc); 4527 } 4528 4529 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4530 VPWidenPHIRecipe *PhiR, 4531 VPTransformState &State) { 4532 PHINode *P = cast<PHINode>(PN); 4533 if (EnableVPlanNativePath) { 4534 // Currently we enter here in the VPlan-native path for non-induction 4535 // PHIs where all control flow is uniform. We simply widen these PHIs. 4536 // Create a vector phi with no operands - the vector phi operands will be 4537 // set at the end of vector code generation. 4538 Type *VecTy = (State.VF.isScalar()) 4539 ? PN->getType() 4540 : VectorType::get(PN->getType(), State.VF); 4541 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4542 State.set(PhiR, VecPhi, 0); 4543 OrigPHIsToFix.push_back(P); 4544 4545 return; 4546 } 4547 4548 assert(PN->getParent() == OrigLoop->getHeader() && 4549 "Non-header phis should have been handled elsewhere"); 4550 4551 // In order to support recurrences we need to be able to vectorize Phi nodes. 4552 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4553 // stage #1: We create a new vector PHI node with no incoming edges. 
We'll use 4554 // this value when we vectorize all of the instructions that use the PHI. 4555 4556 assert(!Legal->isReductionVariable(P) && 4557 "reductions should be handled elsewhere"); 4558 4559 setDebugLocFromInst(P); 4560 4561 // This PHINode must be an induction variable. 4562 // Make sure that we know about it. 4563 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4564 4565 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4566 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4567 4568 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4569 // which can be found from the original scalar operations. 4570 switch (II.getKind()) { 4571 case InductionDescriptor::IK_NoInduction: 4572 llvm_unreachable("Unknown induction"); 4573 case InductionDescriptor::IK_IntInduction: 4574 case InductionDescriptor::IK_FpInduction: 4575 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4576 case InductionDescriptor::IK_PtrInduction: { 4577 // Handle the pointer induction variable case. 4578 assert(P->getType()->isPointerTy() && "Unexpected type."); 4579 4580 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4581 // This is the normalized GEP that starts counting at zero. 4582 Value *PtrInd = 4583 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4584 // Determine the number of scalars we need to generate for each unroll 4585 // iteration. If the instruction is uniform, we only need to generate the 4586 // first lane. Otherwise, we generate all VF values. 4587 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4588 assert((IsUniform || !State.VF.isScalable()) && 4589 "Cannot scalarize a scalable VF"); 4590 unsigned Lanes = IsUniform ? 
1 : State.VF.getFixedValue(); 4591 4592 for (unsigned Part = 0; Part < UF; ++Part) { 4593 Value *PartStart = 4594 createStepForVF(Builder, PtrInd->getType(), VF, Part); 4595 4596 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4597 Value *Idx = Builder.CreateAdd( 4598 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4599 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4600 Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), 4601 DL, II, State.CFG.PrevBB); 4602 SclrGep->setName("next.gep"); 4603 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4604 } 4605 } 4606 return; 4607 } 4608 assert(isa<SCEVConstant>(II.getStep()) && 4609 "Induction step not a SCEV constant!"); 4610 Type *PhiType = II.getStep()->getType(); 4611 4612 // Build a pointer phi 4613 Value *ScalarStartValue = II.getStartValue(); 4614 Type *ScStValueType = ScalarStartValue->getType(); 4615 PHINode *NewPointerPhi = 4616 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4617 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4618 4619 // A pointer induction, performed by using a gep 4620 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4621 Instruction *InductionLoc = LoopLatch->getTerminator(); 4622 const SCEV *ScalarStep = II.getStep(); 4623 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4624 Value *ScalarStepValue = 4625 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4626 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4627 Value *NumUnrolledElems = 4628 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4629 Value *InductionGEP = GetElementPtrInst::Create( 4630 II.getElementType(), NewPointerPhi, 4631 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4632 InductionLoc); 4633 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4634 4635 // Create UF many actual address geps that use the pointer 4636 // phi as base and a vectorized version of the step value 4637 // (<step*0, 
..., step*N>) as offset. 4638 for (unsigned Part = 0; Part < State.UF; ++Part) { 4639 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4640 Value *StartOffsetScalar = 4641 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4642 Value *StartOffset = 4643 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4644 // Create a vector of consecutive numbers from zero to VF. 4645 StartOffset = 4646 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4647 4648 Value *GEP = Builder.CreateGEP( 4649 II.getElementType(), NewPointerPhi, 4650 Builder.CreateMul( 4651 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4652 "vector.gep")); 4653 State.set(PhiR, GEP, Part); 4654 } 4655 } 4656 } 4657 } 4658 4659 /// A helper function for checking whether an integer division-related 4660 /// instruction may divide by zero (in which case it must be predicated if 4661 /// executed conditionally in the scalar code). 4662 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4663 /// Non-zero divisors that are non compile-time constants will not be 4664 /// converted into multiplication, so we will still end up scalarizing 4665 /// the division, but can do so w/o predication. 
4666 static bool mayDivideByZero(Instruction &I) { 4667 assert((I.getOpcode() == Instruction::UDiv || 4668 I.getOpcode() == Instruction::SDiv || 4669 I.getOpcode() == Instruction::URem || 4670 I.getOpcode() == Instruction::SRem) && 4671 "Unexpected instruction"); 4672 Value *Divisor = I.getOperand(1); 4673 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4674 return !CInt || CInt->isZero(); 4675 } 4676 4677 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4678 VPUser &ArgOperands, 4679 VPTransformState &State) { 4680 assert(!isa<DbgInfoIntrinsic>(I) && 4681 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4682 setDebugLocFromInst(&I); 4683 4684 Module *M = I.getParent()->getParent()->getParent(); 4685 auto *CI = cast<CallInst>(&I); 4686 4687 SmallVector<Type *, 4> Tys; 4688 for (Value *ArgOperand : CI->args()) 4689 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4690 4691 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4692 4693 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4694 // version of the instruction. 4695 // Is it beneficial to perform intrinsic call compared to lib call? 4696 bool NeedToScalarize = false; 4697 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4698 InstructionCost IntrinsicCost = ID ? 
Cost->getVectorIntrinsicCost(CI, VF) : 0; 4699 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4700 assert((UseVectorIntrinsic || !NeedToScalarize) && 4701 "Instruction should be scalarized elsewhere."); 4702 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4703 "Either the intrinsic cost or vector call cost must be valid"); 4704 4705 for (unsigned Part = 0; Part < UF; ++Part) { 4706 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4707 SmallVector<Value *, 4> Args; 4708 for (auto &I : enumerate(ArgOperands.operands())) { 4709 // Some intrinsics have a scalar argument - don't replace it with a 4710 // vector. 4711 Value *Arg; 4712 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4713 Arg = State.get(I.value(), Part); 4714 else { 4715 Arg = State.get(I.value(), VPIteration(0, 0)); 4716 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 4717 TysForDecl.push_back(Arg->getType()); 4718 } 4719 Args.push_back(Arg); 4720 } 4721 4722 Function *VectorF; 4723 if (UseVectorIntrinsic) { 4724 // Use vector version of the intrinsic. 4725 if (VF.isVector()) 4726 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4727 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4728 assert(VectorF && "Can't retrieve vector intrinsic."); 4729 } else { 4730 // Use vector version of the function call. 
4731 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4732 #ifndef NDEBUG 4733 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4734 "Can't create vector function."); 4735 #endif 4736 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4737 } 4738 SmallVector<OperandBundleDef, 1> OpBundles; 4739 CI->getOperandBundlesAsDefs(OpBundles); 4740 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4741 4742 if (isa<FPMathOperator>(V)) 4743 V->copyFastMathFlags(CI); 4744 4745 State.set(Def, V, Part); 4746 addMetadata(V, &I); 4747 } 4748 } 4749 4750 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4751 // We should not collect Scalars more than once per VF. Right now, this 4752 // function is called from collectUniformsAndScalars(), which already does 4753 // this check. Collecting Scalars for VF=1 does not make any sense. 4754 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4755 "This function should not be visited twice for the same VF"); 4756 4757 SmallSetVector<Instruction *, 8> Worklist; 4758 4759 // These sets are used to seed the analysis with pointers used by memory 4760 // accesses that will remain scalar. 4761 SmallSetVector<Instruction *, 8> ScalarPtrs; 4762 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4763 auto *Latch = TheLoop->getLoopLatch(); 4764 4765 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4766 // The pointer operands of loads and stores will be scalar as long as the 4767 // memory access is not a gather or scatter operation. The value operand of a 4768 // store will remain scalar if the store is scalarized. 
4769 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4770 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4771 assert(WideningDecision != CM_Unknown && 4772 "Widening decision should be ready at this moment"); 4773 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4774 if (Ptr == Store->getValueOperand()) 4775 return WideningDecision == CM_Scalarize; 4776 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4777 "Ptr is neither a value or pointer operand"); 4778 return WideningDecision != CM_GatherScatter; 4779 }; 4780 4781 // A helper that returns true if the given value is a bitcast or 4782 // getelementptr instruction contained in the loop. 4783 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4784 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4785 isa<GetElementPtrInst>(V)) && 4786 !TheLoop->isLoopInvariant(V); 4787 }; 4788 4789 // A helper that evaluates a memory access's use of a pointer. If the use will 4790 // be a scalar use and the pointer is only used by memory accesses, we place 4791 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4792 // PossibleNonScalarPtrs. 4793 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4794 // We only care about bitcast and getelementptr instructions contained in 4795 // the loop. 4796 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4797 return; 4798 4799 // If the pointer has already been identified as scalar (e.g., if it was 4800 // also identified as uniform), there's nothing to do. 4801 auto *I = cast<Instruction>(Ptr); 4802 if (Worklist.count(I)) 4803 return; 4804 4805 // If the use of the pointer will be a scalar use, and all users of the 4806 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4807 // place the pointer in PossibleNonScalarPtrs. 
4808 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4809 return isa<LoadInst>(U) || isa<StoreInst>(U); 4810 })) 4811 ScalarPtrs.insert(I); 4812 else 4813 PossibleNonScalarPtrs.insert(I); 4814 }; 4815 4816 // We seed the scalars analysis with three classes of instructions: (1) 4817 // instructions marked uniform-after-vectorization and (2) bitcast, 4818 // getelementptr and (pointer) phi instructions used by memory accesses 4819 // requiring a scalar use. 4820 // 4821 // (1) Add to the worklist all instructions that have been identified as 4822 // uniform-after-vectorization. 4823 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4824 4825 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4826 // memory accesses requiring a scalar use. The pointer operands of loads and 4827 // stores will be scalar as long as the memory accesses is not a gather or 4828 // scatter operation. The value operand of a store will remain scalar if the 4829 // store is scalarized. 4830 for (auto *BB : TheLoop->blocks()) 4831 for (auto &I : *BB) { 4832 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4833 evaluatePtrUse(Load, Load->getPointerOperand()); 4834 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4835 evaluatePtrUse(Store, Store->getPointerOperand()); 4836 evaluatePtrUse(Store, Store->getValueOperand()); 4837 } 4838 } 4839 for (auto *I : ScalarPtrs) 4840 if (!PossibleNonScalarPtrs.count(I)) { 4841 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4842 Worklist.insert(I); 4843 } 4844 4845 // Insert the forced scalars. 4846 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4847 // induction variable when the PHI user is scalarized. 
4848 auto ForcedScalar = ForcedScalars.find(VF); 4849 if (ForcedScalar != ForcedScalars.end()) 4850 for (auto *I : ForcedScalar->second) 4851 Worklist.insert(I); 4852 4853 // Expand the worklist by looking through any bitcasts and getelementptr 4854 // instructions we've already identified as scalar. This is similar to the 4855 // expansion step in collectLoopUniforms(); however, here we're only 4856 // expanding to include additional bitcasts and getelementptr instructions. 4857 unsigned Idx = 0; 4858 while (Idx != Worklist.size()) { 4859 Instruction *Dst = Worklist[Idx++]; 4860 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4861 continue; 4862 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4863 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4864 auto *J = cast<Instruction>(U); 4865 return !TheLoop->contains(J) || Worklist.count(J) || 4866 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4867 isScalarUse(J, Src)); 4868 })) { 4869 Worklist.insert(Src); 4870 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4871 } 4872 } 4873 4874 // An induction variable will remain scalar if all users of the induction 4875 // variable and induction variable update remain scalar. 4876 for (auto &Induction : Legal->getInductionVars()) { 4877 auto *Ind = Induction.first; 4878 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4879 4880 // If tail-folding is applied, the primary induction variable will be used 4881 // to feed a vector compare. 4882 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4883 continue; 4884 4885 // Returns true if \p Indvar is a pointer induction that is used directly by 4886 // load/store instruction \p I. 
4887 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4888 Instruction *I) { 4889 return Induction.second.getKind() == 4890 InductionDescriptor::IK_PtrInduction && 4891 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4892 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4893 }; 4894 4895 // Determine if all users of the induction variable are scalar after 4896 // vectorization. 4897 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4898 auto *I = cast<Instruction>(U); 4899 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4900 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4901 }); 4902 if (!ScalarInd) 4903 continue; 4904 4905 // Determine if all users of the induction variable update instruction are 4906 // scalar after vectorization. 4907 auto ScalarIndUpdate = 4908 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4909 auto *I = cast<Instruction>(U); 4910 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4911 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4912 }); 4913 if (!ScalarIndUpdate) 4914 continue; 4915 4916 // The induction variable and its update instruction will remain scalar. 
4917 Worklist.insert(Ind); 4918 Worklist.insert(IndUpdate); 4919 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4920 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4921 << "\n"); 4922 } 4923 4924 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4925 } 4926 4927 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { 4928 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4929 return false; 4930 switch(I->getOpcode()) { 4931 default: 4932 break; 4933 case Instruction::Load: 4934 case Instruction::Store: { 4935 if (!Legal->isMaskRequired(I)) 4936 return false; 4937 auto *Ptr = getLoadStorePointerOperand(I); 4938 auto *Ty = getLoadStoreType(I); 4939 const Align Alignment = getLoadStoreAlignment(I); 4940 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4941 TTI.isLegalMaskedGather(Ty, Alignment)) 4942 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4943 TTI.isLegalMaskedScatter(Ty, Alignment)); 4944 } 4945 case Instruction::UDiv: 4946 case Instruction::SDiv: 4947 case Instruction::SRem: 4948 case Instruction::URem: 4949 return mayDivideByZero(*I); 4950 } 4951 return false; 4952 } 4953 4954 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4955 Instruction *I, ElementCount VF) { 4956 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4957 assert(getWideningDecision(I, VF) == CM_Unknown && 4958 "Decision should not be set yet."); 4959 auto *Group = getInterleavedAccessGroup(I); 4960 assert(Group && "Must have a group."); 4961 4962 // If the instruction's allocated size doesn't equal it's type size, it 4963 // requires padding and will be scalarized. 4964 auto &DL = I->getModule()->getDataLayout(); 4965 auto *ScalarTy = getLoadStoreType(I); 4966 if (hasIrregularType(ScalarTy, DL)) 4967 return false; 4968 4969 // Check if masking is required. 
4970 // A Group may need masking for one of two reasons: it resides in a block that 4971 // needs predication, or it was decided to use masking to deal with gaps 4972 // (either a gap at the end of a load-access that may result in a speculative 4973 // load, or any gaps in a store-access). 4974 bool PredicatedAccessRequiresMasking = 4975 blockNeedsPredicationForAnyReason(I->getParent()) && 4976 Legal->isMaskRequired(I); 4977 bool LoadAccessWithGapsRequiresEpilogMasking = 4978 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4979 !isScalarEpilogueAllowed(); 4980 bool StoreAccessWithGapsRequiresMasking = 4981 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4982 if (!PredicatedAccessRequiresMasking && 4983 !LoadAccessWithGapsRequiresEpilogMasking && 4984 !StoreAccessWithGapsRequiresMasking) 4985 return true; 4986 4987 // If masked interleaving is required, we expect that the user/target had 4988 // enabled it, because otherwise it either wouldn't have been created or 4989 // it should have been invalidated by the CostModel. 4990 assert(useMaskedInterleavedAccesses(TTI) && 4991 "Masked interleave-groups for predicated accesses are not enabled."); 4992 4993 if (Group->isReverse()) 4994 return false; 4995 4996 auto *Ty = getLoadStoreType(I); 4997 const Align Alignment = getLoadStoreAlignment(I); 4998 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4999 : TTI.isLegalMaskedStore(Ty, Alignment); 5000 } 5001 5002 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5003 Instruction *I, ElementCount VF) { 5004 // Get and ensure we have a valid memory instruction. 5005 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 5006 5007 auto *Ptr = getLoadStorePointerOperand(I); 5008 auto *ScalarTy = getLoadStoreType(I); 5009 5010 // In order to be widened, the pointer should be consecutive, first of all. 
5011 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 5012 return false; 5013 5014 // If the instruction is a store located in a predicated block, it will be 5015 // scalarized. 5016 if (isScalarWithPredication(I)) 5017 return false; 5018 5019 // If the instruction's allocated size doesn't equal it's type size, it 5020 // requires padding and will be scalarized. 5021 auto &DL = I->getModule()->getDataLayout(); 5022 if (hasIrregularType(ScalarTy, DL)) 5023 return false; 5024 5025 return true; 5026 } 5027 5028 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5029 // We should not collect Uniforms more than once per VF. Right now, 5030 // this function is called from collectUniformsAndScalars(), which 5031 // already does this check. Collecting Uniforms for VF=1 does not make any 5032 // sense. 5033 5034 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5035 "This function should not be visited twice for the same VF"); 5036 5037 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5038 // not analyze again. Uniforms.count(VF) will return 1. 5039 Uniforms[VF].clear(); 5040 5041 // We now know that the loop is vectorizable! 5042 // Collect instructions inside the loop that will remain uniform after 5043 // vectorization. 5044 5045 // Global values, params and instructions outside of current loop are out of 5046 // scope. 5047 auto isOutOfScope = [&](Value *V) -> bool { 5048 Instruction *I = dyn_cast<Instruction>(V); 5049 return (!I || !TheLoop->contains(I)); 5050 }; 5051 5052 // Worklist containing uniform instructions demanding lane 0. 5053 SetVector<Instruction *> Worklist; 5054 BasicBlock *Latch = TheLoop->getLoopLatch(); 5055 5056 // Add uniform instructions demanding lane 0 to the worklist. 
Instructions 5057 // that are scalar with predication must not be considered uniform after 5058 // vectorization, because that would create an erroneous replicating region 5059 // where only a single instance out of VF should be formed. 5060 // TODO: optimize such seldom cases if found important, see PR40816. 5061 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5062 if (isOutOfScope(I)) { 5063 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5064 << *I << "\n"); 5065 return; 5066 } 5067 if (isScalarWithPredication(I)) { 5068 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5069 << *I << "\n"); 5070 return; 5071 } 5072 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5073 Worklist.insert(I); 5074 }; 5075 5076 // Start with the conditional branch. If the branch condition is an 5077 // instruction contained in the loop that is only used by the branch, it is 5078 // uniform. 5079 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5080 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5081 addToWorklistIfAllowed(Cmp); 5082 5083 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5084 InstWidening WideningDecision = getWideningDecision(I, VF); 5085 assert(WideningDecision != CM_Unknown && 5086 "Widening decision should be ready at this moment"); 5087 5088 // A uniform memory op is itself uniform. We exclude uniform stores 5089 // here as they demand the last lane, not the first one. 5090 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5091 assert(WideningDecision == CM_Scalarize); 5092 return true; 5093 } 5094 5095 return (WideningDecision == CM_Widen || 5096 WideningDecision == CM_Widen_Reverse || 5097 WideningDecision == CM_Interleave); 5098 }; 5099 5100 5101 // Returns true if Ptr is the pointer operand of a memory access instruction 5102 // I, and I is known to not require scalarization. 
5103 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5104 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5105 }; 5106 5107 // Holds a list of values which are known to have at least one uniform use. 5108 // Note that there may be other uses which aren't uniform. A "uniform use" 5109 // here is something which only demands lane 0 of the unrolled iterations; 5110 // it does not imply that all lanes produce the same value (e.g. this is not 5111 // the usual meaning of uniform) 5112 SetVector<Value *> HasUniformUse; 5113 5114 // Scan the loop for instructions which are either a) known to have only 5115 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5116 for (auto *BB : TheLoop->blocks()) 5117 for (auto &I : *BB) { 5118 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5119 switch (II->getIntrinsicID()) { 5120 case Intrinsic::sideeffect: 5121 case Intrinsic::experimental_noalias_scope_decl: 5122 case Intrinsic::assume: 5123 case Intrinsic::lifetime_start: 5124 case Intrinsic::lifetime_end: 5125 if (TheLoop->hasLoopInvariantOperands(&I)) 5126 addToWorklistIfAllowed(&I); 5127 break; 5128 default: 5129 break; 5130 } 5131 } 5132 5133 // ExtractValue instructions must be uniform, because the operands are 5134 // known to be loop-invariant. 5135 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5136 assert(isOutOfScope(EVI->getAggregateOperand()) && 5137 "Expected aggregate value to be loop invariant"); 5138 addToWorklistIfAllowed(EVI); 5139 continue; 5140 } 5141 5142 // If there's no pointer operand, there's nothing to do. 5143 auto *Ptr = getLoadStorePointerOperand(&I); 5144 if (!Ptr) 5145 continue; 5146 5147 // A uniform memory op is itself uniform. We exclude uniform stores 5148 // here as they demand the last lane, not the first one. 
5149 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5150 addToWorklistIfAllowed(&I); 5151 5152 if (isUniformDecision(&I, VF)) { 5153 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5154 HasUniformUse.insert(Ptr); 5155 } 5156 } 5157 5158 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5159 // demanding) users. Since loops are assumed to be in LCSSA form, this 5160 // disallows uses outside the loop as well. 5161 for (auto *V : HasUniformUse) { 5162 if (isOutOfScope(V)) 5163 continue; 5164 auto *I = cast<Instruction>(V); 5165 auto UsersAreMemAccesses = 5166 llvm::all_of(I->users(), [&](User *U) -> bool { 5167 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5168 }); 5169 if (UsersAreMemAccesses) 5170 addToWorklistIfAllowed(I); 5171 } 5172 5173 // Expand Worklist in topological order: whenever a new instruction 5174 // is added , its users should be already inside Worklist. It ensures 5175 // a uniform instruction will only be used by uniform instructions. 5176 unsigned idx = 0; 5177 while (idx != Worklist.size()) { 5178 Instruction *I = Worklist[idx++]; 5179 5180 for (auto OV : I->operand_values()) { 5181 // isOutOfScope operands cannot be uniform instructions. 5182 if (isOutOfScope(OV)) 5183 continue; 5184 // First order recurrence Phi's should typically be considered 5185 // non-uniform. 5186 auto *OP = dyn_cast<PHINode>(OV); 5187 if (OP && Legal->isFirstOrderRecurrence(OP)) 5188 continue; 5189 // If all the users of the operand are uniform, then add the 5190 // operand into the uniform worklist. 5191 auto *OI = cast<Instruction>(OV); 5192 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5193 auto *J = cast<Instruction>(U); 5194 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5195 })) 5196 addToWorklistIfAllowed(OI); 5197 } 5198 } 5199 5200 // For an instruction to be added into Worklist above, all its users inside 5201 // the loop should also be in Worklist. 
However, this condition cannot be 5202 // true for phi nodes that form a cyclic dependence. We must process phi 5203 // nodes separately. An induction variable will remain uniform if all users 5204 // of the induction variable and induction variable update remain uniform. 5205 // The code below handles both pointer and non-pointer induction variables. 5206 for (auto &Induction : Legal->getInductionVars()) { 5207 auto *Ind = Induction.first; 5208 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5209 5210 // Determine if all users of the induction variable are uniform after 5211 // vectorization. 5212 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5213 auto *I = cast<Instruction>(U); 5214 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5215 isVectorizedMemAccessUse(I, Ind); 5216 }); 5217 if (!UniformInd) 5218 continue; 5219 5220 // Determine if all users of the induction variable update instruction are 5221 // uniform after vectorization. 5222 auto UniformIndUpdate = 5223 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5224 auto *I = cast<Instruction>(U); 5225 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5226 isVectorizedMemAccessUse(I, IndUpdate); 5227 }); 5228 if (!UniformIndUpdate) 5229 continue; 5230 5231 // The induction variable and its update instruction will remain uniform. 5232 addToWorklistIfAllowed(Ind); 5233 addToWorklistIfAllowed(IndUpdate); 5234 } 5235 5236 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5237 } 5238 5239 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5240 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5241 5242 if (Legal->getRuntimePointerChecking()->Need) { 5243 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5244 "runtime pointer checks needed. 
Enable vectorization of this " 5245 "loop with '#pragma clang loop vectorize(enable)' when " 5246 "compiling with -Os/-Oz", 5247 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5248 return true; 5249 } 5250 5251 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5252 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5253 "runtime SCEV checks needed. Enable vectorization of this " 5254 "loop with '#pragma clang loop vectorize(enable)' when " 5255 "compiling with -Os/-Oz", 5256 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5257 return true; 5258 } 5259 5260 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5261 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5262 reportVectorizationFailure("Runtime stride check for small trip count", 5263 "runtime stride == 1 checks needed. Enable vectorization of " 5264 "this loop without such check by compiling with -Os/-Oz", 5265 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5266 return true; 5267 } 5268 5269 return false; 5270 } 5271 5272 ElementCount 5273 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5274 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 5275 return ElementCount::getScalable(0); 5276 5277 if (Hints->isScalableVectorizationDisabled()) { 5278 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5279 "ScalableVectorizationDisabled", ORE, TheLoop); 5280 return ElementCount::getScalable(0); 5281 } 5282 5283 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 5284 5285 auto MaxScalableVF = ElementCount::getScalable( 5286 std::numeric_limits<ElementCount::ScalarTy>::max()); 5287 5288 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 
  // FIXME: While for scalable vectors this is currently sufficient, this should
  // be replaced by a more detailed mechanism that filters out specific VFs,
  // instead of invalidating vectorization for a whole set of VFs based on the
  // MaxVF.

  // Disable scalable vectorization if the loop contains unsupported reductions.
  if (!canVectorizeReductions(MaxScalableVF)) {
    reportVectorizationInfo(
        "Scalable vectorization not supported for the reduction "
        "operations found in this loop.",
        "ScalableVFUnfeasible", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  // Disable scalable vectorization if the loop contains any instructions
  // with element types not supported for scalable vectors.
  if (any_of(ElementTypesInLoop, [&](Type *Ty) {
        return !Ty->isVoidTy() &&
               !this->TTI.isElementTypeLegalForScalableVector(Ty);
      })) {
    reportVectorizationInfo("Scalable vectorization is not supported "
                            "for all element types found in this loop.",
                            "ScalableVFUnfeasible", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  // No dependence-distance limit: any width is safe.
  if (Legal->isSafeForAnyVectorWidth())
    return MaxScalableVF;

  // Limit MaxScalableVF by the maximum safe dependence distance.
  // Prefer the target's max vscale; fall back on the function's
  // vscale_range attribute when the target does not provide one.
  Optional<unsigned> MaxVScale = TTI.getMaxVScale();
  if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
    MaxVScale =
        TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
  // With no vscale bound at all, no scalable VF can be proven safe here.
  MaxScalableVF = ElementCount::getScalable(
      MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
  if (!MaxScalableVF)
    reportVectorizationInfo(
        "Max legal vector width too small, scalable vectorization "
        "unfeasible.",
        "ScalableVFUnfeasible", ORE, TheLoop);

  return MaxScalableVF;
}

// Computes the maximum feasible fixed and scalable VFs for this loop,
// taking into account type widths, the LAA dependence-distance bound, and a
// user-specified VF (which is honored when safe, clamped or ignored with a
// remark when not).
FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
    unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeElements =
      PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);

  auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);

  LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                    << ".\n");
  LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
                    << ".\n");

  // First analyze the UserVF, fall back if the UserVF should be ignored.
  if (UserVF) {
    // Compare the user's VF against the bound of the matching kind.
    auto MaxSafeUserVF =
        UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;

    if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
      // If `VF=vscale x N` is safe, then so is `VF=N`
      if (UserVF.isScalable())
        return FixedScalableVFPair(
            ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
      else
        return UserVF;
    }

    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));

    // Only clamp if the UserVF is not scalable.
    // If the UserVF is scalable, it
    // is better to ignore the hint and let the compiler choose a suitable VF.
    if (!UserVF.isScalable()) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe, clamping to max safe VF="
                        << MaxSafeFixedVF << ".\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe, clamping to maximum safe vectorization factor "
               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
      });
      return MaxSafeFixedVF;
    }

    // Scalable UserVF that is not supported or not safe: emit a remark and
    // fall through to the automatic VF selection below.
    if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is ignored because scalable vectors are not "
                           "available.\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is ignored because the target does not support scalable "
                  "vectors. The compiler will pick a more suitable value.";
      });
    } else {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe. Ignoring scalable UserVF.\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe. Ignoring the hint to let the compiler pick a "
                  "more suitable value.";
      });
    }
  }

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");

  // Compute the maximized fixed and scalable VFs independently; either half
  // of the pair may end up disabled.
  FixedScalableVFPair Result(ElementCount::getFixed(1),
                             ElementCount::getScalable(0));
  if (auto MaxVF =
          getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
                                  MaxSafeFixedVF, FoldTailByMasking))
    Result.FixedVF = MaxVF;

  if (auto MaxVF =
          getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
                                  MaxSafeScalableVF, FoldTailByMasking))
    // getMaximizedVFForTarget may fall back to a fixed VF; only record a
    // scalable result here.
    if (MaxVF.isScalable()) {
      Result.ScalableVF = MaxVF;
      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
                        << "\n");
    }

  return Result;
}

// Top-level driver for choosing the maximum VF pair for this loop. Decides
// between a scalar epilogue and tail folding by masking, and bails out (with
// a remark) for loops that cannot be handled.
FixedScalableVFPair
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do since it's still likely to be dynamically
    // uniform if the target can skip.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    // Normal path: vectorize with a scalar remainder loop.
    return computeFeasibleMaxVF(TC, UserVF, false);
  case CM_ScalarEpilogueNotAllowedUsePredicate:
    LLVM_FALLTHROUGH;
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return FixedScalableVFPair::getNone();

    break;
  }

  // The only loops we can vectorize without a scalar epilogue, are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration. This will
  // require a lane mask which varies through the vector loop body. (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fallback to a vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF, false);
    }
    return FixedScalableVFPair::getNone();
  }

  // Now try the tail folding

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we chose.
  // FIXME: The condition below pessimises the case for fixed-width vectors,
  // when scalable VFs are also candidates for vectorization.
  if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
    ElementCount MaxFixedVF = MaxFactors.FixedVF;
    assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
           "MaxFixedVF must be a power of 2");
    unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
                                   : MaxFixedVF.getFixedValue();
    // Use SCEV (with loop guards applied) to prove ExitCount % (VF * IC) == 0,
    // in which case no tail remains for any smaller power-of-two VF either.
    ScalarEvolution *SE = PSE.getSE();
    const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
    const SCEV *ExitCount = SE->getAddExpr(
        BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
    const SCEV *Rem = SE->getURemExpr(
        SE->applyLoopGuards(ExitCount, TheLoop),
        SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
    if (Rem->isZero()) {
      // Accept MaxFixedVF if we do not have a tail.
      LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
      return MaxFactors;
    }
  }

  // For scalable vectors, don't use tail folding as this is currently not yet
  // supported. The code is likely to have ended up here if the tripcount is
  // low, in which case it makes sense not to use scalable vectors.
  if (MaxFactors.ScalableVF.isVector())
    MaxFactors.ScalableVF = ElementCount::getScalable(0);

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxFactors;
  }

  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fallback to a vectorization with a scalar epilogue.
  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                         "scalar epilogue instead.\n");
    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
    return MaxFactors;
  }

  // A hard requirement to fold the tail that cannot be satisfied: give up.
  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
    return FixedScalableVFPair::getNone();
  }

  if (TC == 0) {
    reportVectorizationFailure(
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
  return FixedScalableVFPair::getNone();
}

// Maximizes the VF of the kind implied by MaxSafeVF (fixed or scalable) for
// the target: clamps to register width and the safe dependence bound, caps
// at a known-constant trip count, and optionally widens further when the
// target wants maximum vector bandwidth.
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
    unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
    const ElementCount &MaxSafeVF, bool FoldTailByMasking) {
  bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
  TypeSize WidestRegister = TTI.getRegisterBitWidth(
      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                           : TargetTransformInfo::RGK_FixedWidthVector);

  // Convenience function to return the minimum of two ElementCounts.
  auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
    assert((LHS.isScalable() == RHS.isScalable()) &&
           "Scalable flags must match");
    return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
  };

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be a powers of 2.
  auto MaxVectorElementCount = ElementCount::get(
      PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
      ComputeScalableMaxVF);
  MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << (MaxVectorElementCount * WidestType) << " bits.\n");

  if (!MaxVectorElementCount) {
    LLVM_DEBUG(dbgs() << "LV: The target has no "
                      << (ComputeScalableMaxVF ? "scalable" : "fixed")
                      << " vector registers.\n");
    // VF of 1 means "scalar" (no vectorization of this kind).
    return ElementCount::getFixed(1);
  }

  const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
  if (ConstTripCount &&
      ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
      (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
    // If loop trip count (TC) is known at compile time there is no point in
    // choosing VF greater than TC (as done in the loop below). Select maximum
    // power of two which doesn't exceed TC.
    // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
    // when the TC is less than or equal to the known number of lanes.
    auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
                         "exceeding the constant trip count: "
                      << ClampedConstTripCount << "\n");
    return ElementCount::getFixed(ClampedConstTripCount);
  }

  ElementCount MaxVF = MaxVectorElementCount;
  if (TTI.shouldMaximizeVectorBandwidth() ||
      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
    // The bandwidth-maximizing bound is computed from the *smallest* element
    // type, so more lanes of the narrow elements fit in a register.
    auto MaxVectorElementCountMaxBW = ElementCount::get(
        PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
        ComputeScalableMaxVF);
    MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);

    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorElementCount).
    SmallVector<ElementCount, 8> VFs;
    for (ElementCount VS = MaxVectorElementCount * 2;
         ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
      VFs.push_back(VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones. Iterate from the widest candidate downwards and stop at the first
    // VF whose per-class register pressure fits the target.
    for (int i = RUs.size() - 1; i >= 0; --i) {
      bool Selected = true;
      for (auto &pair : RUs[i].MaxLocalUsers) {
        unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
        if (pair.second > TargetNumRegisters)
          Selected = false;
      }
      if (Selected) {
        MaxVF = VFs[i];
        break;
      }
    }
    // Respect the target's minimum VF, if any.
    if (ElementCount MinVF =
            TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
      if (ElementCount::isKnownLT(MaxVF, MinVF)) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }
  }
  return MaxVF;
}

// Returns true if vectorization factor A is more profitable than B.
// Costs are per-iteration; comparisons are done with cross-multiplication to
// avoid floating-point division.
bool LoopVectorizationCostModel::isMoreProfitable(
    const VectorizationFactor &A, const VectorizationFactor &B) const {
  InstructionCost CostA = A.Cost;
  InstructionCost CostB = B.Cost;

  unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);

  if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
      MaxTripCount) {
    // If we are folding the tail and the trip count is a known (possibly small)
    // constant, the trip count will be rounded up to an integer number of
    // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
    // which we compare directly. When not folding the tail, the total cost will
    // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
    // approximated with the per-lane cost below instead of using the tripcount
    // as here.
    auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
    auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
    return RTCostA < RTCostB;
  }

  // Improve estimate for the vector width if it is scalable.
  unsigned EstimatedWidthA = A.Width.getKnownMinValue();
  unsigned EstimatedWidthB = B.Width.getKnownMinValue();
  // Scale scalable widths by the vscale value the target tunes for, if known.
  if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) {
    if (A.Width.isScalable())
      EstimatedWidthA *= VScale.getValue();
    if (B.Width.isScalable())
      EstimatedWidthB *= VScale.getValue();
  }

  // Assume vscale may be larger than 1 (or the value being tuned for),
  // so that scalable vectorization is slightly favorable over fixed-width
  // vectorization. Note the <= (rather than <) breaks ties in favor of the
  // scalable factor A.
  if (A.Width.isScalable() && !B.Width.isScalable())
    return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);

  // To avoid the need for FP division:
  //      (CostA / A.Width) < (CostB / B.Width)
  // <=>  (CostA * B.Width) < (CostB * A.Width)
  return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
}

// Chooses the most profitable VF from VFCandidates by comparing the expected
// per-iteration cost of each candidate against the scalar loop cost, also
// emitting remarks for candidates whose cost is invalid.
VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
    const ElementCountSet &VFCandidates) {
  InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
  assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
  assert(VFCandidates.count(ElementCount::getFixed(1)) &&
         "Expected Scalar VF to be a candidate");

  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
  VectorizationFactor ChosenFactor = ScalarCost;

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization && VFCandidates.size() > 1) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    ChosenFactor.Cost = InstructionCost::getMax();
  }

  SmallVector<InstructionVFPair> InvalidCosts;
  for (const auto &i : VFCandidates) {
    // The cost for scalar VF=1 is already calculated, so ignore it.
    if (i.isScalar())
      continue;

    VectorizationCostTy C = expectedCost(i, &InvalidCosts);
    VectorizationFactor Candidate(i, C.first);

#ifndef NDEBUG
    // Debug-only reporting of the per-lane cost, estimating vscale when the
    // candidate is scalable.
    unsigned AssumedMinimumVscale = 1;
    if (Optional<unsigned> VScale = TTI.getVScaleForTuning())
      AssumedMinimumVscale = VScale.getValue();
    unsigned Width =
        Candidate.Width.isScalable()
            ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
            : Candidate.Width.getFixedValue();
    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
                      << " costs: " << (Candidate.Cost / Width));
    if (i.isScalable())
      LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
                        << AssumedMinimumVscale << ")");
    LLVM_DEBUG(dbgs() << ".\n");
#endif

    // C.second is false when the plan would emit no vector instructions.
    if (!C.second && !ForceVectorization) {
      LLVM_DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }

    // If profitable add it to ProfitableVF list.
    if (isMoreProfitable(Candidate, ScalarCost))
      ProfitableVFs.push_back(Candidate);

    if (isMoreProfitable(Candidate, ChosenFactor))
      ChosenFactor = Candidate;
  }

  // Emit a report of VFs with invalid costs in the loop.
  if (!InvalidCosts.empty()) {
    // Group the remarks per instruction, keeping the instruction order from
    // InvalidCosts.
    std::map<Instruction *, unsigned> Numbering;
    unsigned I = 0;
    for (auto &Pair : InvalidCosts)
      if (!Numbering.count(Pair.first))
        Numbering[Pair.first] = I++;

    // Sort the list, first on instruction(number) then on VF.
    llvm::sort(InvalidCosts,
               [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
                 if (Numbering[A.first] != Numbering[B.first])
                   return Numbering[A.first] < Numbering[B.first];
                 ElementCountComparator ECC;
                 return ECC(A.second, B.second);
               });

    // For a list of ordered instruction-vf pairs:
    //   [(load, vf1), (load, vf2), (store, vf1)]
    // Group the instructions together to emit separate remarks for:
    //   load  (vf1, vf2)
    //   store (vf1)
    auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
    auto Subset = ArrayRef<InstructionVFPair>();
    do {
      if (Subset.empty())
        Subset = Tail.take_front(1);

      Instruction *I = Subset.front().first;

      // If the next instruction is different, or if there are no other pairs,
      // emit a remark for the collated subset. e.g.
      //   [(load, vf1), (load, vf2))]
      // to emit:
      //  remark: invalid costs for 'load' at VF=(vf, vf2)
      if (Subset == Tail || Tail[Subset.size()].first != I) {
        std::string OutString;
        raw_string_ostream OS(OutString);
        assert(!Subset.empty() && "Unexpected empty range");
        OS << "Instruction with invalid costs prevented vectorization at VF=(";
        for (auto &Pair : Subset)
          OS << (Pair.second == Subset.front().second ? "" : ", ")
             << Pair.second;
        OS << "):";
        if (auto *CI = dyn_cast<CallInst>(I))
          OS << " call to " << CI->getCalledFunction()->getName();
        else
          OS << " " << I->getOpcodeName();
        OS.flush();
        reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
        Tail = Tail.drop_front(Subset.size());
        Subset = {};
      } else
        // Grow the subset by one element
        Subset = Tail.take_front(Subset.size() + 1);
    } while (!Tail.empty());
  }

  // Conditional stores can only be vectorized when enabled; otherwise fall
  // back to the scalar loop.
  if (!EnableCondStoresVectorization && NumPredStores) {
    reportVectorizationFailure("There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, TheLoop);
    ChosenFactor = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
                 ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
  return ChosenFactor;
}

// Returns true if loop L is a supported candidate for epilogue
// vectorization at main-loop factor VF; rejects the cases the epilogue
// vectorizer cannot currently handle.
bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
    const Loop &L, ElementCount VF) const {
  // Cross iteration phis such as reductions need special handling and are
  // currently unsupported.
  if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
        return Legal->isFirstOrderRecurrence(&Phi) ||
               Legal->isReductionVariable(&Phi);
      }))
    return false;

  // Phis with uses outside of the loop require special handling and are
  // currently unsupported.
  for (auto &Entry : Legal->getInductionVars()) {
    // Look for uses of the value of the induction at the last iteration.
    Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
    for (User *U : PostInc->users())
      if (!L.contains(cast<Instruction>(U)))
        return false;
    // Look for uses of penultimate value of the induction.
    for (User *U : Entry.first->users())
      if (!L.contains(cast<Instruction>(U)))
        return false;
  }

  // Induction variables that are widened require special handling that is
  // currently not supported.
  if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
        return !(this->isScalarAfterVectorization(Entry.first, VF) ||
                 this->isProfitableToScalarize(Entry.first, VF));
      }))
    return false;

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
  if (L.getExitingBlock() != L.getLoopLatch())
    return false;

  return true;
}

// Crude profitability heuristic for epilogue vectorization: only worthwhile
// when the target considers interleaving beneficial and the main-loop VF is
// at least EpilogueVectorizationMinVF.
bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    const ElementCount VF) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.
  // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (eg. MVE).
  if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
    return false;
  if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
    return true;
  return false;
}

// Selects the VF for the epilogue (remainder) loop, or Disabled() when
// epilogue vectorization is off, unsupported for this loop, or not
// profitable. Picks the most profitable already-computed VF smaller than the
// main loop's VF that has a VPlan available.
VectorizationFactor
LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
    return Result;
  }

  if (!isScalarEpilogueAllowed()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
                  "allowed.\n";);
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
                  "not a supported candidate.\n";);
    return Result;
  }

  // Command-line override of the epilogue VF, honored only if a plan exists.
  if (EpilogueVectorizationForceVF > 1) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
    ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
    if (LVP.hasPlanWithVF(ForcedEC))
      return {ForcedEC, 0};
    else {
      LLVM_DEBUG(
          dbgs()
          << "LEV: Epilogue vectorization forced factor is not viable.\n";);
      return Result;
    }
  }

  if (TheLoop->getHeader()->getParent()->hasOptSize() ||
      TheLoop->getHeader()->getParent()->hasMinSize()) {
    LLVM_DEBUG(
        dbgs()
        << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
    return Result;
  }

  auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
  if (MainLoopVF.isScalable())
    LLVM_DEBUG(
        dbgs() << "LEV: Epilogue vectorization using scalable vectors not "
                  "yet supported. Converting to fixed-width (VF="
               << FixedMainLoopVF << ") instead\n");

  if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
                         "this loop\n");
    return Result;
  }

  for (auto &NextVF : ProfitableVFs)
    if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) &&
        (Result.Width.getFixedValue() == 1 ||
         isMoreProfitable(NextVF, Result)) &&
        LVP.hasPlanWithVF(NextVF.Width))
      Result = NextVF;

  if (Result != VectorizationFactor::Disabled())
    LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
                      << Result.Width.getFixedValue() << "\n";);
  return Result;
}

// Returns the {smallest, widest} scalar-type widths (in bits) among the
// element types collected for widening. The widest defaults to 8 bits so an
// empty set still yields a usable value; the smallest starts at -1U (max).
std::pair<unsigned, unsigned>
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
  unsigned MinWidth = -1U;
  unsigned MaxWidth = 8;
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
  for (Type *T : ElementTypesInLoop) {
    MinWidth = std::min<unsigned>(
        MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
    MaxWidth = std::max<unsigned>(
        MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
  }
  return {MinWidth, MaxWidth};
}

// Populates ElementTypesInLoop with the element types that would be widened:
// types of loads, stored values, and reduction recurrences.
void LoopVectorizationCostModel::collectElementTypesForWidening() {
  ElementTypesInLoop.clear();
  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Type *T = I.getType();

      // Skip ignored values.
      if (ValuesToIgnore.count(&I))
        continue;

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
        continue;

      // Examine PHI nodes that are reduction variables. Update the type to
      // account for the recurrence type.
      if (auto *PN = dyn_cast<PHINode>(&I)) {
        if (!Legal->isReductionVariable(PN))
          continue;
        const RecurrenceDescriptor &RdxDesc =
            Legal->getReductionVars().find(PN)->second;
        // In-loop reductions are not widened, so their type doesn't matter
        // here.
        if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
            TTI.preferInLoopReduction(RdxDesc.getOpcode(),
                                      RdxDesc.getRecurrenceType(),
                                      TargetTransformInfo::ReductionFlags()))
          continue;
        T = RdxDesc.getRecurrenceType();
      }

      // Examine the stored values.
      if (auto *ST = dyn_cast<StoreInst>(&I))
        T = ST->getValueOperand()->getType();

      // Ignore loaded pointer types and stored pointer types that are not
      // vectorizable.
      //
      // FIXME: The check here attempts to predict whether a load or store will
      //        be vectorized. We only know this for certain after a VF has
      //        been selected. Here, we assume that if an access can be
      //        vectorized, it will be. We should also look at extending this
      //        optimization to non-pointer types.
      //
      if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
          !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
        continue;

      ElementTypesInLoop.insert(T);
    }
  }
}

// Chooses the interleave count (unroll factor) for the vectorized loop at
// factor VF, using target register pressure and trip-count heuristics.
unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
                                                           unsigned LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  // iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  // overhead.
  // 3.
We don't interleave if we think that we will spill registers to memory
  //    due to the increased register pressure.

  // No interleaving when a scalar epilogue is not allowed (e.g. when the
  // tail is folded); bail out with an interleave count of 1.
  if (!isScalarEpilogueAllowed())
    return 1;

  // We used the distance for the interleave count. A bounded max safe
  // dependence distance means the loop already constrains how far apart
  // iterations may execute, so do not interleave.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
  const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions(HasReductions && VF = 1),
  // because with the above conditions interleaving can expose ILP and break
  // cross iteration dependences for reductions.
  if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
      !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
    return 1;

  RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto& pair : R.MaxLocalUsers) {
    pair.second = std::max(pair.second, 1U);
  }

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
  //
  // IC starts at UINT_MAX and is lowered per register class below, so the
  // final value is the minimum over all classes.
  unsigned IC = UINT_MAX;

  for (auto& pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(pair.first) << " register class\n");
    // Command-line overrides for the register budget, split by scalar/vector.
    if (VF.isScalar()) {
      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumScalarRegs;
    } else {
      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
    unsigned MaxLocalUsers = pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
      LoopInvariantRegs = R.LoopInvariantRegs[pair.first];

    // NOTE(review): this subtraction can wrap if LoopInvariantRegs exceeds
    // TargetNumRegisters (unsigned arithmetic) — presumably the register
    // estimator keeps invariants below the budget; confirm.
    unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
    // Don't count the induction variable as interleaved.
    if (EnableIndVarRegisterHeur) {
      TmpIC =
          PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
                        std::max(1U, (MaxLocalUsers - 1)));
    }

    IC = std::min(IC, TmpIC);
  }

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount =
      TTI.getMaxInterleaveFactor(VF.getKnownMinValue());

  // Check if the user has overridden the max.
  if (VF.isScalar()) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // If trip count is known or estimated compile time constant, limit the
  // interleave count to be less than the trip count divided by VF, provided it
  // is at least 1.
  //
  // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second vector
  // iterations is enabled. However, for larger loops, there is likely to be a
  // similar benefit as for fixed-width vectors. For now, we choose to leave
  // the InterleaveCount as if vscale is '1', although if some information about
  // the vector is known (e.g. min vector size), we can make a better decision.
  if (BestKnownTC) {
    MaxInterleaveCount =
        std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
    // Make sure MaxInterleaveCount is greater than 0.
    MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
  }

  assert(MaxInterleaveCount > 0 &&
         "Maximum interleave count must be greater than 0");

  // Clamp the calculated IC to be between the 1 and the max interleave count
  // that the target and trip count allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else
    // Make sure IC is greater than 0.
    IC = std::max(1u, IC);

  assert(IC > 0 && "Interleave count must be greater than 0.");

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0) {
    InstructionCost C = expectedCost(VF).first;
    assert(C.isValid() && "Expected to have chosen a VF with valid cost");
    LoopCost = *C.getValue();
  }

  assert(LoopCost && "Non-zero loop cost expected");

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF.isVector() && HasReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // Note that if we've already vectorized the loop we will have done the
  // runtime check and so interleaving won't require further checks.
  bool InterleavingRequiresRuntimePointerCheck =
      (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
                    << "LV: IC is " << IC << '\n'
                    << "LV: VF is " << VF << '\n');
  const bool AggressivelyInterleaveReductions =
      TTI.enableAggressiveInterleaving(HasReductions);
  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC =
        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // There is little point in interleaving for reductions containing selects
    // and compares when VF=1 since it may just create more overhead than it's
    // worth for loops with small trip counts. This is because we still have to
    // do the final reduction after the loop.
    bool HasSelectCmpReductions =
        HasReductions &&
        any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
          const RecurrenceDescriptor &RdxDesc = Reduction.second;
          return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
              RdxDesc.getRecurrenceKind());
        });
    if (HasSelectCmpReductions) {
      LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
      return 1;
    }

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. For tree-wise reductions
    // set the limit to 2, and for ordered reductions it's best to disable
    // interleaving entirely.
    if (HasReductions && TheLoop->getLoopDepth() > 1) {
      bool HasOrderedReductions =
          any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
            const RecurrenceDescriptor &RdxDesc = Reduction.second;
            return RdxDesc.isOrdered();
          });
      if (HasOrderedReductions) {
        LLVM_DEBUG(
            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
        return 1;
      }

      // Cap all three candidate counts by the nested-reduction limit.
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
        AggressivelyInterleaveReductions) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in a topological order in order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more register.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  // First pass: number the instructions in RPO and record, per operand, the
  // index of its last in-loop use.
  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Interval.first);

  SmallPtrSet<Instruction *, 8> OpenIntervals;
  SmallVector<RegisterUsage, 8> RUs(VFs.size());
  SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  // A lambda that gets the register usage for the given type and VF.
  // TTICapture is a local const reference so the lambda captures it instead
  // of capturing `this`.
  const auto &TTICapture = TTI;
  auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
    // Token and non-vectorizable types occupy no vector registers here.
    if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
      return 0;
    InstructionCost::CostType RegUsage =
        *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
    assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
           "Nonsensical values for register usage.");
    return RegUsage;
  };

  // Second pass: linear scan maintaining the set of open intervals; sample
  // the per-class usage at each instruction for every requested VF.
  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
    Instruction *I = IdxToInstr[i];

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (Instruction *ToRemove : List)
      OpenIntervals.erase(ToRemove);

    // Ignore instructions that are never used within the loop.
    if (!Ends.count(I))
      continue;

    // Skip ignored values.
    if (ValuesToIgnore.count(I))
      continue;

    // For each VF find the maximum usage of registers.
    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
      // Count the number of live intervals.
      SmallMapVector<unsigned, unsigned, 4> RegUsage;

      if (VFs[j].isScalar()) {
        // Scalar VF: one scalar register per open value.
        for (auto Inst : OpenIntervals) {
          unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
          if (RegUsage.find(ClassID) == RegUsage.end())
            RegUsage[ClassID] = 1;
          else
            RegUsage[ClassID] += 1;
        }
      } else {
        collectUniformsAndScalars(VFs[j]);
        for (auto Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.count(Inst))
            continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            // Values that stay scalar consume a single scalar register.
            unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
            if (RegUsage.find(ClassID) == RegUsage.end())
              RegUsage[ClassID] = 1;
            else
              RegUsage[ClassID] += 1;
          } else {
            // Widened values consume whatever TTI reports for the vector type.
            unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
            if (RegUsage.find(ClassID) == RegUsage.end())
              RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
            else
              RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
      }

      // Fold this sample into the running per-class maximum for VF j.
      for (auto& pair : RegUsage) {
        if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
        else
          MaxUsages[j][pair.first] = pair.second;
      }
    }

    LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
                      << OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  // Finally, account for the loop-invariant values used inside the loop.
  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    SmallMapVector<unsigned, unsigned, 4> Invariant;

    for (auto Inst : LoopInvariants) {
      unsigned Usage =
          VFs[i].isScalar() ?
1 : GetRegUsage(Inst->getType(), VFs[i]);
      // Invariants are charged once per register class (they are not
      // replicated by interleaving).
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      if (Invariant.find(ClassID) == Invariant.end())
        Invariant[ClassID] = Usage;
      else
        Invariant[ClassID] += Usage;
    }

    LLVM_DEBUG({
      dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
      dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
             << " item\n";
      for (const auto &pair : MaxUsages[i]) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
             << " item\n";
      for (const auto &pair : Invariant) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
    });

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}

// Returns true if the emulated-masked-memref cost hack should apply to \p I,
// i.e. the cost model should use an artificially high cost for it.
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving "masked load/store" check from legality to cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // Limited number of Masked Store/Scatter emulation was allowed.
  assert(isPredicatedInst(I) &&
         "Expecting a scalar emulated instruction");
  // Loads always get the hack; stores only once the number of predicated
  // stores exceeds the configured threshold.
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}

void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF.isScalar() || VF.isZero() ||
      InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredicationForAnyReason(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount if scalable, because that would lead to
        // invalid scalarization costs.
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
        // A non-negative discount means scalarizing the chain is at least as
        // cheap as vectorizing it, so record those costs.
        if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}

int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (isUniformAfterVectorization(J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.find(I) != ScalarCosts.end())
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    InstructionCost VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    InstructionCost ScalarCost =
        VF.getFixedValue() *
        getInstructionCost(I, ElementCount::getFixed(1)).first;

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
      // Inserting the scalar results back into a vector, plus a phi per lane.
      ScalarCost += TTI.getScalarizationOverhead(
          cast<VectorType>(ToVectorTy(I->getType(), VF)),
          APInt::getAllOnes(VF.getFixedValue()), true, false);
      ScalarCost +=
          VF.getFixedValue() *
          TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type");
        if (canBeScalarized(J))
          Worklist.push_back(J);
        else if (needsExtract(J, VF)) {
          ScalarCost += TTI.getScalarizationOverhead(
              cast<VectorType>(ToVectorTy(J->getType(), VF)),
              APInt::getAllOnes(VF.getFixedValue()), false, true);
        }
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getReciprocalPredBlockProb();

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return *Discount.getValue();
}

LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(
    ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
  VectorizationCostTy Cost;

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    VectorizationCostTy BlockCost;

    // For each instruction in the old loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      // Skip ignored values.
      if (ValuesToIgnore.count(&I) ||
          (VF.isVector() && VecValuesToIgnore.count(&I)))
        continue;

      VectorizationCostTy C = getInstructionCost(&I, VF);

      // Check if we should override the cost.
      if (C.first.isValid() &&
          ForceTargetInstructionCost.getNumOccurrences() > 0)
        C.first = InstructionCost(ForceTargetInstructionCost);

      // Keep a list of instructions with invalid costs.
      if (Invalid && !C.first.isValid())
        Invalid->emplace_back(&I, VF);

      BlockCost.first += C.first;
      BlockCost.second |= C.second;
      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
                        << " for VF " << VF << " For instruction: " << I
                        << '\n');
    }

    // If we are vectorizing a predicated block, it will have been
    // if-converted. This means that the block's instructions (aside from
    // stores and instructions that may divide by zero) will now be
    // unconditionally executed. For the scalar case, we may not always execute
    // the predicated block, if it is an if-else block. Thus, scale the block's
    // cost by the probability of executing it. blockNeedsPredication from
    // Legal is used so as to not include all blocks in tail folded loops.
    if (VF.isScalar() && Legal->blockNeedsPredication(BB))
      BlockCost.first /= getReciprocalPredBlockProb();

    Cost.first += BlockCost.first;
    Cost.second |= BlockCost.second;
  }

  return Cost;
}

/// Gets Address Access SCEV after verifying that the access pattern
/// is loop invariant except the induction variable dependence.
///
/// This SCEV can be sent to the Target in order to estimate the address
/// calculation cost.
6706 static const SCEV *getAddressAccessSCEV( 6707 Value *Ptr, 6708 LoopVectorizationLegality *Legal, 6709 PredicatedScalarEvolution &PSE, 6710 const Loop *TheLoop) { 6711 6712 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6713 if (!Gep) 6714 return nullptr; 6715 6716 // We are looking for a gep with all loop invariant indices except for one 6717 // which should be an induction variable. 6718 auto SE = PSE.getSE(); 6719 unsigned NumOperands = Gep->getNumOperands(); 6720 for (unsigned i = 1; i < NumOperands; ++i) { 6721 Value *Opd = Gep->getOperand(i); 6722 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6723 !Legal->isInductionVariable(Opd)) 6724 return nullptr; 6725 } 6726 6727 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6728 return PSE.getSCEV(Ptr); 6729 } 6730 6731 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6732 return Legal->hasStride(I->getOperand(0)) || 6733 Legal->hasStride(I->getOperand(1)); 6734 } 6735 6736 InstructionCost 6737 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6738 ElementCount VF) { 6739 assert(VF.isVector() && 6740 "Scalarization cost of instruction implies vectorization."); 6741 if (VF.isScalable()) 6742 return InstructionCost::getInvalid(); 6743 6744 Type *ValTy = getLoadStoreType(I); 6745 auto SE = PSE.getSE(); 6746 6747 unsigned AS = getLoadStoreAddressSpace(I); 6748 Value *Ptr = getLoadStorePointerOperand(I); 6749 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6750 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6751 // that it is being called from this specific place. 6752 6753 // Figure out whether the access is strided and get the stride value 6754 // if it's known in compile time 6755 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6756 6757 // Get the cost of the scalar memory instruction and address computation. 
  InstructionCost Cost =
      VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.
  const Align Alignment = getLoadStoreAlignment(I);
  Cost += VF.getKnownMinValue() *
          TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
                              AS, TTI::TCK_RecipThroughput);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF);

  // If we have a predicated load/store, it will need extra i1 extracts and
  // conditional branches, but may not be executed for each vector lane. Scale
  // the cost by the probability of executing the predicated block.
  if (isPredicatedInst(I)) {
    Cost /= getReciprocalPredBlockProb();

    // Add the cost of an i1 extract and a branch
    auto *Vec_i1Ty =
        VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
    Cost += TTI.getScalarizationOverhead(
        Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
        /*Insert=*/false, /*Extract=*/true);
    Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);

    if (useEmulatedMaskMemRefHack(I))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.
      Cost = 3000000;
  }

  return Cost;
}

InstructionCost
LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
                                                    ElementCount VF) {
  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  Value *Ptr = getLoadStorePointerOperand(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
         "Stride should be 1 or -1 for consecutive memory access");
  const Align Alignment = getLoadStoreAlignment(I);
  InstructionCost Cost = 0;
  // A masked access uses the masked-memory-op cost; an unmasked one uses the
  // plain wide load/store cost.
  if (Legal->isMaskRequired(I))
    Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
                                      CostKind);
  else
    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
                                CostKind, I);

  // A stride of -1 additionally requires a vector reverse shuffle.
  bool Reverse = ConsecutiveStride < 0;
  if (Reverse)
    Cost +=
        TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
  return Cost;
}

InstructionCost
LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                ElementCount VF) {
  assert(Legal->isUniformMemOp(*I));

  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  // A uniform load is one scalar load plus a broadcast of the loaded value.
  if (isa<LoadInst>(I)) {
    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
                               CostKind) +
           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
  }
  StoreInst *SI = cast<StoreInst>(I);

  // A uniform store writes one scalar; if the stored value is not loop
  // invariant, the last lane must first be extracted.
  bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
  return
TTI.getAddressComputationCost(ValTy) + 6843 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6844 CostKind) + 6845 (isLoopInvariantStoreValue 6846 ? 0 6847 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6848 VF.getKnownMinValue() - 1)); 6849 } 6850 6851 InstructionCost 6852 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6853 ElementCount VF) { 6854 Type *ValTy = getLoadStoreType(I); 6855 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6856 const Align Alignment = getLoadStoreAlignment(I); 6857 const Value *Ptr = getLoadStorePointerOperand(I); 6858 6859 return TTI.getAddressComputationCost(VectorTy) + 6860 TTI.getGatherScatterOpCost( 6861 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6862 TargetTransformInfo::TCK_RecipThroughput, I); 6863 } 6864 6865 InstructionCost 6866 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6867 ElementCount VF) { 6868 // TODO: Once we have support for interleaving with scalable vectors 6869 // we can calculate the cost properly here. 6870 if (VF.isScalable()) 6871 return InstructionCost::getInvalid(); 6872 6873 Type *ValTy = getLoadStoreType(I); 6874 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6875 unsigned AS = getLoadStoreAddressSpace(I); 6876 6877 auto Group = getInterleavedAccessGroup(I); 6878 assert(Group && "Fail to get an interleaved access group."); 6879 6880 unsigned InterleaveFactor = Group->getFactor(); 6881 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6882 6883 // Holds the indices of existing members in the interleaved group. 6884 SmallVector<unsigned, 4> Indices; 6885 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6886 if (Group->getMember(IF)) 6887 Indices.push_back(IF); 6888 6889 // Calculate the cost of the whole interleaved group. 
  // Gaps in the group need masking when the group has missing members and a
  // scalar epilogue is not available (loads), or for stores with gaps.
  bool UseMaskForGaps =
      (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
      AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    // Each member needs its own reverse shuffle after the wide access.
    Cost +=
        Group->getNumMembers() *
        TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
  }
  return Cost;
}

// Try to cost \p I as part of an in-loop reduction pattern (e.g. a fused
// multiply-accumulate reduction). Returns None if \p I is not part of such a
// pattern and the generic per-instruction costing should be used; otherwise
// returns the pattern cost for the pattern root and 0 for the other members.
Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
    Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
  using namespace llvm::PatternMatch;
  // Early exit for no in-loop reductions.
  if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
    return None;
  auto *VectorTy = cast<VectorType>(Ty);

  // We are looking for a pattern of, and finding the minimal acceptable cost:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower then we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not we return an invalid cost specifying the original cost method
  // should be used.
  Instruction *RetI = I;
  if (match(RetI, m_ZExtOrSExt(m_Value()))) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }
  // NOTE(review): user_back() is dereferenced before the hasOneUser() check
  // below; this assumes a matched mul always has at least one user — verify.
  if (match(RetI, m_Mul(m_Value(), m_Value())) &&
      RetI->user_back()->getOpcode() == Instruction::Add) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }

  // Test if the found instruction is a reduction, and if not return an invalid
  // cost specifying the parent to use the original cost modelling.
  if (!InLoopReductionImmediateChains.count(RetI))
    return None;

  // Find the reduction this chain is a part of and calculate the basic cost of
  // the reduction on its own.
  Instruction *LastChain = InLoopReductionImmediateChains[RetI];
  Instruction *ReductionPhi = LastChain;
  // Walk the immediate-chain map back to the reduction's header PHI.
  while (!isa<PHINode>(ReductionPhi))
    ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];

  const RecurrenceDescriptor &RdxDesc =
      Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;

  InstructionCost BaseCost = TTI.getArithmeticReductionCost(
      RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);

  // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
  // normal fmul instruction to the cost of the fadd reduction.
  if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
    BaseCost +=
        TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);

  // If we're using ordered reductions then we can just return the base cost
  // here, since getArithmeticReductionCost calculates the full ordered
  // reduction cost when FP reassociation is not allowed.
  if (useOrderedReductions(RdxDesc))
    return BaseCost;

  // Get the operand that was not the reduction chain and match it to one of the
  // patterns, returning the better cost if it is found.
  Instruction *RedOp = RetI->getOperand(1) == LastChain
                           ? dyn_cast<Instruction>(RetI->getOperand(0))
                           : dyn_cast<Instruction>(RetI->getOperand(1));

  VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);

  Instruction *Op0, *Op1;
  if (RedOp &&
      match(RedOp,
            m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
      match(Op0, m_ZExtOrSExt(m_Value())) &&
      Op0->getOpcode() == Op1->getOpcode() &&
      Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
      !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {

    // Matched reduce(ext(mul(ext(A), ext(B)))
    // Note that the extend opcodes need to all match, or if A==B they will have
    // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
    // which is equally fine.
    bool IsUnsigned = isa<ZExtInst>(Op0);
    auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
    auto *MulType = VectorType::get(Op0->getType(), VectorTy);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
                             TTI::CastContextHint::None, CostKind, Op0);
    InstructionCost MulCost =
        TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
    InstructionCost Ext2Cost =
        TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
                             TTI::CastContextHint::None, CostKind, RedOp);

    InstructionCost RedCost = TTI.getExtendedAddReductionCost(
        /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
        CostKind);

    // Use the fused cost only when it beats summing the components.
    if (RedCost.isValid() &&
        RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
             !TheLoop->isLoopInvariant(RedOp)) {
    // Matched reduce(ext(A))
    bool IsUnsigned = isa<ZExtInst>(RedOp);
    auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
    InstructionCost RedCost = TTI.getExtendedAddReductionCost(
        /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
        CostKind);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
                             TTI::CastContextHint::None, CostKind, RedOp);
    if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp &&
             match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
    if (match(Op0, m_ZExtOrSExt(m_Value())) &&
        Op0->getOpcode() == Op1->getOpcode() &&
        !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
      bool IsUnsigned = isa<ZExtInst>(Op0);
      Type *Op0Ty = Op0->getOperand(0)->getType();
      Type *Op1Ty = Op1->getOperand(0)->getType();
      Type *LargestOpTy =
          Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
                                                                    : Op0Ty;
      auto *ExtType = VectorType::get(LargestOpTy, VectorTy);

      // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
      // different sizes. We take the largest type as the ext to reduce, and add
      // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
      InstructionCost ExtCost0 = TTI.getCastInstrCost(
          Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
          TTI::CastContextHint::None, CostKind, Op0);
      InstructionCost ExtCost1 = TTI.getCastInstrCost(
          Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
          TTI::CastContextHint::None, CostKind, Op1);
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

      InstructionCost RedCost = TTI.getExtendedAddReductionCost(
          /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
          CostKind);
      InstructionCost ExtraExtCost = 0;
      if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
        Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
        ExtraExtCost = TTI.getCastInstrCost(
            ExtraExtOp->getOpcode(), ExtType,
            VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
            TTI::CastContextHint::None, CostKind, ExtraExtOp);
      }

      if (RedCost.isValid() &&
          (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
        return I == RetI ? RedCost : 0;
    } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
      // Matched reduce(mul())
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

      InstructionCost RedCost = TTI.getExtendedAddReductionCost(
          /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
          CostKind);

      if (RedCost.isValid() && RedCost < MulCost + BaseCost)
        return I == RetI ? RedCost : 0;
    }
  }

  // Only the pattern root reports the base reduction cost; other members of
  // the chain fall back to the default costing (None).
  return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
}

// Return the cost of a load/store: the scalar cost for VF==1, otherwise the
// cost recorded by the widening decision made in setCostBasedWideningDecision.
InstructionCost
LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
                                                     ElementCount VF) {
  // Calculate scalar cost only. Vectorization cost should be ready at this
  // moment.
  if (VF.isScalar()) {
    Type *ValTy = getLoadStoreType(I);
    const Align Alignment = getLoadStoreAlignment(I);
    unsigned AS = getLoadStoreAddressSpace(I);

    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
                               TTI::TCK_RecipThroughput, I);
  }
  return getWideningCost(I, VF);
}

// Public per-instruction cost entry point. Returns the cost plus a flag
// telling the caller whether the vector type was kept whole (not split into
// more parts than lanes) by the target.
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                               ElementCount VF) {
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (isUniformAfterVectorization(I, VF))
    VF = ElementCount::getFixed(1);

  if (VF.isVector() && isProfitableToScalarize(I, VF))
    return VectorizationCostTy(InstsToScalarize[VF][I], false);

  // Forced scalars do not have any scalarization overhead.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
    auto InstSet = ForcedScalar->second;
    if (InstSet.count(I))
      // Cost is the scalar cost replicated once per (known-min) lane.
      return VectorizationCostTy(
          (getInstructionCost(I, ElementCount::getFixed(1)).first *
           VF.getKnownMinValue()),
          false);
  }

  Type *VectorTy;
  InstructionCost C = getInstructionCost(I, VF, VectorTy);

  // If the target cannot report a legalization part count for this type,
  // treat the cost as invalid for this VF.
  bool TypeNotScalarized = false;
  if (VF.isVector() && VectorTy->isVectorTy()) {
    unsigned NumParts = TTI.getNumberOfParts(VectorTy);
    if (NumParts)
      TypeNotScalarized = NumParts < VF.getKnownMinValue();
    else
      C = InstructionCost::getInvalid();
  }
  return VectorizationCostTy(C, TypeNotScalarized);
}

// Cost of the insert/extract element traffic incurred when scalarizing \p I:
// inserts to rebuild the result vector and extracts to feed scalar operands.
InstructionCost
LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
                                                     ElementCount VF) const {

  // There is no mechanism yet to create a scalable scalarization loop,
  // so this is currently Invalid.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  if (VF.isScalar())
    return 0;

  InstructionCost Cost = 0;
  Type *RetTy = ToVectorTy(I->getType(), VF);
  if (!RetTy->isVoidTy() &&
      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
    // Inserts needed to assemble the scalar results back into a vector.
    Cost += TTI.getScalarizationOverhead(
        cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
        false);

  // Some targets keep addresses scalar.
  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
    return Cost;

  // Some targets support efficient element stores.
  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
    return Cost;

  // Collect operands to consider.
  CallInst *CI = dyn_cast<CallInst>(I);
  Instruction::op_range Ops = CI ? CI->args() : I->operands();

  // Skip operands that do not require extraction/scalarization and do not incur
  // any overhead.
  SmallVector<Type *> Tys;
  for (auto *V : filterExtractingOperands(Ops, VF))
    Tys.push_back(MaybeVectorizeType(V->getType(), VF));
  return Cost + TTI.getOperandsScalarizationOverhead(
                    filterExtractingOperands(Ops, VF), Tys);
}

// For every load/store in the loop, pick the cheapest widening strategy
// (widen, widen-reverse, interleave, gather/scatter, or scalarize) at \p VF
// and record it so later costing/codegen queries agree.
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
  if (VF.isScalar())
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      Value *Ptr =  getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
        NumPredStores++;

      if (Legal->isUniformMemOp(I)) {
        // TODO: Avoid replicating loads and stores instead of
        // relying on instcombine to remove them.
        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        InstructionCost Cost;
        if (isa<StoreInst>(&I) && VF.isScalable() &&
            isLegalGatherOrScatter(&I)) {
          // Scalable VFs cannot scalarize a uniform store, so fall back to
          // a scatter when the target supports it.
          Cost = getGatherScatterCost(&I, VF);
          setWideningDecision(&I, VF, CM_GatherScatter, Cost);
        } else {
          assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
                 "Cannot yet scalarize uniform stores");
          Cost = getUniformMemOpCost(&I, VF);
          setWideningDecision(&I, VF, CM_Scalarize, Cost);
        }
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(&I, VF)) {
        InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride = Legal->isConsecutivePtr(
            getLoadStoreType(&I), getLoadStorePointerOperand(&I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(&I, VF, Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      InstructionCost InterleaveCost = InstructionCost::getInvalid();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(&I, VF))
          InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(&I)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : InstructionCost::getInvalid();

      InstructionCost ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instructions belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  append_range(Worklist, AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled in
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, CM_Scalarize,
            (VF.getKnownMinValue() *
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(
                Member, VF, CM_Scalarize,
                (VF.getKnownMinValue() *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
7334 ForcedScalars[VF].insert(I); 7335 } 7336 } 7337 7338 InstructionCost 7339 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7340 Type *&VectorTy) { 7341 Type *RetTy = I->getType(); 7342 if (canTruncateToMinimalBitwidth(I, VF)) 7343 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7344 auto SE = PSE.getSE(); 7345 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7346 7347 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 7348 ElementCount VF) -> bool { 7349 if (VF.isScalar()) 7350 return true; 7351 7352 auto Scalarized = InstsToScalarize.find(VF); 7353 assert(Scalarized != InstsToScalarize.end() && 7354 "VF not yet analyzed for scalarization profitability"); 7355 return !Scalarized->second.count(I) && 7356 llvm::all_of(I->users(), [&](User *U) { 7357 auto *UI = cast<Instruction>(U); 7358 return !Scalarized->second.count(UI); 7359 }); 7360 }; 7361 (void) hasSingleCopyAfterVectorization; 7362 7363 if (isScalarAfterVectorization(I, VF)) { 7364 // With the exception of GEPs and PHIs, after scalarization there should 7365 // only be one copy of the instruction generated in the loop. This is 7366 // because the VF is either 1, or any instructions that need scalarizing 7367 // have already been dealt with by the the time we get here. As a result, 7368 // it means we don't have to multiply the instruction cost by VF. 7369 assert(I->getOpcode() == Instruction::GetElementPtr || 7370 I->getOpcode() == Instruction::PHI || 7371 (I->getOpcode() == Instruction::BitCast && 7372 I->getType()->isPointerTy()) || 7373 hasSingleCopyAfterVectorization(I, VF)); 7374 VectorTy = RetTy; 7375 } else 7376 VectorTy = ToVectorTy(RetTy, VF); 7377 7378 // TODO: We need to estimate the cost of intrinsic calls. 
7379 switch (I->getOpcode()) { 7380 case Instruction::GetElementPtr: 7381 // We mark this instruction as zero-cost because the cost of GEPs in 7382 // vectorized code depends on whether the corresponding memory instruction 7383 // is scalarized or not. Therefore, we handle GEPs with the memory 7384 // instruction cost. 7385 return 0; 7386 case Instruction::Br: { 7387 // In cases of scalarized and predicated instructions, there will be VF 7388 // predicated blocks in the vectorized loop. Each branch around these 7389 // blocks requires also an extract of its vector compare i1 element. 7390 bool ScalarPredicatedBB = false; 7391 BranchInst *BI = cast<BranchInst>(I); 7392 if (VF.isVector() && BI->isConditional() && 7393 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7394 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7395 ScalarPredicatedBB = true; 7396 7397 if (ScalarPredicatedBB) { 7398 // Not possible to scalarize scalable vector with predicated instructions. 7399 if (VF.isScalable()) 7400 return InstructionCost::getInvalid(); 7401 // Return cost for branches around scalarized and predicated blocks. 7402 auto *Vec_i1Ty = 7403 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7404 return ( 7405 TTI.getScalarizationOverhead( 7406 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + 7407 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7408 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7409 // The back-edge branch will remain, as will all scalar branches. 7410 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7411 else 7412 // This branch will be eliminated by if-conversion. 7413 return 0; 7414 // Note: We currently assume zero cost for an unconditional branch inside 7415 // a predicated block since it will become a fall-through, although we 7416 // may decide in the future to call TTI for all branches. 
7417 } 7418 case Instruction::PHI: { 7419 auto *Phi = cast<PHINode>(I); 7420 7421 // First-order recurrences are replaced by vector shuffles inside the loop. 7422 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7423 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7424 return TTI.getShuffleCost( 7425 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7426 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7427 7428 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7429 // converted into select instructions. We require N - 1 selects per phi 7430 // node, where N is the number of incoming values. 7431 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7432 return (Phi->getNumIncomingValues() - 1) * 7433 TTI.getCmpSelInstrCost( 7434 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7435 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7436 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7437 7438 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7439 } 7440 case Instruction::UDiv: 7441 case Instruction::SDiv: 7442 case Instruction::URem: 7443 case Instruction::SRem: 7444 // If we have a predicated instruction, it may not be executed for each 7445 // vector lane. Get the scalarization cost and scale this amount by the 7446 // probability of executing the predicated block. If the instruction is not 7447 // predicated, we fall through to the next case. 7448 if (VF.isVector() && isScalarWithPredication(I)) { 7449 InstructionCost Cost = 0; 7450 7451 // These instructions have a non-void type, so account for the phi nodes 7452 // that we will create. This cost is likely to be zero. The phi node 7453 // cost, if any, should be scaled by the block probability because it 7454 // models a copy at the end of each predicated block. 
7455 Cost += VF.getKnownMinValue() * 7456 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7457 7458 // The cost of the non-predicated instruction. 7459 Cost += VF.getKnownMinValue() * 7460 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7461 7462 // The cost of insertelement and extractelement instructions needed for 7463 // scalarization. 7464 Cost += getScalarizationOverhead(I, VF); 7465 7466 // Scale the cost by the probability of executing the predicated blocks. 7467 // This assumes the predicated block for each vector lane is equally 7468 // likely. 7469 return Cost / getReciprocalPredBlockProb(); 7470 } 7471 LLVM_FALLTHROUGH; 7472 case Instruction::Add: 7473 case Instruction::FAdd: 7474 case Instruction::Sub: 7475 case Instruction::FSub: 7476 case Instruction::Mul: 7477 case Instruction::FMul: 7478 case Instruction::FDiv: 7479 case Instruction::FRem: 7480 case Instruction::Shl: 7481 case Instruction::LShr: 7482 case Instruction::AShr: 7483 case Instruction::And: 7484 case Instruction::Or: 7485 case Instruction::Xor: { 7486 // Since we will replace the stride by 1 the multiplication should go away. 7487 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7488 return 0; 7489 7490 // Detect reduction patterns 7491 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7492 return *RedCost; 7493 7494 // Certain instructions can be cheaper to vectorize if they have a constant 7495 // second vector operand. One example of this are shifts on x86. 
7496 Value *Op2 = I->getOperand(1); 7497 TargetTransformInfo::OperandValueProperties Op2VP; 7498 TargetTransformInfo::OperandValueKind Op2VK = 7499 TTI.getOperandInfo(Op2, Op2VP); 7500 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7501 Op2VK = TargetTransformInfo::OK_UniformValue; 7502 7503 SmallVector<const Value *, 4> Operands(I->operand_values()); 7504 return TTI.getArithmeticInstrCost( 7505 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7506 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7507 } 7508 case Instruction::FNeg: { 7509 return TTI.getArithmeticInstrCost( 7510 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7511 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7512 TargetTransformInfo::OP_None, I->getOperand(0), I); 7513 } 7514 case Instruction::Select: { 7515 SelectInst *SI = cast<SelectInst>(I); 7516 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7517 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7518 7519 const Value *Op0, *Op1; 7520 using namespace llvm::PatternMatch; 7521 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7522 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7523 // select x, y, false --> x & y 7524 // select x, true, y --> x | y 7525 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7526 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7527 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7528 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7529 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7530 Op1->getType()->getScalarSizeInBits() == 1); 7531 7532 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7533 return TTI.getArithmeticInstrCost( 7534 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7535 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7536 } 7537 7538 Type *CondTy = SI->getCondition()->getType(); 7539 if (!ScalarCond) 7540 CondTy = VectorType::get(CondTy, VF); 7541 7542 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7543 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7544 Pred = Cmp->getPredicate(); 7545 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7546 CostKind, I); 7547 } 7548 case Instruction::ICmp: 7549 case Instruction::FCmp: { 7550 Type *ValTy = I->getOperand(0)->getType(); 7551 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7552 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7553 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7554 VectorTy = ToVectorTy(ValTy, VF); 7555 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7556 cast<CmpInst>(I)->getPredicate(), CostKind, 7557 I); 7558 } 7559 case Instruction::Store: 7560 case Instruction::Load: { 7561 ElementCount Width = VF; 7562 if (Width.isVector()) { 7563 InstWidening Decision = getWideningDecision(I, Width); 7564 assert(Decision != CM_Unknown && 7565 "CM decision should be taken at this point"); 7566 if (Decision == CM_Scalarize) 7567 Width = ElementCount::getFixed(1); 7568 } 7569 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7570 return getMemoryInstructionCost(I, VF); 7571 } 7572 case Instruction::BitCast: 7573 if (I->getType()->isPointerTy()) 7574 return 0; 7575 LLVM_FALLTHROUGH; 7576 case Instruction::ZExt: 7577 case Instruction::SExt: 7578 case Instruction::FPToUI: 7579 case Instruction::FPToSI: 7580 case Instruction::FPExt: 7581 case Instruction::PtrToInt: 7582 case Instruction::IntToPtr: 7583 case Instruction::SIToFP: 7584 case Instruction::UIToFP: 7585 case Instruction::Trunc: 7586 case Instruction::FPTrunc: { 7587 // Computes the CastContextHint from a Load/Store instruction. 
7588 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7589 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7590 "Expected a load or a store!"); 7591 7592 if (VF.isScalar() || !TheLoop->contains(I)) 7593 return TTI::CastContextHint::Normal; 7594 7595 switch (getWideningDecision(I, VF)) { 7596 case LoopVectorizationCostModel::CM_GatherScatter: 7597 return TTI::CastContextHint::GatherScatter; 7598 case LoopVectorizationCostModel::CM_Interleave: 7599 return TTI::CastContextHint::Interleave; 7600 case LoopVectorizationCostModel::CM_Scalarize: 7601 case LoopVectorizationCostModel::CM_Widen: 7602 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7603 : TTI::CastContextHint::Normal; 7604 case LoopVectorizationCostModel::CM_Widen_Reverse: 7605 return TTI::CastContextHint::Reversed; 7606 case LoopVectorizationCostModel::CM_Unknown: 7607 llvm_unreachable("Instr did not go through cost modelling?"); 7608 } 7609 7610 llvm_unreachable("Unhandled case!"); 7611 }; 7612 7613 unsigned Opcode = I->getOpcode(); 7614 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7615 // For Trunc, the context is the only user, which must be a StoreInst. 7616 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7617 if (I->hasOneUse()) 7618 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7619 CCH = ComputeCCH(Store); 7620 } 7621 // For Z/Sext, the context is the operand, which must be a LoadInst. 7622 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7623 Opcode == Instruction::FPExt) { 7624 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7625 CCH = ComputeCCH(Load); 7626 } 7627 7628 // We optimize the truncation of induction variables having constant 7629 // integer steps. The cost of these truncations is the same as the scalar 7630 // operation. 
7631 if (isOptimizableIVTruncate(I, VF)) { 7632 auto *Trunc = cast<TruncInst>(I); 7633 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7634 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7635 } 7636 7637 // Detect reduction patterns 7638 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7639 return *RedCost; 7640 7641 Type *SrcScalarTy = I->getOperand(0)->getType(); 7642 Type *SrcVecTy = 7643 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7644 if (canTruncateToMinimalBitwidth(I, VF)) { 7645 // This cast is going to be shrunk. This may remove the cast or it might 7646 // turn it into slightly different cast. For example, if MinBW == 16, 7647 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7648 // 7649 // Calculate the modified src and dest types. 7650 Type *MinVecTy = VectorTy; 7651 if (Opcode == Instruction::Trunc) { 7652 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7653 VectorTy = 7654 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7655 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7656 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7657 VectorTy = 7658 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7659 } 7660 } 7661 7662 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7663 } 7664 case Instruction::Call: { 7665 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7666 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7667 return *RedCost; 7668 bool NeedToScalarize; 7669 CallInst *CI = cast<CallInst>(I); 7670 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7671 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7672 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7673 return std::min(CallCost, IntrinsicCost); 7674 } 7675 return CallCost; 7676 } 7677 case Instruction::ExtractValue: 7678 return TTI.getInstructionCost(I, 
                                  TTI::TCK_RecipThroughput);
  case Instruction::Alloca:
    // We cannot easily widen alloca to a scalable alloca, as
    // the result would need to be a vector of pointers.
    if (VF.isScalable())
      return InstructionCost::getInvalid();
    LLVM_FALLTHROUGH;
  default:
    // This opcode is unknown. Assume that it is the same as 'mul'.
    return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
  } // end of switch.
}

char LoopVectorize::ID = 0;

static const char lv_name[] = "Loop Vectorization";

// Legacy pass-manager registration: declare the analyses the loop vectorizer
// depends on so the pass manager schedules them before this pass runs.
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)

namespace llvm {

/// Create a loop vectorizer pass with default forcing flags.
Pass *createLoopVectorizePass() { return new LoopVectorize(); }

/// Create a loop vectorizer pass, optionally restricting interleaving and/or
/// vectorization to loops where the user explicitly requested them.
Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
                              bool VectorizeOnlyWhenForced) {
  return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
}

} // end namespace llvm

bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
  // Check if the pointer operand of a
load or store instruction is
  // consecutive.
  if (auto *Ptr = getLoadStorePointerOperand(Inst))
    return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
  return false;
}

/// Collect values the cost model should not charge for: ephemeral values and
/// the helper cast instructions recorded during reduction/induction
/// recognition.
void LoopVectorizationCostModel::collectValuesToIgnore() {
  // Ignore ephemeral values.
  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);

  // Ignore type-promoting instructions we identified during reduction
  // detection.
  for (auto &Reduction : Legal->getReductionVars()) {
    const RecurrenceDescriptor &RedDes = Reduction.second;
    const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
  // Ignore type-casting instructions we identified during induction
  // detection.
  for (auto &Induction : Legal->getInductionVars()) {
    const InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
}

/// Record which reduction phis should be generated "in-loop" (as a chain of
/// operations inside the loop body), based on target preference or ordered
/// reduction requirements, and remember each chain for cost modelling.
void LoopVectorizationCostModel::collectInLoopReductions() {
  for (auto &Reduction : Legal->getReductionVars()) {
    PHINode *Phi = Reduction.first;
    const RecurrenceDescriptor &RdxDesc = Reduction.second;

    // We don't collect reductions that are type promoted (yet).
    if (RdxDesc.getRecurrenceType() != Phi->getType())
      continue;

    // If the target would prefer this reduction to happen "in-loop", then we
    // want to record it as such.
    unsigned Opcode = RdxDesc.getOpcode();
    if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
        !TTI.preferInLoopReduction(Opcode, Phi->getType(),
                                   TargetTransformInfo::ReductionFlags()))
      continue;

    // Check that we can correctly put the reductions into the loop, by
    // finding the chain of operations that leads from the phi to the loop
    // exit value.
    SmallVector<Instruction *, 4> ReductionOperations =
        RdxDesc.getReductionOpChain(Phi, TheLoop);
    bool InLoop = !ReductionOperations.empty();
    if (InLoop) {
      InLoopReductionChains[Phi] = ReductionOperations;
      // Add the elements to InLoopReductionImmediateChains for cost modelling.
      Instruction *LastChain = Phi;
      for (auto *I : ReductionOperations) {
        InLoopReductionImmediateChains[I] = LastChain;
        LastChain = I;
      }
    }
    LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
                      << " reduction for phi: " << *Phi << "\n");
  }
}

// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  // Derive a VF from the widest vector register divided by the widest scalar
  // type used in the loop.
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}

/// Plan vectorization on the VPlan-native path (outer loops only). Builds
/// VPlans for the chosen VF; returns Disabled() for inner loops or when the
/// VPlan-build stress test bails out early.
VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  assert(!UserVF.isScalable() && "scalable vectors not yet supported");
  ElementCount VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = ElementCount::getFixed(determineVPlanVF(
          TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedSize(),
          CM));
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(4);
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

/// Top-level planning entry point for inner loops: compute the maximum
/// feasible fixed/scalable VFs, build VPlans for all candidate VFs, and pick
/// the most profitable factor. Returns None when the loop should be neither
/// vectorized nor interleaved, and Disabled() when vectorization is rejected
/// (no vector factors, or too many runtime memory checks).
Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
    return None;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  // Honor a user-provided VF if it does not exceed the computed maximum of
  // the matching (fixed/scalable) flavor.
  ElementCount MaxUserVF =
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
  if (!UserVF.isZero() && UserVFIsLegal) {
    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (CM.selectUserVectorizationFactor(UserVF)) {
      LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
      CM.collectInLoopReductions();
      buildVPlansWithVPRecipes(UserVF, UserVF);
      LLVM_DEBUG(printPlans(dbgs()));
      return {{UserVF, 0}};
    } else
      reportVectorizationInfo("UserVF ignored because of invalid costs.",
                              "InvalidCost", ORE, OrigLoop);
  }

  // Populate the set of Vectorization Factor Candidates.
  ElementCountSet VFCandidates;
  for (auto VF = ElementCount::getFixed(1);
       ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
    VFCandidates.insert(VF);
  for (auto VF = ElementCount::getScalable(1);
       ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
    VFCandidates.insert(VF);

  for (const auto &VF : VFCandidates) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF.isVector())
      CM.collectInstsToScalarize(VF);
  }

  CM.collectInLoopReductions();
  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);

  LLVM_DEBUG(printPlans(dbgs()));
  if (!MaxFactors.hasVector())
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
  auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);

  // Check if it is profitable to vectorize with runtime checks.
  unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
  if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
    bool PragmaThresholdReached =
        NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
    bool ThresholdReached =
        NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
    if ((ThresholdReached && !Hints.allowReordering()) ||
        PragmaThresholdReached) {
      ORE->emit([&]() {
        return OptimizationRemarkAnalysisAliasing(
                   DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
                   OrigLoop->getHeader())
               << "loop not vectorized: cannot prove it is safe to reorder "
                  "memory operations";
      });
      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
      Hints.emitRemarkWithHints();
      return VectorizationFactor::Disabled();
    }
  }
  return SelectedVF;
}

/// Return the unique VPlan that covers \p VF; asserts exactly one exists.
VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
  assert(count_if(VPlans,
                  [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
             1 &&
         "Best VF has not a single VPlan.");

  for (const VPlanPtr &Plan : VPlans) {
    if (Plan->hasVF(VF))
      return *Plan.get();
  }

  llvm_unreachable("No plan found!");
}

/// Generate code for \p BestVPlan with VF \p BestVF and UF \p BestUF:
/// create the vector loop skeleton, execute the plan to widen instructions,
/// and fix up the resulting code.
void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
                                           VPlan &BestVPlan,
                                           InnerLoopVectorizer &ILV,
                                           DominatorTree *DT) {
  LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
                    << '\n');

  // Perform the actual loop transformation.

  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
  VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
  State.TripCount = ILV.getOrCreateTripCount(nullptr);
  State.CanonicalIV = ILV.Induction;
  ILV.collectPoisonGeneratingRecipes(State);

  ILV.printDebugTracesAtStart();

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that go
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  BestVPlan.execute(&State);

  // 3. Fix the vectorized code: take care of header phi's, live-outs,
  // predication, updating analyses.
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Debug helper: print all built VPlans, in DOT format when requested.
void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
  for (const auto &Plan : VPlans)
    if (PrintVPlansInDotFormat)
      Plan->printDOT(O);
    else
      Plan->print(O);
}
#endif

/// Collect instructions of the original loop that become dead after
/// vectorization: exit-condition compares (and their single-use trunc
/// operands) and induction updates whose remaining users are all dead.
void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {

  // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if it's only used by the
  // terminator
  SmallVector<BasicBlock*> ExitingBlocks;
  OrigLoop->getExitingBlocks(ExitingBlocks);
  for (auto *BB : ExitingBlocks) {
    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
    if (!Cmp || !Cmp->hasOneUse())
      continue;

    // TODO: we should introduce a getUniqueExitingBlocks on Loop
    if (!DeadInstructions.insert(Cmp).second)
      continue;

    // The operands of the icmp is often a dead trunc, used by IndUpdate.
    // TODO: can recurse through operands in general
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
    }
  }

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  auto *Latch = OrigLoop->getLoopLatch();
  for (auto &Induction : Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If the tail is to be folded by masking, the primary induction variable,
    // if exists, isn't dead: it will be used for masking. Don't kill it.
    if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
      continue;

    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
        }))
      DeadInstructions.insert(IndUpdate);
  }
}

// For the unroller (VF == 1) a "vector" value is just the scalar value, so
// reversing and broadcasting are identity operations.
Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

/// Attach "llvm.loop.unroll.runtime.disable" metadata to \p L, unless some
/// "llvm.loop.unroll.disable"-prefixed metadata is already present.
static void AddRuntimeUnrollDisableMetaData(Loop *L) {
  SmallVector<Metadata *, 4> MDs;
  // Reserve first location for self reference to the LoopID metadata node.
  MDs.push_back(nullptr);
  bool IsUnrollMetadata = false;
  MDNode *LoopID = L->getLoopID();
  if (LoopID) {
    // First find existing loop unrolling disable metadata.
    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
      auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
      if (MD) {
        const auto *S = dyn_cast<MDString>(MD->getOperand(0));
        IsUnrollMetadata =
            S && S->getString().startswith("llvm.loop.unroll.disable");
      }
      MDs.push_back(LoopID->getOperand(i));
    }
  }

  if (!IsUnrollMetadata) {
    // Add runtime unroll disable metadata.
    LLVMContext &Context = L->getHeader()->getContext();
    SmallVector<Metadata *, 1> DisableOperands;
    DisableOperands.push_back(
        MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
    MDNode *DisableNode = MDNode::get(Context, DisableOperands);
    MDs.push_back(DisableNode);
    MDNode *NewLoopID = MDNode::get(Context, MDs);
    // Set operand 0 to refer to the loop id itself.
    NewLoopID->replaceOperandWith(0, NewLoopID);
    L->setLoopID(NewLoopID);
  }
}

//===--------------------------------------------------------------------===//
// EpilogueVectorizerMainLoop
//===--------------------------------------------------------------------===//

/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
  MDNode *OrigLoopID = OrigLoop->getLoopID();
  Loop *Lp = createVectorLoopSkeleton("");

  // Generate the code to check the minimum iteration count of the vector
  // epilogue (see below).
  EPI.EpilogueIterationCountCheck =
      emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
  EPI.EpilogueIterationCountCheck->setName("iter.check");

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);

  // Generate the code that checks at runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

  // Generate the iteration count check for the main loop, *after* the check
  // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer-path length for
  // the main loop is compensated for, by the gain from vectorizing the larger
  // trip count. Note: the branch will get updated later on when we vectorize
  // the epilogue.
  EPI.MainLoopIterationCountCheck =
      emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);

  // Generate the induction variable.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();
  Value *StartIdx = ConstantInt::get(IdxTy, 0);

  IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt());
  Value *Step = getRuntimeVF(B, IdxTy, VF * UF);
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  // Remember the vector trip count for the epilogue (second) pass.
  EPI.VectorTripCount = CountRoundDown;
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Skip induction resume value creation here because they will be created in
  // the second pass. If we created them here, they wouldn't be used anyway,
  // because the vplan in the second pass still contains the inductions from the
  // original loop.

  return completeLoopSkeleton(Lp, OrigLoopID);
}

/// Debug trace emitted before the first (main-loop) pass of epilogue
/// vectorization.
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
  LLVM_DEBUG({
    dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
           << "Main Loop VF:" << EPI.MainLoopVF
           << ", Main Loop UF:" << EPI.MainLoopUF
           << ", Epilogue Loop VF:" << EPI.EpilogueVF
           << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
  });
}

void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
  DEBUG_WITH_TYPE(VerboseDebug, {
    dbgs() << "intermediate fn:\n"
           << *OrigLoop->getHeader()->getParent() << "\n";
  });
}

/// Emit a minimum-iteration-count check, for the vector epilogue when
/// \p ForEpilogue is true and for the main vector loop otherwise, branching
/// to \p Bypass when too few iterations remain. Returns the check block.
BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
    Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
  assert(L && "Expected valid Loop.");
  assert(Bypass && "Expected valid bypass basic block.");
  ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
  unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
  Value *Count = getOrCreateTripCount(L);
  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF of the
  // main vector loop.
  // ULE (rather than ULT) when a scalar epilogue is required: the vector body
  // must leave at least one iteration for the scalar remainder.
  auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
      ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;

  Value *CheckMinIters = Builder.CreateICmp(
      P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
      "min.iters.check");

  if (!ForEpilogue)
    TCCheckBlock->setName("vector.main.loop.iter.check");

  // Create new preheader for vector loop.
  LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
                                   DT, LI, nullptr, "vector.ph");

  if (ForEpilogue) {
    assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                                 DT->getNode(Bypass)->getIDom()) &&
           "TC check is expected to dominate Bypass");

    // Update dominator for Bypass & LoopExit.
    DT->changeImmediateDominator(Bypass, TCCheckBlock);
    if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
      // For loops with multiple exits, there's no edge from the middle block
      // to exit blocks (as the epilogue must run) and thus no need to update
      // the immediate dominator of the exit blocks.
      DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);

    LoopBypassBlocks.push_back(TCCheckBlock);

    // Save the trip count so we don't have to regenerate it in the
    // vec.epilog.iter.check. This is safe to do because the trip count
    // generated here dominates the vector epilog iter check.
    EPI.TripCount = Count;
  }

  ReplaceInstWithInst(
      TCCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));

  return TCCheckBlock;
}

//===--------------------------------------------------------------------===//
// EpilogueVectorizerEpilogueLoop
//===--------------------------------------------------------------------===//

/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
BasicBlock *
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
  MDNode *OrigLoopID = OrigLoop->getLoopID();
  Loop *Lp = createVectorLoopSkeleton("vec.epilog.");

  // Now, compare the remaining count and if there aren't enough iterations to
  // execute the vectorized epilogue skip to the scalar part.
  BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
  VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
  LoopVectorPreHeader =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, "vec.epilog.ph");
  emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
                                          VecEpilogueIterationCountCheck);

  // Adjust the control flow taking the state info from the main loop
  // vectorization into account.
  assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
         "expected this to be saved from the previous pass.");
  // Retarget the main-loop iteration-count check to the new epilogue
  // preheader.
  EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
      VecEpilogueIterationCountCheck, LoopVectorPreHeader);

  DT->changeImmediateDominator(LoopVectorPreHeader,
                               EPI.MainLoopIterationCountCheck);

  // All remaining bypass checks from the first pass now branch to the scalar
  // preheader instead of the (renamed) epilogue iteration-count check.
  EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
      VecEpilogueIterationCountCheck, LoopScalarPreHeader);

  if (EPI.SCEVSafetyCheck)
    EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
        VecEpilogueIterationCountCheck, LoopScalarPreHeader);
  if (EPI.MemSafetyCheck)
    EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
        VecEpilogueIterationCountCheck, LoopScalarPreHeader);

  DT->changeImmediateDominator(
      VecEpilogueIterationCountCheck,
      VecEpilogueIterationCountCheck->getSinglePredecessor());

  DT->changeImmediateDominator(LoopScalarPreHeader,
                               EPI.EpilogueIterationCountCheck);
  if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks  and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock,
                                 EPI.EpilogueIterationCountCheck);

  // Keep track of bypass blocks, as they feed start values to the induction
  // phis in the scalar loop preheader.
  if (EPI.SCEVSafetyCheck)
    LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
  if (EPI.MemSafetyCheck)
    LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
  LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  // Resume at the main loop's vector trip count, or at 0 when the main vector
  // loop was skipped entirely.
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate the induction variable.
  OldInduction = Legal->getPrimaryInduction();
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
  Value *StartIdx = EPResumeVal;
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to iteration count
  // check, then the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues(Lp, CountRoundDown,
                              {VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  AddRuntimeUnrollDisableMetaData(Lp);
  return completeLoopSkeleton(Lp, OrigLoopID);
}

/// Emit the check that branches to \p Bypass (the scalar preheader) when
/// fewer than EpilogueVF * EpilogueUF iterations remain after the main
/// vector loop. The check is placed in \p Insert; returns \p Insert.
BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been safed in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  // Iterations left for the epilogue = original trip count minus what the
  // main vector loop consumed.
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of the
  // vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
      ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;

  Value *CheckMinIters =
      Builder.CreateICmp(P, Count,
                         createStepForVF(Builder, Count->getType(),
                                         EPI.EpilogueVF, EPI.EpilogueUF),
                         "min.epilog.iters.check");

  ReplaceInstWithInst(
      Insert->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));

  LoopBypassBlocks.push_back(Insert);
  return Insert;
}

/// Debug trace emitted before the second (epilogue) pass of epilogue
/// vectorization.
void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
  LLVM_DEBUG({
    dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
           << "Epilogue Loop VF:" << EPI.EpilogueVF
           << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
  });
}

void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
  DEBUG_WITH_TYPE(VerboseDebug, {
    dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
  });
}

/// Evaluate \p Predicate over \p Range (powers of two from Range.Start), and
/// clamp Range.End to the first VF where the answer differs from the answer
/// at Range.Start, so that every VF left in the range shares one decision.
/// Returns the predicate's value at Range.Start.
bool LoopVectorizationPlanner::getDecisionAndClampRange(
    const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
  assert(!Range.isEmpty() && "Trying to test an empty VF range.");
  bool PredicateAtRangeStart = Predicate(Range.Start);

  for (ElementCount TmpVF = Range.Start * 2;
       ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
    if (Predicate(TmpVF) != PredicateAtRangeStart) {
      Range.End = TmpVF;
      break;
    }

  return PredicateAtRangeStart;
}

/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VF's starting at a given VF and extending it as much as possible. Each
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
                                           ElementCount MaxVF) {
  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(buildVPlan(SubRange));
    // buildVPlan may have clamped SubRange.End; resume from there.
    VF = SubRange.End;
  }
}

/// Create (or fetch the cached) mask for the CFG edge \p Src -> \p Dst.
/// A nullptr result means the edge mask is all-ones.
VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
                                         VPlanPtr &Plan) {
  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");

  // Look for cached value.
  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
  if (ECEntryIt != EdgeMaskCache.end())
    return ECEntryIt->second;

  VPValue *SrcMask = createBlockInMask(Src, Plan);

  // The terminator has to be a branch inst!
  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
  assert(BI && "Unexpected terminator found");

  // An unconditional branch (or both successors identical) contributes no
  // extra condition beyond the source block's own mask.
  if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
    return EdgeMaskCache[Edge] = SrcMask;

  // If source is an exiting block, we know the exit edge is dynamically dead
  // in the vector loop, and thus we don't need to restrict the mask. Avoid
  // adding uses of an otherwise potentially dead instruction.
  if (OrigLoop->isLoopExiting(Src))
    return EdgeMaskCache[Edge] = SrcMask;

  VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
  assert(EdgeMask && "No Edge Mask found for condition");

  if (BI->getSuccessor(0) != Dst)
    EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());

  if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
    // The condition is 'SrcMask && EdgeMask', which is equivalent to
    // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
    // The select version does not introduce new UB if SrcMask is false and
    // EdgeMask is poison. Using 'and' here introduces undefined behavior.
    VPValue *False = Plan->getOrAddVPValue(
        ConstantInt::getFalse(BI->getCondition()->getType()));
    EdgeMask =
        Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
  }

  return EdgeMaskCache[Edge] = EdgeMask;
}

/// Create (or fetch the cached) mask for block \p BB. A nullptr mask means
/// all-ones, i.e. no predication is needed for the block.
VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  // Look for cached value.
  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
  if (BCEntryIt != BlockMaskCache.end())
    return BCEntryIt->second;

  // All-one mask is modelled as no-mask following the convention for masked
  // load/store/gather/scatter. Initialize BlockMask to no-mask.
  VPValue *BlockMask = nullptr;

  if (OrigLoop->getHeader() == BB) {
    if (!CM.blockNeedsPredicationForAnyReason(BB))
      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.

    // Introduce the early-exit compare IV <= BTC to form header block mask.
    // This is used instead of IV < TC because TC may wrap, unlike BTC.
    // Start by constructing the desired canonical IV in the header block.
    VPValue *IV = nullptr;
    if (Legal->getPrimaryInduction())
      IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
    else {
      VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock();
      auto *IVRecipe = new VPWidenCanonicalIVRecipe();
      HeaderVPBB->insert(IVRecipe, HeaderVPBB->getFirstNonPhi());
      IV = IVRecipe;
    }

    // Create the block in mask as the first non-phi instruction in the block.
    // Temporarily redirect the builder to the start of the (non-phi part of
    // the) current block; the guard restores the insert point on scope exit.
    VPBuilder::InsertPointGuard Guard(Builder);
    auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
    Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);

    VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
    bool TailFolded = !CM.isScalarEpilogueAllowed();

    if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
      // While ActiveLaneMask is a binary op that consumes the loop tripcount
      // as a second argument, we only pass the IV here and extract the
      // tripcount from the transform state where codegen of the VP instructions
      // happen.
      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
    } else {
      // Header mask is IV <= backedge-taken count (see rationale above).
      BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
    }
    return BlockMaskCache[BB] = BlockMask;
  }

  // This is the block mask. We OR all incoming edges.
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
    if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
      return BlockMaskCache[BB] = EdgeMask;

    if (!BlockMask) { // BlockMask has its initialized nullptr value.
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
  }

  return BlockMaskCache[BB] = BlockMask;
}

/// Check whether the load/store \p I should be widened for the VFs in
/// \p Range (clamping the range so the decision is uniform across it) and, if
/// so, build a VPWidenMemoryInstructionRecipe for it. Returns nullptr when the
/// access will not be widened (e.g. it will be scalarized).
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
                                                ArrayRef<VPValue *> Operands,
                                                VFRange &Range,
                                                VPlanPtr &Plan) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Must be called with either a load or store");

  auto willWiden = [&](ElementCount VF) -> bool {
    // Scalar VF never widens.
    if (VF.isScalar())
      return false;
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    if (Decision == LoopVectorizationCostModel::CM_Interleave)
      return true;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  // Only compute a mask when legality says the access needs one; a null mask
  // means an unmasked (all-ones) access.
  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = createBlockInMask(I->getParent(), Plan);

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  // Range.End was clamped above, so the decision at Range.Start holds for the
  // whole remaining range.
  LoopVectorizationCostModel::InstWidening Decision =
      CM.getWideningDecision(I, Range.Start);
  bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
  bool Consecutive =
      Reverse || Decision == LoopVectorizationCostModel::CM_Widen;

  if (LoadInst *Load = dyn_cast<LoadInst>(I))
    return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
                                              Consecutive, Reverse);

  // For a store, Operands[1] is the address and Operands[0] the stored value.
  StoreInst *Store = cast<StoreInst>(I);
  return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
                                            Mask, Consecutive, Reverse);
}

/// If \p Phi is an integer or floating-point induction, build and return a
/// recipe producing its scalar and vector values; otherwise return nullptr.
VPWidenIntOrFpInductionRecipe *
VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
                                           ArrayRef<VPValue *> Operands) const {
  // Check if this is an integer or fp induction. If so, build the recipe that
  // produces its scalar and vector values.
  if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) {
    assert(II->getStartValue() ==
           Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
    return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II);
  }

  return nullptr;
}

/// If the truncate \p I is a truncation of an optimizable induction variable
/// for all VFs in \p Range (clamping the range accordingly), build a widened
/// induction recipe that folds the truncate; otherwise return nullptr.
VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
    TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
    VPlan &Plan) const {
  // Optimize the special case where the source is a constant integer
  // induction variable. Notice that we can only optimize the 'trunc' case
  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
  // (c) other casts depend on pointer size.

  // Determine whether \p K is a truncation based on an induction variable that
  // can be optimized.
  auto isOptimizableIVTruncate =
      [&](Instruction *K) -> std::function<bool(ElementCount)> {
    return [=](ElementCount VF) -> bool {
      return CM.isOptimizableIVTruncate(K, VF);
    };
  };

  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          isOptimizableIVTruncate(I), Range)) {

    auto *Phi = cast<PHINode>(I->getOperand(0));
    const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
    VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
    // Passing I attaches the truncate to the induction recipe so its result
    // is produced in the truncated type.
    return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I);
  }
  return nullptr;
}

/// Handle a non-header phi \p Phi by turning it into a blend of its incoming
/// values under their edge masks, or by reusing the single distinct incoming
/// VPValue directly when all incoming values are identical.
VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
                                                ArrayRef<VPValue *> Operands,
                                                VPlanPtr &Plan) {
  // If all incoming values are equal, the incoming VPValue can be used directly
  // instead of creating a new VPBlendRecipe.
  VPValue *FirstIncoming = Operands[0];
  if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
        return FirstIncoming == Inc;
      })) {
    return Operands[0];
  }

  // We know that all PHIs in non-header blocks are converted into selects, so
  // we don't have to worry about the insertion order and we can just use the
  // builder. At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.
  // Interleave each incoming value with its edge mask:
  // [val0, mask0, val1, mask1, ...] — the layout VPBlendRecipe expects.
  SmallVector<VPValue *, 2> OperandsWithMask;
  unsigned NumIncoming = Phi->getNumIncomingValues();

  for (unsigned In = 0; In < NumIncoming; In++) {
    VPValue *EdgeMask =
        createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
    assert((EdgeMask || NumIncoming == 1) &&
           "Multiple predecessors with one having a full mask");
    OperandsWithMask.push_back(Operands[In]);
    if (EdgeMask)
      OperandsWithMask.push_back(EdgeMask);
  }
  return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
}

/// Try to build a recipe widening the call \p CI for the VFs in \p Range
/// (clamping the range so the decision is uniform). Returns nullptr when the
/// call must be scalarized/predicated, is a no-op intrinsic, or widening is
/// not beneficial.
VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
                                                   ArrayRef<VPValue *> Operands,
                                                   VFRange &Range) const {

  // Note: the predication query does not depend on VF; the lambda still runs
  // per-VF so the range is clamped consistently.
  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); },
      Range);

  if (IsPredicated)
    return nullptr;

  // Side-effect-free / metadata-only intrinsics are dropped rather than
  // widened.
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
             ID == Intrinsic::pseudoprobe ||
             ID == Intrinsic::experimental_noalias_scope_decl))
    return nullptr;

  auto willWiden = [&](ElementCount VF) -> bool {
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag shows whether we use Intrinsic or a usual Call for vectorized
    // version of the instruction.
    // Is it beneficial to perform intrinsic call compared to lib call?
    bool NeedToScalarize = false;
    InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
    InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
    bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
    return UseVectorIntrinsic || !NeedToScalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  // Only forward the actual call arguments (Operands may carry extras).
  ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
  return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
}

/// Return true if \p I should be widened (rather than scalarized) for every VF
/// remaining in \p Range; clamps the range so the answer is uniform across it.
bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
  assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
         !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // Instruction should be widened, unless it is scalar after vectorization,
  // scalarization is profitable or it is predicated.
  auto WillScalarize = [this, I](ElementCount VF) -> bool {
    return CM.isScalarAfterVectorization(I, VF) ||
           CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I);
  };
  return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
                                                             Range);
}

/// Build a VPWidenRecipe for \p I if its opcode is in the whitelist of
/// vectorizable opcodes below; otherwise return nullptr.
VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
                                           ArrayRef<VPValue *> Operands) const {
  auto IsVectorizableOpcode = [](unsigned Opcode) {
    switch (Opcode) {
    case Instruction::Add:
    case Instruction::And:
    case Instruction::AShr:
    case Instruction::BitCast:
    case Instruction::FAdd:
    case Instruction::FCmp:
    case Instruction::FDiv:
    case Instruction::FMul:
    case Instruction::FNeg:
    case Instruction::FPExt:
    case Instruction::FPToSI:
    case Instruction::FPToUI:
    case Instruction::FPTrunc:
    case Instruction::FRem:
    case Instruction::FSub:
    case Instruction::ICmp:
    case Instruction::IntToPtr:
    case Instruction::LShr:
    case Instruction::Mul:
    case Instruction::Or:
    case Instruction::PtrToInt:
    case Instruction::SDiv:
    case Instruction::Select:
    case Instruction::SExt:
    case Instruction::Shl:
    case Instruction::SIToFP:
    case Instruction::SRem:
    case Instruction::Sub:
    case Instruction::Trunc:
    case Instruction::UDiv:
    case Instruction::UIToFP:
    case Instruction::URem:
    case Instruction::Xor:
    case Instruction::ZExt:
      return true;
    }
    return false;
  };

  if (!IsVectorizableOpcode(I->getOpcode()))
    return nullptr;

  // Success: widen this instruction.
  return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
}

/// Add the missing backedge operand to each widened header phi recorded in
/// PhisToFix, taken from the recipe of the value incoming via the loop latch.
/// Must run after all recipes have been created.
void VPRecipeBuilder::fixHeaderPhis() {
  BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
  for (VPWidenPHIRecipe *R : PhisToFix) {
    auto *PN = cast<PHINode>(R->getUnderlyingValue());
    VPRecipeBase *IncR =
        getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
    R->addOperand(IncR->getVPSingleValue());
  }
}

/// Build a VPReplicateRecipe for \p I (one scalar copy per lane, or a single
/// copy if uniform). Unpredicated recipes are appended to \p VPBB; predicated
/// ones are wrapped in an if-then replicate region spliced after \p VPBB.
/// Returns the block recipe construction should continue in.
VPBasicBlock *VPRecipeBuilder::handleReplication(
    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
    VPlanPtr &Plan) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); },
      Range);

  // Even if the instruction is not marked as uniform, there are certain
  // intrinsic calls that can be effectively treated as such, so we check for
  // them here. Conservatively, we only do this for scalable vectors, since
  // for fixed-width VFs we can always fall back on full scalarization.
  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
    switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
    case Intrinsic::assume:
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // For scalable vectors if one of the operands is variant then we still
      // want to mark as uniform, which will generate one instruction for just
      // the first lane of the vector. We can't scalarize the call in the same
      // way as for fixed-width vectors because we don't know how many lanes
      // there are.
      //
      // The reasons for doing it this way for scalable vectors are:
      // 1. For the assume intrinsic generating the instruction for the first
      //    lane is still be better than not generating any at all. For
      //    example, the input may be a splat across all lanes.
      // 2. For the lifetime start/end intrinsics the pointer operand only
      //    does anything useful when the input comes from a stack object,
      //    which suggests it should always be uniform. For non-stack objects
      //    the effect is to poison the object, which still allows us to
      //    remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }

  auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                       IsUniform, IsPredicated);
  setRecipe(I, Recipe);
  Plan->addVPValue(I, Recipe);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (VPValue *Op : Recipe->operands()) {
    auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
    if (!PredR)
      continue;
    auto *RepR =
        cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
    assert(RepR->isPredicated() &&
           "expected Replicate recipe to be predicated");
    RepR->setAlsoPack(false);
  }

  // Finalize the recipe for Instr, first if it is not predicated.
  if (!IsPredicated) {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
    VPBB->appendRecipe(Recipe);
    return VPBB;
  }
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");

  // Splice a triangular if-then region between VPBB and its successor, so the
  // replicated instruction only runs for active lanes.
  VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
  assert(SingleSucc && "VPBB must have a single successor when handling "
                       "predicated replication.");
  VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
  // Record predicated instructions for above packing optimizations.
  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
  VPBlockUtils::insertBlockAfter(Region, VPBB);
  auto *RegSucc = new VPBasicBlock();
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
  VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
  return RegSucc;
}

/// Build the single-entry single-exit replicate region that guards the
/// predicated recipe \p PredRecipe for \p Instr with the parent block's mask.
VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects.

  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  // Void-typed instructions produce no value, so no merge phi is needed.
  auto *PHIRecipe = Instr->getType()->isVoidTy()
                        ? nullptr
                        : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
  if (PHIRecipe) {
    // The phi, not the predicated recipe itself, now models Instr's result.
    Plan->removeVPValueFor(Instr);
    Plan->addVPValue(Instr, PHIRecipe);
  }
  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
  VPBlockUtils::connectBlocks(Pred, Exit);

  return Region;
}

/// Dispatch \p Instr to the specialized widening helpers (calls, memory ops,
/// inductions, phis, GEPs, selects, generic ops), clamping \p Range so the
/// decision is uniform. Returns a recipe, an existing VPValue to reuse, or
/// nullptr if \p Instr must instead be replicated/scalarized.
VPRecipeOrVPValueTy
VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
                                        ArrayRef<VPValue *> Operands,
                                        VFRange &Range, VPlanPtr &Plan) {
  // First, check for specific widening recipes that deal with calls, memory
  // operations, inductions and Phi nodes.
  if (auto *CI = dyn_cast<CallInst>(Instr))
    return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));

  VPRecipeBase *Recipe;
  if (auto Phi = dyn_cast<PHINode>(Instr)) {
    // Non-header phis become blends (selects) of their incoming values.
    if (Phi->getParent() != OrigLoop->getHeader())
      return tryToBlend(Phi, Operands, Plan);
    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
      return toVPRecipeResult(Recipe);

    VPWidenPHIRecipe *PhiRecipe = nullptr;
    if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
      VPValue *StartV = Operands[0];
      if (Legal->isReductionVariable(Phi)) {
        const RecurrenceDescriptor &RdxDesc =
            Legal->getReductionVars().find(Phi)->second;
        assert(RdxDesc.getRecurrenceStartValue() ==
               Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
        PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
                                             CM.isInLoopReduction(Phi),
                                             CM.useOrderedReductions(RdxDesc));
      } else {
        PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
      }

      // Record the incoming value from the backedge, so we can add the incoming
      // value from the backedge after all recipes have been created.
      recordRecipeOf(cast<Instruction>(
          Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
      PhisToFix.push_back(PhiRecipe);
    } else {
      // TODO: record start and backedge value for remaining pointer induction
      // phis.
      assert(Phi->getType()->isPointerTy() &&
             "only pointer phis should be handled here");
      PhiRecipe = new VPWidenPHIRecipe(Phi);
    }

    return toVPRecipeResult(PhiRecipe);
  }

  if (isa<TruncInst>(Instr) &&
      (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
                                               Range, *Plan)))
    return toVPRecipeResult(Recipe);

  // Not one of the special cases: either widen generically or fall back to
  // replication (signalled by returning nullptr).
  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return toVPRecipeResult(new VPWidenGEPRecipe(
        GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return toVPRecipeResult(new VPWidenSelectRecipe(
        *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
  }

  return toVPRecipeResult(tryToWiden(Instr, Operands));
}

/// Build VPlans (with recipes) for the sub-ranges covering {\p MinVF, ...,
/// \p MaxVF}, after collecting dead instructions and fixing up sink-after
/// targets that are themselves dead.
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattend. If the
  // control flow is preserved, we should keep them.
  // (TODO continued:) If the control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  // Cannot sink instructions after dead instructions (there won't be any
  // recipes for them). Instead, find the first non-dead previous instruction.
  for (auto &P : Legal->getSinkAfter()) {
    Instruction *SinkTarget = P.second;
    Instruction *FirstInst = &*SinkTarget->getParent()->begin();
    (void)FirstInst; // Only used by the assertion below.
    while (DeadInstructions.contains(SinkTarget)) {
      assert(
          SinkTarget != FirstInst &&
          "Must find a live instruction (at least the one feeding the "
          "first-order recurrence PHI) before reaching beginning of the block");
      SinkTarget = SinkTarget->getPrevNode();
      assert(SinkTarget != P.first &&
             "sink source equals target, no sinking required");
    }
    P.second = SinkTarget;
  }

  // Build one VPlan per decision-uniform sub-range; each call may clamp
  // SubRange.End, and the next range resumes from there.
  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const MapVector<Instruction *, Instruction *> &SinkAfter) {

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction:
record ingredients whose recipes we'll need to further 8972 // process after constructing the initial VPlan. 8973 // --------------------------------------------------------------------------- 8974 8975 // Mark instructions we'll need to sink later and their targets as 8976 // ingredients whose recipe we'll need to record. 8977 for (auto &Entry : SinkAfter) { 8978 RecipeBuilder.recordRecipeOf(Entry.first); 8979 RecipeBuilder.recordRecipeOf(Entry.second); 8980 } 8981 for (auto &Reduction : CM.getInLoopReductionChains()) { 8982 PHINode *Phi = Reduction.first; 8983 RecurKind Kind = 8984 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); 8985 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8986 8987 RecipeBuilder.recordRecipeOf(Phi); 8988 for (auto &R : ReductionOperations) { 8989 RecipeBuilder.recordRecipeOf(R); 8990 // For min/max reducitons, where we have a pair of icmp/select, we also 8991 // need to record the ICmp recipe, so it can be removed later. 8992 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 8993 "Only min/max recurrences allowed for inloop reductions"); 8994 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 8995 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8996 } 8997 } 8998 8999 // For each interleave group which is relevant for this (possibly trimmed) 9000 // Range, add it to the set of groups to be later applied to the VPlan and add 9001 // placeholders for its members' Recipes which we'll be replacing with a 9002 // single VPInterleaveRecipe. 
9003 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 9004 auto applyIG = [IG, this](ElementCount VF) -> bool { 9005 return (VF.isVector() && // Query is illegal for VF == 1 9006 CM.getWideningDecision(IG->getInsertPos(), VF) == 9007 LoopVectorizationCostModel::CM_Interleave); 9008 }; 9009 if (!getDecisionAndClampRange(applyIG, Range)) 9010 continue; 9011 InterleaveGroups.insert(IG); 9012 for (unsigned i = 0; i < IG->getFactor(); i++) 9013 if (Instruction *Member = IG->getMember(i)) 9014 RecipeBuilder.recordRecipeOf(Member); 9015 }; 9016 9017 // --------------------------------------------------------------------------- 9018 // Build initial VPlan: Scan the body of the loop in a topological order to 9019 // visit each basic block after having visited its predecessor basic blocks. 9020 // --------------------------------------------------------------------------- 9021 9022 // Create initial VPlan skeleton, with separate header and latch blocks. 9023 VPBasicBlock *HeaderVPBB = new VPBasicBlock(); 9024 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 9025 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 9026 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); 9027 auto Plan = std::make_unique<VPlan>(TopRegion); 9028 9029 // Scan the body of the loop in a topological order to visit each basic block 9030 // after having visited its predecessor basic blocks. 9031 LoopBlocksDFS DFS(OrigLoop); 9032 DFS.perform(LI); 9033 9034 VPBasicBlock *VPBB = HeaderVPBB; 9035 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 9036 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9037 // Relevant instructions from basic block BB will be grouped into VPRecipe 9038 // ingredients and fill a new VPBasicBlock. 9039 unsigned VPBBsForBB = 0; 9040 VPBB->setName(BB->getName()); 9041 Builder.setInsertPoint(VPBB); 9042 9043 // Introduce each ingredient into VPlan. 
9044 // TODO: Model and preserve debug instrinsics in VPlan. 9045 for (Instruction &I : BB->instructionsWithoutDebug()) { 9046 Instruction *Instr = &I; 9047 9048 // First filter out irrelevant instructions, to ensure no recipes are 9049 // built for them. 9050 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9051 continue; 9052 9053 SmallVector<VPValue *, 4> Operands; 9054 auto *Phi = dyn_cast<PHINode>(Instr); 9055 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9056 Operands.push_back(Plan->getOrAddVPValue( 9057 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9058 } else { 9059 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9060 Operands = {OpRange.begin(), OpRange.end()}; 9061 } 9062 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9063 Instr, Operands, Range, Plan)) { 9064 // If Instr can be simplified to an existing VPValue, use it. 9065 if (RecipeOrValue.is<VPValue *>()) { 9066 auto *VPV = RecipeOrValue.get<VPValue *>(); 9067 Plan->addVPValue(Instr, VPV); 9068 // If the re-used value is a recipe, register the recipe for the 9069 // instruction, in case the recipe for Instr needs to be recorded. 9070 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9071 RecipeBuilder.setRecipe(Instr, R); 9072 continue; 9073 } 9074 // Otherwise, add the new recipe. 9075 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9076 for (auto *Def : Recipe->definedValues()) { 9077 auto *UV = Def->getUnderlyingValue(); 9078 Plan->addVPValue(UV, Def); 9079 } 9080 9081 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 9082 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 9083 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 9084 // of the header block. That can happen for truncates of induction 9085 // variables. Those recipes are moved to the phi section of the header 9086 // block after applying SinkAfter, which relies on the original 9087 // position of the trunc. 
9088 assert(isa<TruncInst>(Instr)); 9089 InductionsToMove.push_back( 9090 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 9091 } 9092 RecipeBuilder.setRecipe(Instr, Recipe); 9093 VPBB->appendRecipe(Recipe); 9094 continue; 9095 } 9096 9097 // Otherwise, if all widening options failed, Instruction is to be 9098 // replicated. This may create a successor for VPBB. 9099 VPBasicBlock *NextVPBB = 9100 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9101 if (NextVPBB != VPBB) { 9102 VPBB = NextVPBB; 9103 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9104 : ""); 9105 } 9106 } 9107 9108 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 9109 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 9110 } 9111 9112 // Fold the last, empty block into its predecessor. 9113 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 9114 assert(VPBB && "expected to fold last (empty) block"); 9115 // After here, VPBB should not be used. 9116 VPBB = nullptr; 9117 9118 assert(isa<VPRegionBlock>(Plan->getEntry()) && 9119 !Plan->getEntry()->getEntryBasicBlock()->empty() && 9120 "entry block must be set to a VPRegionBlock having a non-empty entry " 9121 "VPBasicBlock"); 9122 RecipeBuilder.fixHeaderPhis(); 9123 9124 // --------------------------------------------------------------------------- 9125 // Transform initial VPlan: Apply previously taken decisions, in order, to 9126 // bring the VPlan to its final state. 9127 // --------------------------------------------------------------------------- 9128 9129 // Apply Sink-After legal constraints. 
9130 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9131 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9132 if (Region && Region->isReplicator()) { 9133 assert(Region->getNumSuccessors() == 1 && 9134 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9135 assert(R->getParent()->size() == 1 && 9136 "A recipe in an original replicator region must be the only " 9137 "recipe in its block"); 9138 return Region; 9139 } 9140 return nullptr; 9141 }; 9142 for (auto &Entry : SinkAfter) { 9143 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9144 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9145 9146 auto *TargetRegion = GetReplicateRegion(Target); 9147 auto *SinkRegion = GetReplicateRegion(Sink); 9148 if (!SinkRegion) { 9149 // If the sink source is not a replicate region, sink the recipe directly. 9150 if (TargetRegion) { 9151 // The target is in a replication region, make sure to move Sink to 9152 // the block after it, not into the replication region itself. 9153 VPBasicBlock *NextBlock = 9154 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9155 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9156 } else 9157 Sink->moveAfter(Target); 9158 continue; 9159 } 9160 9161 // The sink source is in a replicate region. Unhook the region from the CFG. 9162 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9163 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9164 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9165 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9166 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9167 9168 if (TargetRegion) { 9169 // The target recipe is also in a replicate region, move the sink region 9170 // after the target region. 
9171 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9172 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9173 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9174 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9175 } else { 9176 // The sink source is in a replicate region, we need to move the whole 9177 // replicate region, which should only contain a single recipe in the 9178 // main block. 9179 auto *SplitBlock = 9180 Target->getParent()->splitAt(std::next(Target->getIterator())); 9181 9182 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9183 9184 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9185 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9186 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9187 } 9188 } 9189 9190 VPlanTransforms::removeRedundantInductionCasts(*Plan); 9191 9192 // Now that sink-after is done, move induction recipes for optimized truncates 9193 // to the phi section of the header block. 9194 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 9195 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9196 9197 // Adjust the recipes for any inloop reductions. 9198 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan, 9199 RecipeBuilder, Range.Start); 9200 9201 // Introduce a recipe to combine the incoming and previous values of a 9202 // first-order recurrence. 
9203 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9204 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9205 if (!RecurPhi) 9206 continue; 9207 9208 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9209 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 9210 auto *Region = GetReplicateRegion(PrevRecipe); 9211 if (Region) 9212 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9213 if (Region || PrevRecipe->isPhi()) 9214 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 9215 else 9216 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9217 9218 auto *RecurSplice = cast<VPInstruction>( 9219 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9220 {RecurPhi, RecurPhi->getBackedgeValue()})); 9221 9222 RecurPhi->replaceAllUsesWith(RecurSplice); 9223 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9224 // all users. 9225 RecurSplice->setOperand(0, RecurPhi); 9226 } 9227 9228 // Interleave memory: for each Interleave Group we marked earlier as relevant 9229 // for this VPlan, replace the Recipes widening its memory instructions with a 9230 // single VPInterleaveRecipe at its insertion point. 
9231 for (auto IG : InterleaveGroups) { 9232 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9233 RecipeBuilder.getRecipe(IG->getInsertPos())); 9234 SmallVector<VPValue *, 4> StoredValues; 9235 for (unsigned i = 0; i < IG->getFactor(); ++i) 9236 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9237 auto *StoreR = 9238 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9239 StoredValues.push_back(StoreR->getStoredValue()); 9240 } 9241 9242 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9243 Recipe->getMask()); 9244 VPIG->insertBefore(Recipe); 9245 unsigned J = 0; 9246 for (unsigned i = 0; i < IG->getFactor(); ++i) 9247 if (Instruction *Member = IG->getMember(i)) { 9248 if (!Member->getType()->isVoidTy()) { 9249 VPValue *OriginalV = Plan->getVPValue(Member); 9250 Plan->removeVPValueFor(Member); 9251 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9252 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9253 J++; 9254 } 9255 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9256 } 9257 } 9258 9259 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9260 // in ways that accessing values using original IR values is incorrect. 9261 Plan->disableValue2VPValue(); 9262 9263 VPlanTransforms::sinkScalarOperands(*Plan); 9264 VPlanTransforms::mergeReplicateRegions(*Plan); 9265 9266 std::string PlanName; 9267 raw_string_ostream RSO(PlanName); 9268 ElementCount VF = Range.Start; 9269 Plan->addVF(VF); 9270 RSO << "Initial VPlan for VF={" << VF; 9271 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9272 Plan->addVF(VF); 9273 RSO << "," << VF; 9274 } 9275 RSO << "},UF>=1"; 9276 RSO.flush(); 9277 Plan->setName(PlanName); 9278 9279 // Fold Exit block into its predecessor if possible. 9280 // TODO: Fold block earlier once all VPlan transforms properly maintain a 9281 // VPBasicBlock as exit. 
  // (Tail of buildVPlanWithVPRecipes.) Fold the region's exit VPBasicBlock
  // into its predecessor when possible, then verify the finished plan.
  VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit());

  assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
  return Plan;
}

/// Build an initial VPlan for the VPlan-native (outer-loop) path: construct
/// the hierarchical CFG for \p OrigLoop, record every candidate VF in
/// [Range.Start, Range.End) (doubling each step), and — unless explicit
/// predication was requested — lower the plain VPInstructions to recipes.
VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  // Record all VFs this plan is valid for.
  for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
       VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanTransforms::VPInstructionsToVPRecipes(
      OrigLoop, Plan,
      [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
      DeadInstructions, *PSE.getSE());
  return Plan;
}

// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi need to be converted
// to reductions, with one operand being vector and the other being the scalar
// reduction chain. For other reductions, a select is introduced between the phi
// and live-out recipes when folding the tail.
void LoopVectorizationPlanner::adjustRecipesForReductions(
    VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
    ElementCount MinVF) {
  // Rewrite each in-loop reduction chain: replace the widened recipe of every
  // link with a VPReductionRecipe that takes the (scalar) chain as one operand
  // and the widened value as the other.
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    const RecurrenceDescriptor &RdxDesc =
        Legal->getReductionVars().find(Phi)->second;
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    // Scalar plans only need rewriting for ordered (strict FP) reductions.
    if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
      continue;

    // ReductionOperations are orders top-down from the phi's use to the
    // LoopExitValue. We keep a track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
             "Only min/max recurrences allowed for inloop reductions");
      // Recognize a call to the llvm.fmuladd intrinsic.
      bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
      assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
             "Expected instruction to be a call to the llvm.fmuladd intrinsic");
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        // For min/max, R is the select; operand 0 is its compare.
        assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSelectSC");
        FirstOpId = 1;
      } else {
        assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
                (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
               "Expected to replace a VPWidenSC");
        FirstOpId = 0;
      }
      // The operand that is not the chain is the one to be vector-reduced.
      unsigned VecOpId =
          R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
      VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));

      // When folding the tail, the reduction is predicated by the block mask.
      auto *CondOp = CM.foldTailByMasking()
                         ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
                         : nullptr;

      if (IsFMulAdd) {
        // If the instruction is a call to the llvm.fmuladd intrinsic then we
        // need to create an fmul recipe to use as the vector operand for the
        // fadd reduction.
        VPInstruction *FMulRecipe = new VPInstruction(
            Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
        FMulRecipe->setFastMathFlags(R->getFastMathFlags());
        WidenRecipe->getParent()->insert(FMulRecipe,
                                         WidenRecipe->getIterator());
        VecOp = FMulRecipe;
      }
      VPReductionRecipe *RedRecipe =
          new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
      WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
      Plan->removeVPValueFor(R);
      Plan->addVPValue(R, RedRecipe);
      WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
      // NOTE(review): this second replaceAllUsesWith appears redundant — all
      // users of WidenRecipe's value were already redirected to RedRecipe
      // above, so it should be a no-op. Harmless, but worth confirming.
      WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
      WidenRecipe->eraseFromParent();

      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        // The widened compare feeding the select is now dead: erase it.
        VPRecipeBase *CompareRecipe =
            RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
        assert(isa<VPWidenRecipe>(CompareRecipe) &&
               "Expected to replace a VPWidenSC");
        assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
               "Expected no remaining users");
        CompareRecipe->eraseFromParent();
      }
      Chain = R;
    }
  }

  // If tail is folded by masking, introduce selects between the phi
  // and the live-out instruction of each reduction, at the end of the latch.
  if (CM.foldTailByMasking()) {
    for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
      VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
      if (!PhiR || PhiR->isInLoop())
        continue;
      Builder.setInsertPoint(LatchVPBB);
      VPValue *Cond =
          RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
      VPValue *Red = PhiR->getBackedgeValue();
      // select(header-mask, backedge-value, phi) keeps inactive lanes at the
      // phi's previous value.
      Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
    }
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Debug printer: dumps the interleave group's factor, insert position,
/// address, optional mask, and one line per member (store operand or load
/// result).
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  O << ", ";
  getAddr()->printAsOperand(O, SlotTracker);
  VPValue *Mask = getMask();
  if (Mask) {
    O << ", ";
    Mask->printAsOperand(O, SlotTracker);
  }

  unsigned OpIdx = 0;
  for (unsigned i = 0; i < IG->getFactor(); ++i) {
    if (!IG->getMember(i))
      continue;
    if (getNumStoreOperands() > 0) {
      O << "\n" << Indent << " store ";
      getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
      O << " to index " << i;
    } else {
      O << "\n" << Indent << " ";
      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
      O << " = load from index " << i;
    }
    ++OpIdx;
  }
}
#endif

// Widening of calls is delegated entirely to the InnerLoopVectorizer.
void VPWidenCallRecipe::execute(VPTransformState &State) {
  State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
                                  *this, State);
}

// Widen a select: one vector select per unroll part.
void VPWidenSelectRecipe::execute(VPTransformState &State) {
  auto &I = *cast<SelectInst>(getUnderlyingInstr());
  State.ILV->setDebugLocFromInst(&I);

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.ILV->addMetadata(Sel, &I);
  }
}

/// Widen a single-result instruction by emitting one equivalent vector
/// instruction per unroll part: unops/binops, compares, and casts. Opcodes
/// handled by dedicated recipes (call/br/phi/gep/select) are unreachable here.
void VPWidenRecipe::execute(VPTransformState &State) {
  auto &I = *cast<Instruction>(getUnderlyingValue());
  auto &Builder = State.Builder;
  switch (I.getOpcode()) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    State.ILV->setDebugLocFromInst(&I);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V)) {
        VecOp->copyIRFlags(&I);

        // If the instruction is vectorized and was in a basic block that needed
        // predication, we can't propagate poison-generating flags (nuw/nsw,
        // exact, etc.). The control flow has been linearized and the
        // instruction is no longer guarded by the predicate, which could make
        // the flag properties to no longer hold.
        if (State.MayGeneratePoisonRecipes.contains(this))
          VecOp->dropPoisonGeneratingFlags();
      }

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.ILV->addMetadata(V, &I);
    }

    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = (I.getOpcode() == Instruction::FCmp);
    auto *Cmp = cast<CmpInst>(&I);
    State.ILV->setDebugLocFromInst(Cmp);
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        Builder.setFastMathFlags(Cmp->getFastMathFlags());
        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.ILV->addMetadata(C, &I);
    }

    break;
  }

  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto *CI = cast<CastInst>(&I);
    State.ILV->setDebugLocFromInst(CI);

    /// Vectorize casts.
    Type *DestTy = (State.VF.isScalar())
                       ? CI->getType()
                       : VectorType::get(CI->getType(), State.VF);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
      State.set(this, Cast, Part);
      State.ILV->addMetadata(Cast, &I);
    }
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.
}

void VPWidenGEPRecipe::execute(VPTransformState &State) {
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    auto *Clone = State.Builder.Insert(GEP->clone());
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
      State.set(this, EntryPart, Part);
      State.ILV->addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = IsPtrLoopInvariant
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (IsIndexLoopInvariant[I - 1])
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // If the GEP instruction is vectorized and was in a basic block that
      // needed predication, we can't propagate the poison-generating 'inbounds'
      // flag. The control flow has been linearized and the GEP is no longer
      // guarded by the predicate, which could make the 'inbounds' properties to
      // no longer hold.
      bool IsInBounds =
          GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = IsInBounds
                         ? State.Builder.CreateInBoundsGEP(
                               GEP->getSourceElementType(), Ptr, Indices)
                         : State.Builder.CreateGEP(GEP->getSourceElementType(),
                                                   Ptr, Indices);
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.ILV->addMetadata(NewGEP, GEP);
    }
  }
}

// Widening an int/FP induction is delegated to the InnerLoopVectorizer; a
// replicated (per-instance) execution is never expected here.
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");
  State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(),
                                   getStartValue()->getLiveInIRValue(),
                                   getTruncInst(), getVPValue(0), State);
}

// Widening of general header phis is delegated to the InnerLoopVectorizer.
void VPWidenPHIRecipe::execute(VPTransformState &State) {
  State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
                                 State);
}

/// Lower a blend (a non-header phi) into a chain of selects keyed by the
/// incoming edge masks.
void VPBlendRecipe::execute(VPTransformState &State) {
  State.ILV->setDebugLocFromInst(Phi, &State.Builder);
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi and
  // are essentially undef are taken from In0.
  InnerLoopVectorizer::VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 = State.get(getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(getMask(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.set(this, Entry[Part], Part);
}

// Interleave-group code generation is delegated to the InnerLoopVectorizer.
void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
                                      getStoredValues(), getMask());
}

/// Emit the per-part reduction operations. For ordered reductions the chain
/// is threaded serially through the parts; otherwise each part reduces its
/// vector operand independently and combines with that part's chain value.
void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Reduction being replicated.");
  Value *PrevInChain = State.get(getChainOp(), 0);
  RecurKind Kind = RdxDesc->getRecurrenceKind();
  bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewVecOp = State.get(getVecOp(), Part);
    if (VPValue *Cond = getCondOp()) {
      // Predicated reduction: replace masked-off lanes with the recurrence
      // identity so they do not affect the result.
      Value *NewCond = State.get(Cond, Part);
      VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
      Value *Iden = RdxDesc->getRecurrenceIdentity(
          Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
      Value *IdenVec =
          State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
      NewVecOp = Select;
    }
    Value *NewRed;
    Value *NextInChain;
    if (IsOrdered) {
      if (State.VF.isVector())
        NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
                                        PrevInChain);
      else
        NewRed = State.Builder.CreateBinOp(
            (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
            NewVecOp);
      PrevInChain = NewRed;
    } else {
      // Unordered: each part re-reads its own chain value.
      PrevInChain =
          State.get(getChainOp(), Part);
      NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      // Min/max chains combine via a compare+select rather than a binop.
      NextInChain =
          createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
                         NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
          PrevInChain);
    State.set(this, NextInChain, Part);
  }
}

/// Generate scalar (replicated) instances of the underlying instruction:
/// either the one instance requested via State.Instance, or all lanes of all
/// unroll parts (first lane only when the value is uniform).
void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
    State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
                                    IsPredicated, State);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF.isVector()) {
      // If we're constructing lane 0, initialize to start from poison.
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.set(this, Poison, State.Instance->Part);
      }
      State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform in which case generate only the first lane for each
  // of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
                                      VPIteration(Part, Lane), IsPredicated,
                                      State);
}

/// Materialize a conditional branch for a predicated block from the block-in
/// mask, extracting the lane-specific bit when the mask is a vector.
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

/// Create the phi that merges the predicated instruction's value with the
/// value flowing around the predicated block — a vector phi when the operand
/// already has a packed vector value, otherwise a scalar phi.
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}

/// Emit wide loads/stores for this memory recipe: gathers/scatters when the
/// access is non-consecutive, otherwise (possibly masked, possibly reversed)
/// wide loads/stores per unroll part.
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;

  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
  StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);

  assert((LI || SI) && "Invalid Load/Store instruction");
  assert((!SI || StoredValue) && "No stored value provided for widened store");
  assert((!LI || !StoredValue) && "Stored value provided for widened load");

  Type *ScalarDataTy = getLoadStoreType(&Ingredient);

  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
  const Align Alignment = getLoadStoreAlignment(&Ingredient);
  // Non-consecutive accesses are emitted as gathers/scatters.
  bool CreateGatherScatter = !Consecutive;

  auto &Builder = State.Builder;
  InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
  bool isMaskRequired = getMask();
  if (isMaskRequired)
    for (unsigned Part = 0; Part < State.UF; ++Part)
      BlockInMaskParts[Part] = State.get(getMask(), Part);

  // Compute the part-specific pointer (bitcast to a vector pointer) for a
  // consecutive access; also reverses the mask for reversed accesses.
  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
    // Calculate the pointer for the specific unroll-part.
    GetElementPtrInst *PartPtr = nullptr;

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
      InBounds = gep->isInBounds();
    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
      Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
      // NumElt = -Part * RunTimeVF
      Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
      // LastLane = 1 - RunTimeVF
      Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
      PartPtr =
          cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
      PartPtr->setIsInBounds(InBounds);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
      PartPtr->setIsInBounds(InBounds);
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
        BlockInMaskParts[Part] =
            Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
    } else {
      Value *Increment =
          createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
      PartPtr->setIsInBounds(InBounds);
    }

    unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
  };

  // Handle Stores:
  if (SI) {
    State.ILV->setDebugLocFromInst(SI);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Instruction *NewSI = nullptr;
      Value *StoredVal = State.get(StoredValue, Part);
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
        Value *VectorGep = State.get(getAddr(), Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                            MaskPart);
      } else {
        if (Reverse) {
          // If we store to reverse consecutive memory locations, then we need
          // to reverse the order of elements in the stored value.
          StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
          // We don't want to update the value in the map as it might be used in
          // another expression. So don't call resetVectorValue(StoredVal).
        }
        auto *VecPtr =
            CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
        if (isMaskRequired)
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                            BlockInMaskParts[Part]);
        else
          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
      }
      State.ILV->addMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  State.ILV->setDebugLocFromInst(LI);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewLI;
    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
      Value *VectorGep = State.get(getAddr(), Part);
      NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
                                         nullptr, "wide.masked.gather");
      State.ILV->addMetadata(NewLI, LI);
    } else {
      auto *VecPtr =
          CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
      if (isMaskRequired)
        NewLI = Builder.CreateMaskedLoad(
            DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
            PoisonValue::get(DataTy), "wide.masked.load");
      else
        NewLI =
            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");

      // Add metadata to the load, but setVectorValue to the reverse shuffle.
      State.ILV->addMetadata(NewLI, LI);
      if (Reverse)
        NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
    }

    State.set(getVPSingleValue(), NewLI, Part);
  }
}

// Determine how to lower the scalar epilogue, which depends on 1) optimising
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyses whether the loop is suitable
// for predication.
static ScalarEpilogueLowering getScalarEpilogueLowering(
    Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
    BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
    AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
    LoopVectorizationLegality &LVL) {
  // 1) OptSize takes precedence over all other options, i.e. if this is set,
  // don't look at hints or options, and don't request a scalar epilogue.
  // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
  // LoopAccessInfo (due to code dependency and not being able to reliably get
  // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
  // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
  // versioning when the vectorization is forced, unlike hasOptSize. So revert
  // back to the old way and vectorize with versioning when forced. See D81345.)
10030 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10031 PGSOQueryType::IRPass) && 10032 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10033 return CM_ScalarEpilogueNotAllowedOptSize; 10034 10035 // 2) If set, obey the directives 10036 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10037 switch (PreferPredicateOverEpilogue) { 10038 case PreferPredicateTy::ScalarEpilogue: 10039 return CM_ScalarEpilogueAllowed; 10040 case PreferPredicateTy::PredicateElseScalarEpilogue: 10041 return CM_ScalarEpilogueNotNeededUsePredicate; 10042 case PreferPredicateTy::PredicateOrDontVectorize: 10043 return CM_ScalarEpilogueNotAllowedUsePredicate; 10044 }; 10045 } 10046 10047 // 3) If set, obey the hints 10048 switch (Hints.getPredicate()) { 10049 case LoopVectorizeHints::FK_Enabled: 10050 return CM_ScalarEpilogueNotNeededUsePredicate; 10051 case LoopVectorizeHints::FK_Disabled: 10052 return CM_ScalarEpilogueAllowed; 10053 }; 10054 10055 // 4) if the TTI hook indicates this is profitable, request predication. 10056 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10057 LVL.getLAI())) 10058 return CM_ScalarEpilogueNotNeededUsePredicate; 10059 10060 return CM_ScalarEpilogueAllowed; 10061 } 10062 10063 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10064 // If Values have been set for this Def return the one relevant for \p Part. 10065 if (hasVectorValue(Def, Part)) 10066 return Data.PerPartOutput[Def][Part]; 10067 10068 if (!hasScalarValue(Def, {Part, 0})) { 10069 Value *IRV = Def->getLiveInIRValue(); 10070 Value *B = ILV->getBroadcastInstrs(IRV); 10071 set(Def, B, Part); 10072 return B; 10073 } 10074 10075 Value *ScalarValue = get(Def, {Part, 0}); 10076 // If we aren't vectorizing, we can just copy the scalar map values over 10077 // to the vector map. 
10078 if (VF.isScalar()) { 10079 set(Def, ScalarValue, Part); 10080 return ScalarValue; 10081 } 10082 10083 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10084 bool IsUniform = RepR && RepR->isUniform(); 10085 10086 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 10087 // Check if there is a scalar value for the selected lane. 10088 if (!hasScalarValue(Def, {Part, LastLane})) { 10089 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 10090 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 10091 "unexpected recipe found to be invariant"); 10092 IsUniform = true; 10093 LastLane = 0; 10094 } 10095 10096 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10097 // Set the insert point after the last scalarized instruction or after the 10098 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10099 // will directly follow the scalar definitions. 10100 auto OldIP = Builder.saveIP(); 10101 auto NewIP = 10102 isa<PHINode>(LastInst) 10103 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10104 : std::next(BasicBlock::iterator(LastInst)); 10105 Builder.SetInsertPoint(&*NewIP); 10106 10107 // However, if we are vectorizing, we need to construct the vector values. 10108 // If the value is known to be uniform after vectorization, we can just 10109 // broadcast the scalar value corresponding to lane zero for each unroll 10110 // iteration. Otherwise, we construct the vector values using 10111 // insertelement instructions. Since the resulting vectors are stored in 10112 // State, we will only generate the insertelements once. 10113 Value *VectorValue = nullptr; 10114 if (IsUniform) { 10115 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10116 set(Def, VectorValue, Part); 10117 } else { 10118 // Initialize packing with insertelements to start from undef. 
10119 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10120 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10121 set(Def, Undef, Part); 10122 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10123 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10124 VectorValue = get(Def, Part); 10125 } 10126 Builder.restoreIP(OldIP); 10127 return VectorValue; 10128 } 10129 10130 // Process the loop in the VPlan-native vectorization path. This path builds 10131 // VPlan upfront in the vectorization pipeline, which allows to apply 10132 // VPlan-to-VPlan transformations from the very beginning without modifying the 10133 // input LLVM IR. 10134 static bool processLoopInVPlanNativePath( 10135 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10136 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10137 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10138 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10139 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10140 LoopVectorizationRequirements &Requirements) { 10141 10142 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10143 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10144 return false; 10145 } 10146 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10147 Function *F = L->getHeader()->getParent(); 10148 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10149 10150 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10151 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10152 10153 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10154 &Hints, IAI); 10155 // Use the planner for outer loop vectorization. 10156 // TODO: CM is not used at this point inside the planner. Turn CM into an 10157 // optional argument if we don't need it in the future. 
10158 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10159 Requirements, ORE); 10160 10161 // Get user vectorization factor. 10162 ElementCount UserVF = Hints.getWidth(); 10163 10164 CM.collectElementTypesForWidening(); 10165 10166 // Plan how to best vectorize, return the best VF and its cost. 10167 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10168 10169 // If we are stress testing VPlan builds, do not attempt to generate vector 10170 // code. Masked vector code generation support will follow soon. 10171 // Also, do not attempt to vectorize if no vector code will be produced. 10172 if (VPlanBuildStressTest || EnableVPlanPredication || 10173 VectorizationFactor::Disabled() == VF) 10174 return false; 10175 10176 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10177 10178 { 10179 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10180 F->getParent()->getDataLayout()); 10181 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10182 &CM, BFI, PSI, Checks); 10183 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10184 << L->getHeader()->getParent()->getName() << "\"\n"); 10185 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); 10186 } 10187 10188 // Mark the loop as already vectorized to avoid vectorizing again. 10189 Hints.setAlreadyVectorized(); 10190 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10191 return true; 10192 } 10193 10194 // Emit a remark if there are stores to floats that required a floating point 10195 // extension. If the vectorized loop was generated with floating point there 10196 // will be a performance penalty from the conversion overhead and the change in 10197 // the vector width. 
// Walk backwards from every float-typed store in \p L, and emit an analysis
// remark on each fpext feeding one — a mixed-precision up/down cast that will
// hurt vector performance.
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
  SmallVector<Instruction *, 4> Worklist;
  // Seed the worklist with every store of a float value in the loop.
  for (BasicBlock *BB : L->getBlocks()) {
    for (Instruction &Inst : *BB) {
      if (auto *S = dyn_cast<StoreInst>(&Inst)) {
        if (S->getValueOperand()->getType()->isFloatTy())
          Worklist.push_back(S);
      }
    }
  }

  // Traverse the floating point stores upwards searching, for floating point
  // conversions.
  SmallPtrSet<const Instruction *, 4> Visited;
  SmallPtrSet<const Instruction *, 4> EmittedRemark;
  while (!Worklist.empty()) {
    auto *I = Worklist.pop_back_val();
    if (!L->contains(I))
      continue;
    if (!Visited.insert(I).second)
      continue;

    // Emit a remark if the floating point store required a floating
    // point conversion.
    // TODO: More work could be done to identify the root cause such as a
    // constant or a function return type and point the user to it.
    if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
                                          I->getDebugLoc(), L->getHeader())
               << "floating point conversion changes vector width. "
               << "Mixed floating point precision requires an up/down "
               << "cast that will negatively impact performance.";
      });

    for (Use &Op : I->operands())
      if (auto *OpI = dyn_cast<Instruction>(Op))
        Worklist.push_back(OpI);
  }
}

// Command-line flags (EnableLoopInterleaving/EnableLoopVectorization) can
// globally restrict interleaving/vectorization to forced-only mode.
LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}

// Main per-loop driver: check legality, run the cost model and planner, then
// vectorize and/or interleave \p L. Returns true if the loop was transformed.
bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->isInnermost()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " interleave=" << Hints.getInterleave() << "\n");

  // Function containing loop
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements;
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints, Requirements);

  assert(L->isInnermost() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem possibly correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool AllowOrderedReductions;
  // If the flag is set, use that instead and override the TTI behaviour.
  if (ForceOrderedReductions.getNumOccurrences() > 0)
    AllowOrderedReductions = ForceOrderedReductions;
  else
    AllowOrderedReductions = TTI->enableOrderedReductions();
  if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out to not
    // be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector codegeneration is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not legal to vectorize the loop, then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);

      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
      LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *legal* to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
        LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
                        DT);
        ++LoopsVectorized;

        // Re-simplify and re-form LCSSA before the second (epilogue) pass.
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // Second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);

        VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
        LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
                        DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);

        VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling a scalar loop when there
        // are no runtime checks about strides and memory. A scalar loop that is
        // rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  // In +Asserts builds, verify the transformed function is still well-formed.
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

// Shared implementation for both pass managers: cache the analyses, simplify
// all loops, collect the supported inner loops, and vectorize each of them.
LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

// New-pass-manager entry point: gather analysis results, run the vectorizer,
// and report which analyses are preserved.
PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  // Lazily compute LoopAccessInfo per loop via the inner loop analysis
  // manager.
  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
                                      TLI, TTI, nullptr, nullptr, nullptr};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChanges is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
}

// Print this pass's textual pipeline representation, including the two
// forced-only option flags, in the standard "<...;...>" option syntax.
void LoopVectorizePass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
      OS, MapClassName2PassName);

  OS << "<";
  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
  OS << ">";
}