//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
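//
// Illustrative sketch (editorial note, not part of the original comment): with
// a vectorization factor of 4, a loop such as
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 42;
// is conceptually rewritten so that each iteration of the new loop processes
// four consecutive elements with vector instructions and increments i by 4,
// while any leftover iterations run in a scalar epilogue loop.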
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks with a " 204 "vectorize(enable) pragma.")); 205 206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 207 // that predication is preferred, and this lists all options. I.e., the 208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 209 // and predicate the instructions accordingly. 
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));
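
// Editorial usage sketch (assumes a recent `opt` built with this pass; the
// input file name is hypothetical): the VPlan-native path can be exercised
// with
//   opt -passes=loop-vectorize -enable-vplan-native-path -S input.ll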

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
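
// Editorial example (not from the upstream sources): on typical x86-64 data
// layouts, x86_fp80 has a type size of 80 bits but an alloc size of 128 bits,
// so an [N x x86_fp80] array contains padding and is not bitcast-compatible
// with <N x x86_fp80>; such a type is treated as irregular here.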

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPWidenRecipe *WidenRec,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask,
                                  bool ConsecutiveStride, bool Reverse);

  /// Set the debug location in the builder \p CustomBuilder using the debug
  /// location in \p V. If \p CustomBuilder is None then the class member's
  /// Builder is used.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilder<> *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  ///   (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *
  getStepVector(Value *Val, Value *StartIdx, Value *Step,
                Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd);
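
  // Editorial example (not from the upstream comment): with StartIdx == 0 and
  // Step == 2, a 4-element Val gets <0, 2, 4, 6> added to its lanes.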

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning up the checks if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(
      Value *Val, Value *StartIdx, Value *Step,
      Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to set up the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
  IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When a FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}
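
// Editorial note (illustrative, not from the upstream comments): for a fixed
// VF of 4 and Step == 2, the helper below folds to the constant 8; for a
// scalable VF such as <vscale x 4>, it emits code computing vscale * 8.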
/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
                              int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && cast<Operator>(Instr)->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
            Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
1289 CM_ScalarEpilogueNotAllowedLowTripLoop, 1290 1291 // Loop hint predicate indicating an epilogue is undesired. 1292 CM_ScalarEpilogueNotNeededUsePredicate, 1293 1294 // Directive indicating we must either tail fold or not vectorize 1295 CM_ScalarEpilogueNotAllowedUsePredicate 1296 }; 1297 1298 /// ElementCountComparator creates a total ordering for ElementCount 1299 /// for the purposes of using it in a set structure. 1300 struct ElementCountComparator { 1301 bool operator()(const ElementCount &LHS, const ElementCount &RHS) const { 1302 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < 1303 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); 1304 } 1305 }; 1306 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>; 1307 1308 /// LoopVectorizationCostModel - estimates the expected speedups due to 1309 /// vectorization. 1310 /// In many cases vectorization is not profitable. This can happen because of 1311 /// a number of reasons. In this class we mainly attempt to predict the 1312 /// expected speedup/slowdowns due to the supported instruction set. We use the 1313 /// TargetTransformInfo to query the different backends for the cost of 1314 /// different operations. 1315 class LoopVectorizationCostModel { 1316 public: 1317 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 1318 PredicatedScalarEvolution &PSE, LoopInfo *LI, 1319 LoopVectorizationLegality *Legal, 1320 const TargetTransformInfo &TTI, 1321 const TargetLibraryInfo *TLI, DemandedBits *DB, 1322 AssumptionCache *AC, 1323 OptimizationRemarkEmitter *ORE, const Function *F, 1324 const LoopVectorizeHints *Hints, 1325 InterleavedAccessInfo &IAI) 1326 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 1327 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 1328 Hints(Hints), InterleaveInfo(IAI) {} 1329 1330 /// \return An upper bound for the vectorization factors (both fixed and 1331 /// scalable). If the factors are 0, vectorization and interleaving should be 1332 /// avoided up front. 1333 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC); 1334 1335 /// \return True if runtime checks are required for vectorization, and false 1336 /// otherwise. 1337 bool runtimeChecksRequired(); 1338 1339 /// \return The most profitable vectorization factor and the cost of that VF. 1340 /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO 1341 /// then this vectorization factor will be selected if vectorization is 1342 /// possible. 1343 VectorizationFactor 1344 selectVectorizationFactor(const ElementCountSet &CandidateVFs); 1345 1346 VectorizationFactor 1347 selectEpilogueVectorizationFactor(const ElementCount MaxVF, 1348 const LoopVectorizationPlanner &LVP); 1349 1350 /// Setup cost-based decisions for user vectorization factor. 1351 /// \return true if the UserVF is a feasible VF to be chosen. 1352 bool selectUserVectorizationFactor(ElementCount UserVF) { 1353 collectUniformsAndScalars(UserVF); 1354 collectInstsToScalarize(UserVF); 1355 return expectedCost(UserVF).first.isValid(); 1356 } 1357 1358 /// \return The size (in bits) of the smallest and widest types in the code 1359 /// that needs to be vectorized. We ignore values that remain scalar such as 1360 /// 64 bit loop indices. 1361 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1362 1363 /// \return The desired interleave count. 1364 /// If interleave count has been specified by metadata it will be returned. 
1365 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1366 /// are the selected vectorization factor and the cost of the selected VF. 1367 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); 1368 1369 /// Memory access instruction may be vectorized in more than one way. 1370 /// Form of instruction after vectorization depends on cost. 1371 /// This function takes cost-based decisions for Load/Store instructions 1372 /// and collects them in a map. This decisions map is used for building 1373 /// the lists of loop-uniform and loop-scalar instructions. 1374 /// The calculated cost is saved with widening decision in order to 1375 /// avoid redundant calculations. 1376 void setCostBasedWideningDecision(ElementCount VF); 1377 1378 /// A struct that represents some properties of the register usage 1379 /// of a loop. 1380 struct RegisterUsage { 1381 /// Holds the number of loop invariant values that are used in the loop. 1382 /// The key is ClassID of target-provided register class. 1383 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1384 /// Holds the maximum number of concurrent live intervals in the loop. 1385 /// The key is ClassID of target-provided register class. 1386 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1387 }; 1388 1389 /// \return Returns information about the register usages of the loop for the 1390 /// given vectorization factors. 1391 SmallVector<RegisterUsage, 8> 1392 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1393 1394 /// Collect values we want to ignore in the cost model. 1395 void collectValuesToIgnore(); 1396 1397 /// Collect all element types in the loop for which widening is needed. 1398 void collectElementTypesForWidening(); 1399 1400 /// Split reductions into those that happen in the loop, and those that happen 1401 /// outside. In loop reductions are collected into InLoopReductionChains. 1402 void collectInLoopReductions(); 1403 1404 /// Returns true if we should use strict in-order reductions for the given 1405 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1406 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1407 /// of FP operations. 1408 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { 1409 return !Hints->allowReordering() && RdxDesc.isOrdered(); 1410 } 1411 1412 /// \returns The smallest bitwidth each instruction can be represented with. 1413 /// The vector equivalents of these instructions should be truncated to this 1414 /// type. 1415 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1416 return MinBWs; 1417 } 1418 1419 /// \returns True if it is more profitable to scalarize instruction \p I for 1420 /// vectorization factor \p VF. 1421 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1422 assert(VF.isVector() && 1423 "Profitable to scalarize relevant only for VF > 1."); 1424 1425 // Cost model is not run in the VPlan-native path - return conservative 1426 // result until this changes. 1427 if (EnableVPlanNativePath) 1428 return false; 1429 1430 auto Scalars = InstsToScalarize.find(VF); 1431 assert(Scalars != InstsToScalarize.end() && 1432 "VF not yet analyzed for scalarization profitability"); 1433 return Scalars->second.find(I) != Scalars->second.end(); 1434 } 1435 1436 /// Returns true if \p I is known to be uniform after vectorization. 
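  /// As an illustrative sketch (not tied to a particular test), the scalar
  /// address feeding a consecutive wide load is a typical uniform value:
  /// \code
  ///   for (int i = 0; i < n; ++i)
  ///     sum += a[i]; // the GEP producing &a[i] is uniform-after-vectorization
  ///                  // (one copy per vector iteration); the load is widened.
  /// \endcode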
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
1519 if (EnableVPlanNativePath) 1520 return CM_GatherScatter; 1521 1522 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1523 auto Itr = WideningDecisions.find(InstOnVF); 1524 if (Itr == WideningDecisions.end()) 1525 return CM_Unknown; 1526 return Itr->second.first; 1527 } 1528 1529 /// Return the vectorization cost for the given instruction \p I and vector 1530 /// width \p VF. 1531 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1532 assert(VF.isVector() && "Expected VF >=2"); 1533 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1534 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1535 "The cost is not calculated"); 1536 return WideningDecisions[InstOnVF].second; 1537 } 1538 1539 /// Return True if instruction \p I is an optimizable truncate whose operand 1540 /// is an induction variable. Such a truncate will be removed by adding a new 1541 /// induction variable with the destination type. 1542 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1543 // If the instruction is not a truncate, return false. 1544 auto *Trunc = dyn_cast<TruncInst>(I); 1545 if (!Trunc) 1546 return false; 1547 1548 // Get the source and destination types of the truncate. 1549 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1550 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1551 1552 // If the truncate is free for the given types, return false. Replacing a 1553 // free truncate with an induction variable would add an induction variable 1554 // update instruction to each iteration of the loop. We exclude from this 1555 // check the primary induction variable since it will need an update 1556 // instruction regardless. 1557 Value *Op = Trunc->getOperand(0); 1558 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1559 return false; 1560 1561 // If the truncated value is not an induction variable, return false. 1562 return Legal->isInductionPhi(Op); 1563 } 1564 1565 /// Collects the instructions to scalarize for each predicated instruction in 1566 /// the loop. 1567 void collectInstsToScalarize(ElementCount VF); 1568 1569 /// Collect Uniform and Scalar values for the given \p VF. 1570 /// The sets depend on CM decision for Load/Store instructions 1571 /// that may be vectorized as interleave, gather-scatter or scalarized. 1572 void collectUniformsAndScalars(ElementCount VF) { 1573 // Do the analysis once. 1574 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1575 return; 1576 setCostBasedWideningDecision(VF); 1577 collectLoopUniforms(VF); 1578 collectLoopScalars(VF); 1579 } 1580 1581 /// Returns true if the target machine supports masked store operation 1582 /// for the given \p DataType and kind of access to \p Ptr. 1583 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1584 return Legal->isConsecutivePtr(DataType, Ptr) && 1585 TTI.isLegalMaskedStore(DataType, Alignment); 1586 } 1587 1588 /// Returns true if the target machine supports masked load operation 1589 /// for the given \p DataType and kind of access to \p Ptr. 1590 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1591 return Legal->isConsecutivePtr(DataType, Ptr) && 1592 TTI.isLegalMaskedLoad(DataType, Alignment); 1593 } 1594 1595 /// Returns true if the target machine can represent \p V as a masked gather 1596 /// or scatter operation. 
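  /// For example (illustrative only), an indirect access such as
  /// \code
  ///   a[idx[i]] = x; // non-consecutive store
  /// \endcode
  /// can only be widened when the target supports a masked scatter for the
  /// stored type; otherwise it has to be scalarized.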
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getLoadStoreType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
           (SI && TTI.isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if the target machine supports all of the reduction
  /// variables found for the given VF.
  bool canVectorizeReductions(ElementCount VF) const {
    return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
      const RecurrenceDescriptor &RdxDesc = Reduction.second;
      return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
    }));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I) const;

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) {
    // When we know the load is uniform and the original scalar loop was not
    // predicated we don't need to mark it as a predicated instruction. Any
    // vectorized blocks created when tail-folding are something artificial we
    // have introduced and we know there is always at least one active lane.
    // That's why we call Legal->blockNeedsPredication here because it doesn't
    // query tail-folding.
    if (IsKnownUniform && isa<LoadInst>(I) &&
        !Legal->blockNeedsPredication(I->getParent()))
      return false;
    if (!blockNeedsPredicationForAnyReason(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
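  /// A common case (illustrative) is an interleave group with a gap, e.g. a
  /// loop that only reads two of three struct members:
  /// \code
  ///   for (int i = 0; i < n; ++i)
  ///     use(p[i].x, p[i].y); // the wide group load also covers the unused
  ///                          // member, so the last iterations stay scalar to
  ///                          // avoid reading past the end of the buffer.
  /// \endcode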
  bool requiresScalarEpilogue(ElementCount VF) const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  /// Returns true if the instructions in this block require predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
  bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
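  /// For instance (illustrative values only), a result equivalent to
  /// \code
  ///   FixedScalableVFPair(ElementCount::getFixed(8),
  ///                       ElementCount::getScalable(4))
  /// \endcode
  /// allows candidate VFs of up to 8 fixed lanes and up to <vscale x 4>
  /// scalable lanes to be considered by the cost model.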
  FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
                                           ElementCount UserVF);

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
  /// D98509). The issue is currently under investigation and this workaround
  /// will be removed as soon as possible.
  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       const ElementCount &MaxSafeVF);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width. If \p Invalid is not nullptr, this function
  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
  /// each instruction that has an Invalid cost for the given VF.
  using InstructionVFPair = std::pair<Instruction *, ElementCount>;
  VectorizationCostTy
  expectedCost(ElementCount VF,
               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  Optional<InstructionCost>
  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
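  /// Roughly (illustrative IR), a unit-stride access such as
  /// \code
  ///   %v = load i32, i32* %gep
  /// \endcode
  /// is costed as a single wide load, e.g. a load of <4 x i32> for VF = 4,
  /// plus a reverse shuffle when the decision is CM_Widen_Reverse.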
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of loop are to be masked to fold tail of scalar iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
1877 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1878 1879 /// PHINodes of the reductions that should be expanded in-loop along with 1880 /// their associated chains of reduction operations, in program order from top 1881 /// (PHI) to bottom 1882 ReductionChainMap InLoopReductionChains; 1883 1884 /// A Map of inloop reduction operations and their immediate chain operand. 1885 /// FIXME: This can be removed once reductions can be costed correctly in 1886 /// vplan. This was added to allow quick lookup to the inloop operations, 1887 /// without having to loop through InLoopReductionChains. 1888 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1889 1890 /// Returns the expected difference in cost from scalarizing the expression 1891 /// feeding a predicated instruction \p PredInst. The instructions to 1892 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1893 /// non-negative return value implies the expression will be scalarized. 1894 /// Currently, only single-use chains are considered for scalarization. 1895 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1896 ElementCount VF); 1897 1898 /// Collect the instructions that are uniform after vectorization. An 1899 /// instruction is uniform if we represent it with a single scalar value in 1900 /// the vectorized loop corresponding to each vector iteration. Examples of 1901 /// uniform instructions include pointer operands of consecutive or 1902 /// interleaved memory accesses. Note that although uniformity implies an 1903 /// instruction will be scalar, the reverse is not true. In general, a 1904 /// scalarized instruction will be represented by VF scalar values in the 1905 /// vectorized loop, each corresponding to an iteration of the original 1906 /// scalar loop. 1907 void collectLoopUniforms(ElementCount VF); 1908 1909 /// Collect the instructions that are scalar after vectorization. An 1910 /// instruction is scalar if it is known to be uniform or will be scalarized 1911 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1912 /// to the list if they are used by a load/store instruction that is marked as 1913 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1914 /// VF values in the vectorized loop, each corresponding to an iteration of 1915 /// the original scalar loop. 1916 void collectLoopScalars(ElementCount VF); 1917 1918 /// Keeps cost model vectorization decision and cost for instructions. 1919 /// Right now it is used for memory instructions only. 1920 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1921 std::pair<InstWidening, InstructionCost>>; 1922 1923 DecisionList WideningDecisions; 1924 1925 /// Returns true if \p V is expected to be vectorized and it needs to be 1926 /// extracted. 1927 bool needsExtract(Value *V, ElementCount VF) const { 1928 Instruction *I = dyn_cast<Instruction>(V); 1929 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1930 TheLoop->isLoopInvariant(I)) 1931 return false; 1932 1933 // Assume we can vectorize V (and hence we need extraction) if the 1934 // scalars are not computed yet. This can happen, because it is called 1935 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1936 // the scalars are collected. That should be a safe assumption in most 1937 // cases, because we check if the operands have vectorizable types 1938 // beforehand in LoopVectorizationLegality. 
1939 return Scalars.find(VF) == Scalars.end() || 1940 !isScalarAfterVectorization(I, VF); 1941 }; 1942 1943 /// Returns a range containing only operands needing to be extracted. 1944 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1945 ElementCount VF) const { 1946 return SmallVector<Value *, 4>(make_filter_range( 1947 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1948 } 1949 1950 /// Determines if we have the infrastructure to vectorize loop \p L and its 1951 /// epilogue, assuming the main loop is vectorized by \p VF. 1952 bool isCandidateForEpilogueVectorization(const Loop &L, 1953 const ElementCount VF) const; 1954 1955 /// Returns true if epilogue vectorization is considered profitable, and 1956 /// false otherwise. 1957 /// \p VF is the vectorization factor chosen for the original loop. 1958 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1959 1960 public: 1961 /// The loop that we evaluate. 1962 Loop *TheLoop; 1963 1964 /// Predicated scalar evolution analysis. 1965 PredicatedScalarEvolution &PSE; 1966 1967 /// Loop Info analysis. 1968 LoopInfo *LI; 1969 1970 /// Vectorization legality. 1971 LoopVectorizationLegality *Legal; 1972 1973 /// Vector target information. 1974 const TargetTransformInfo &TTI; 1975 1976 /// Target Library Info. 1977 const TargetLibraryInfo *TLI; 1978 1979 /// Demanded bits analysis. 1980 DemandedBits *DB; 1981 1982 /// Assumption cache. 1983 AssumptionCache *AC; 1984 1985 /// Interface to emit optimization remarks. 1986 OptimizationRemarkEmitter *ORE; 1987 1988 const Function *TheFunction; 1989 1990 /// Loop Vectorize Hint. 1991 const LoopVectorizeHints *Hints; 1992 1993 /// The interleave access information contains groups of interleaved accesses 1994 /// with the same stride and close to each other. 1995 InterleavedAccessInfo &InterleaveInfo; 1996 1997 /// Values to ignore in the cost model. 1998 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1999 2000 /// Values to ignore in the cost model when VF > 1. 2001 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 2002 2003 /// All element types found in the loop. 2004 SmallPtrSet<Type *, 16> ElementTypesInLoop; 2005 2006 /// Profitable vector factors. 2007 SmallVector<VectorizationFactor, 8> ProfitableVFs; 2008 }; 2009 } // end namespace llvm 2010 2011 /// Helper struct to manage generating runtime checks for vectorization. 2012 /// 2013 /// The runtime checks are created up-front in temporary blocks to allow better 2014 /// estimating the cost and un-linked from the existing IR. After deciding to 2015 /// vectorize, the checks are moved back. If deciding not to vectorize, the 2016 /// temporary blocks are completely removed. 2017 class GeneratedRTChecks { 2018 /// Basic block which contains the generated SCEV checks, if any. 2019 BasicBlock *SCEVCheckBlock = nullptr; 2020 2021 /// The value representing the result of the generated SCEV checks. If it is 2022 /// nullptr, either no SCEV checks have been generated or they have been used. 2023 Value *SCEVCheckCond = nullptr; 2024 2025 /// Basic block which contains the generated memory runtime checks, if any. 2026 BasicBlock *MemCheckBlock = nullptr; 2027 2028 /// The value representing the result of the generated memory runtime checks. 2029 /// If it is nullptr, either no memory runtime checks have been generated or 2030 /// they have been used. 
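  /// Conceptually (simplified sketch), the condition is true when two checked
  /// pointer ranges [AStart, AEnd) and [BStart, BEnd) may overlap:
  /// \code
  ///   %found.conflict = and i1 (AStart < BEnd), (BStart < AEnd)
  /// \endcode
  /// so the vector loop is only entered when every such condition is false.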
  Value *MemRuntimeCheckCond = nullptr;

  DominatorTree *DT;
  LoopInfo *LI;

  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

public:
  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                    const DataLayout &DL)
      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check") {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and added back during vector code generation. If
  /// there is no vector code generation, the check blocks are removed
  /// completely.
  void Create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVUnionPredicate &UnionPred) {

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!UnionPred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
                                  nullptr, "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          &UnionPred, SCEVCheckBlock->getTerminator());
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
                                 "vector.memcheck");

      MemRuntimeCheckCond =
          addRuntimeChecks(MemCheckBlock->getTerminator(), L,
                           RtPtrChecking.getChecks(), MemCheckExp);
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary blocks with the checks, update various places
    // accordingly.
    if (SCEVCheckBlock)
      SCEVCheckBlock->replaceAllUsesWith(Preheader);
    if (MemCheckBlock)
      MemCheckBlock->replaceAllUsesWith(Preheader);

    if (SCEVCheckBlock) {
      SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }
    if (MemCheckBlock) {
      MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), MemCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }

    DT->changeImmediateDominator(LoopHeader, Preheader);
    if (MemCheckBlock) {
      DT->eraseNode(MemCheckBlock);
      LI->removeBlock(MemCheckBlock);
    }
    if (SCEVCheckBlock) {
      DT->eraseNode(SCEVCheckBlock);
      LI->removeBlock(SCEVCheckBlock);
    }
  }

  /// Remove the created SCEV & memory runtime check blocks & instructions, if
  /// unused.
2116 ~GeneratedRTChecks() { 2117 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 2118 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 2119 if (!SCEVCheckCond) 2120 SCEVCleaner.markResultUsed(); 2121 2122 if (!MemRuntimeCheckCond) 2123 MemCheckCleaner.markResultUsed(); 2124 2125 if (MemRuntimeCheckCond) { 2126 auto &SE = *MemCheckExp.getSE(); 2127 // Memory runtime check generation creates compares that use expanded 2128 // values. Remove them before running the SCEVExpanderCleaners. 2129 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2130 if (MemCheckExp.isInsertedInstruction(&I)) 2131 continue; 2132 SE.forgetValue(&I); 2133 I.eraseFromParent(); 2134 } 2135 } 2136 MemCheckCleaner.cleanup(); 2137 SCEVCleaner.cleanup(); 2138 2139 if (SCEVCheckCond) 2140 SCEVCheckBlock->eraseFromParent(); 2141 if (MemRuntimeCheckCond) 2142 MemCheckBlock->eraseFromParent(); 2143 } 2144 2145 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2146 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2147 /// depending on the generated condition. 2148 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2149 BasicBlock *LoopVectorPreHeader, 2150 BasicBlock *LoopExitBlock) { 2151 if (!SCEVCheckCond) 2152 return nullptr; 2153 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2154 if (C->isZero()) 2155 return nullptr; 2156 2157 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2158 2159 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2160 // Create new preheader for vector loop. 2161 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2162 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2163 2164 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2165 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2166 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2167 SCEVCheckBlock); 2168 2169 DT->addNewBlock(SCEVCheckBlock, Pred); 2170 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2171 2172 ReplaceInstWithInst( 2173 SCEVCheckBlock->getTerminator(), 2174 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2175 // Mark the check as used, to prevent it from being removed during cleanup. 2176 SCEVCheckCond = nullptr; 2177 return SCEVCheckBlock; 2178 } 2179 2180 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2181 /// the branches to branch to the vector preheader or \p Bypass, depending on 2182 /// the generated condition. 2183 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2184 BasicBlock *LoopVectorPreHeader) { 2185 // Check if we generated code that checks in runtime if arrays overlap. 2186 if (!MemRuntimeCheckCond) 2187 return nullptr; 2188 2189 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2190 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2191 MemCheckBlock); 2192 2193 DT->addNewBlock(MemCheckBlock, Pred); 2194 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2195 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2196 2197 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2198 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2199 2200 ReplaceInstWithInst( 2201 MemCheckBlock->getTerminator(), 2202 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2203 MemCheckBlock->getTerminator()->setDebugLoc( 2204 Pred->getTerminator()->getDebugLoc()); 2205 2206 // Mark the check as used, to prevent it from being removed during cleanup. 
2207 MemRuntimeCheckCond = nullptr; 2208 return MemCheckBlock; 2209 } 2210 }; 2211 2212 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2213 // vectorization. The loop needs to be annotated with #pragma omp simd 2214 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2215 // vector length information is not provided, vectorization is not considered 2216 // explicit. Interleave hints are not allowed either. These limitations will be 2217 // relaxed in the future. 2218 // Please, note that we are currently forced to abuse the pragma 'clang 2219 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2220 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2221 // provides *explicit vectorization hints* (LV can bypass legal checks and 2222 // assume that vectorization is legal). However, both hints are implemented 2223 // using the same metadata (llvm.loop.vectorize, processed by 2224 // LoopVectorizeHints). This will be fixed in the future when the native IR 2225 // representation for pragma 'omp simd' is introduced. 2226 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2227 OptimizationRemarkEmitter *ORE) { 2228 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2229 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2230 2231 // Only outer loops with an explicit vectorization hint are supported. 2232 // Unannotated outer loops are ignored. 2233 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2234 return false; 2235 2236 Function *Fn = OuterLp->getHeader()->getParent(); 2237 if (!Hints.allowVectorization(Fn, OuterLp, 2238 true /*VectorizeOnlyWhenForced*/)) { 2239 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2240 return false; 2241 } 2242 2243 if (Hints.getInterleave() > 1) { 2244 // TODO: Interleave support is future work. 2245 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2246 "outer loops.\n"); 2247 Hints.emitRemarkWithHints(); 2248 return false; 2249 } 2250 2251 return true; 2252 } 2253 2254 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2255 OptimizationRemarkEmitter *ORE, 2256 SmallVectorImpl<Loop *> &V) { 2257 // Collect inner loops and outer loops without irreducible control flow. For 2258 // now, only collect outer loops that have explicit vectorization hints. If we 2259 // are stress testing the VPlan H-CFG construction, we collect the outermost 2260 // loop of every loop nest. 2261 if (L.isInnermost() || VPlanBuildStressTest || 2262 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2263 LoopBlocksRPO RPOT(&L); 2264 RPOT.perform(LI); 2265 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2266 V.push_back(&L); 2267 // TODO: Collect inner loops inside marked outer loops in case 2268 // vectorization fails for the outer loop. Do not invoke 2269 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2270 // already known to be reducible. We can use an inherited attribute for 2271 // that. 2272 return; 2273 } 2274 } 2275 for (Loop *InnerL : L) 2276 collectSupportedLoops(*InnerL, LI, ORE, V); 2277 } 2278 2279 namespace { 2280 2281 /// The LoopVectorize Pass. 
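/// Legacy pass manager wrapper around LoopVectorizePass. As a usage sketch,
/// the pass can be exercised directly through opt, e.g.:
/// \code
///   opt -loop-vectorize -force-vector-width=4 -S input.ll
/// \endcode
/// (with the new pass manager, -passes=loop-vectorize is the equivalent).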
2282 struct LoopVectorize : public FunctionPass { 2283 /// Pass identification, replacement for typeid 2284 static char ID; 2285 2286 LoopVectorizePass Impl; 2287 2288 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2289 bool VectorizeOnlyWhenForced = false) 2290 : FunctionPass(ID), 2291 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2292 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2293 } 2294 2295 bool runOnFunction(Function &F) override { 2296 if (skipFunction(F)) 2297 return false; 2298 2299 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2300 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2301 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2302 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2303 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2304 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2305 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2306 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2307 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2308 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2309 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2310 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2311 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2312 2313 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2314 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2315 2316 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2317 GetLAA, *ORE, PSI).MadeAnyChange; 2318 } 2319 2320 void getAnalysisUsage(AnalysisUsage &AU) const override { 2321 AU.addRequired<AssumptionCacheTracker>(); 2322 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2323 AU.addRequired<DominatorTreeWrapperPass>(); 2324 AU.addRequired<LoopInfoWrapperPass>(); 2325 AU.addRequired<ScalarEvolutionWrapperPass>(); 2326 AU.addRequired<TargetTransformInfoWrapperPass>(); 2327 AU.addRequired<AAResultsWrapperPass>(); 2328 AU.addRequired<LoopAccessLegacyAnalysis>(); 2329 AU.addRequired<DemandedBitsWrapperPass>(); 2330 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2331 AU.addRequired<InjectTLIMappingsLegacy>(); 2332 2333 // We currently do not preserve loopinfo/dominator analyses with outer loop 2334 // vectorization. Until this is addressed, mark these analyses as preserved 2335 // only for non-VPlan-native path. 2336 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2337 if (!EnableVPlanNativePath) { 2338 AU.addPreserved<LoopInfoWrapperPass>(); 2339 AU.addPreserved<DominatorTreeWrapperPass>(); 2340 } 2341 2342 AU.addPreserved<BasicAAWrapperPass>(); 2343 AU.addPreserved<GlobalsAAWrapperPass>(); 2344 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2345 } 2346 }; 2347 2348 } // end anonymous namespace 2349 2350 //===----------------------------------------------------------------------===// 2351 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2352 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2353 //===----------------------------------------------------------------------===// 2354 2355 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2356 // We need to place the broadcast of invariant variables outside the loop, 2357 // but only if it's proven safe to do so. Else, broadcast will be inside 2358 // vector loop body. 
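  // As a rough sketch of the emitted IR (exact names may differ), broadcasting
  // a loop-invariant i32 %x at VF = 4 produces:
  //   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                                    <4 x i32> poison, <4 x i32> zeroinitializer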
2359 Instruction *Instr = dyn_cast<Instruction>(V); 2360 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2361 (!Instr || 2362 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2363 // Place the code for broadcasting invariant variables in the new preheader. 2364 IRBuilder<>::InsertPointGuard Guard(Builder); 2365 if (SafeToHoist) 2366 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2367 2368 // Broadcast the scalar into all locations in the vector. 2369 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2370 2371 return Shuf; 2372 } 2373 2374 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2375 const InductionDescriptor &II, Value *Step, Value *Start, 2376 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2377 VPTransformState &State) { 2378 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2379 "Expected either an induction phi-node or a truncate of it!"); 2380 2381 // Construct the initial value of the vector IV in the vector loop preheader 2382 auto CurrIP = Builder.saveIP(); 2383 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2384 if (isa<TruncInst>(EntryVal)) { 2385 assert(Start->getType()->isIntegerTy() && 2386 "Truncation requires an integer type"); 2387 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2388 Step = Builder.CreateTrunc(Step, TruncType); 2389 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2390 } 2391 2392 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 2393 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2394 Value *SteppedStart = 2395 getStepVector(SplatStart, Zero, Step, II.getInductionOpcode()); 2396 2397 // We create vector phi nodes for both integer and floating-point induction 2398 // variables. Here, we determine the kind of arithmetic we will perform. 2399 Instruction::BinaryOps AddOp; 2400 Instruction::BinaryOps MulOp; 2401 if (Step->getType()->isIntegerTy()) { 2402 AddOp = Instruction::Add; 2403 MulOp = Instruction::Mul; 2404 } else { 2405 AddOp = II.getInductionOpcode(); 2406 MulOp = Instruction::FMul; 2407 } 2408 2409 // Multiply the vectorization factor by the step using integer or 2410 // floating-point arithmetic as appropriate. 2411 Type *StepType = Step->getType(); 2412 Value *RuntimeVF; 2413 if (Step->getType()->isFloatingPointTy()) 2414 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, VF); 2415 else 2416 RuntimeVF = getRuntimeVF(Builder, StepType, VF); 2417 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2418 2419 // Create a vector splat to use in the induction update. 2420 // 2421 // FIXME: If the step is non-constant, we create the vector splat with 2422 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2423 // handle a constant vector splat. 2424 Value *SplatVF = isa<Constant>(Mul) 2425 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2426 : Builder.CreateVectorSplat(VF, Mul); 2427 Builder.restoreIP(CurrIP); 2428 2429 // We may need to add the step a number of times, depending on the unroll 2430 // factor. The last of those goes into the PHI. 
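  // For example (illustrative, UF = 2, VF = 4, integer IV starting at 0 with
  // step 1), the generated loop body looks roughly like:
  //   %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %ph ], ...
  //   %step.add     = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  //   %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>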
2431 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2432 &*LoopVectorBody->getFirstInsertionPt()); 2433 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2434 Instruction *LastInduction = VecInd; 2435 for (unsigned Part = 0; Part < UF; ++Part) { 2436 State.set(Def, LastInduction, Part); 2437 2438 if (isa<TruncInst>(EntryVal)) 2439 addMetadata(LastInduction, EntryVal); 2440 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2441 State, Part); 2442 2443 LastInduction = cast<Instruction>( 2444 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2445 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2446 } 2447 2448 // Move the last step to the end of the latch block. This ensures consistent 2449 // placement of all induction updates. 2450 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2451 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2452 auto *ICmp = cast<Instruction>(Br->getCondition()); 2453 LastInduction->moveBefore(ICmp); 2454 LastInduction->setName("vec.ind.next"); 2455 2456 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2457 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2458 } 2459 2460 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2461 return Cost->isScalarAfterVectorization(I, VF) || 2462 Cost->isProfitableToScalarize(I, VF); 2463 } 2464 2465 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2466 if (shouldScalarizeInstruction(IV)) 2467 return true; 2468 auto isScalarInst = [&](User *U) -> bool { 2469 auto *I = cast<Instruction>(U); 2470 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2471 }; 2472 return llvm::any_of(IV->users(), isScalarInst); 2473 } 2474 2475 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2476 const InductionDescriptor &ID, const Instruction *EntryVal, 2477 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2478 unsigned Part, unsigned Lane) { 2479 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2480 "Expected either an induction phi-node or a truncate of it!"); 2481 2482 // This induction variable is not the phi from the original loop but the 2483 // newly-created IV based on the proof that casted Phi is equal to the 2484 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2485 // re-uses the same InductionDescriptor that original IV uses but we don't 2486 // have to do any recording in this case - that is done when original IV is 2487 // processed. 2488 if (isa<TruncInst>(EntryVal)) 2489 return; 2490 2491 if (!CastDef) { 2492 assert(ID.getCastInsts().empty() && 2493 "there are casts for ID, but no CastDef"); 2494 return; 2495 } 2496 assert(!ID.getCastInsts().empty() && 2497 "there is a CastDef, but no casts for ID"); 2498 // Only the first Cast instruction in the Casts vector is of interest. 2499 // The rest of the Casts (if exist) have no uses outside the 2500 // induction update chain itself. 
2501 if (Lane < UINT_MAX) 2502 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2503 else 2504 State.set(CastDef, VectorLoopVal, Part); 2505 } 2506 2507 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2508 TruncInst *Trunc, VPValue *Def, 2509 VPValue *CastDef, 2510 VPTransformState &State) { 2511 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2512 "Primary induction variable must have an integer type"); 2513 2514 auto II = Legal->getInductionVars().find(IV); 2515 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2516 2517 auto ID = II->second; 2518 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2519 2520 // The value from the original loop to which we are mapping the new induction 2521 // variable. 2522 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2523 2524 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2525 2526 // Generate code for the induction step. Note that induction steps are 2527 // required to be loop-invariant 2528 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2529 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2530 "Induction step should be loop invariant"); 2531 if (PSE.getSE()->isSCEVable(IV->getType())) { 2532 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2533 return Exp.expandCodeFor(Step, Step->getType(), 2534 LoopVectorPreHeader->getTerminator()); 2535 } 2536 return cast<SCEVUnknown>(Step)->getValue(); 2537 }; 2538 2539 // The scalar value to broadcast. This is derived from the canonical 2540 // induction variable. If a truncation type is given, truncate the canonical 2541 // induction variable and step. Otherwise, derive these values from the 2542 // induction descriptor. 2543 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2544 Value *ScalarIV = Induction; 2545 if (IV != OldInduction) { 2546 ScalarIV = IV->getType()->isIntegerTy() 2547 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2548 : Builder.CreateCast(Instruction::SIToFP, Induction, 2549 IV->getType()); 2550 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2551 ScalarIV->setName("offset.idx"); 2552 } 2553 if (Trunc) { 2554 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2555 assert(Step->getType()->isIntegerTy() && 2556 "Truncation requires an integer step"); 2557 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2558 Step = Builder.CreateTrunc(Step, TruncType); 2559 } 2560 return ScalarIV; 2561 }; 2562 2563 // Create the vector values from the scalar IV, in the absence of creating a 2564 // vector IV. 2565 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2566 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2567 for (unsigned Part = 0; Part < UF; ++Part) { 2568 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2569 Value *StartIdx; 2570 if (Step->getType()->isFloatingPointTy()) 2571 StartIdx = getRuntimeVFAsFloat(Builder, Step->getType(), VF * Part); 2572 else 2573 StartIdx = getRuntimeVF(Builder, Step->getType(), VF * Part); 2574 2575 Value *EntryPart = 2576 getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode()); 2577 State.set(Def, EntryPart, Part); 2578 if (Trunc) 2579 addMetadata(EntryPart, Trunc); 2580 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2581 State, Part); 2582 } 2583 }; 2584 2585 // Fast-math-flags propagate from the original induction instruction. 
2586 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2587 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2588 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2589 2590 // Now do the actual transformations, and start with creating the step value. 2591 Value *Step = CreateStepValue(ID.getStep()); 2592 if (VF.isZero() || VF.isScalar()) { 2593 Value *ScalarIV = CreateScalarIV(Step); 2594 CreateSplatIV(ScalarIV, Step); 2595 return; 2596 } 2597 2598 // Determine if we want a scalar version of the induction variable. This is 2599 // true if the induction variable itself is not widened, or if it has at 2600 // least one user in the loop that is not widened. 2601 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2602 if (!NeedsScalarIV) { 2603 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2604 State); 2605 return; 2606 } 2607 2608 // Try to create a new independent vector induction variable. If we can't 2609 // create the phi node, we will splat the scalar induction variable in each 2610 // loop iteration. 2611 if (!shouldScalarizeInstruction(EntryVal)) { 2612 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2613 State); 2614 Value *ScalarIV = CreateScalarIV(Step); 2615 // Create scalar steps that can be used by instructions we will later 2616 // scalarize. Note that the addition of the scalar steps will not increase 2617 // the number of instructions in the loop in the common case prior to 2618 // InstCombine. We will be trading one vector extract for each scalar step. 2619 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2620 return; 2621 } 2622 2623 // All IV users are scalar instructions, so only emit a scalar IV, not a 2624 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2625 // predicate used by the masked loads/stores. 2626 Value *ScalarIV = CreateScalarIV(Step); 2627 if (!Cost->isScalarEpilogueAllowed()) 2628 CreateSplatIV(ScalarIV, Step); 2629 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2630 } 2631 2632 Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, 2633 Value *Step, 2634 Instruction::BinaryOps BinOp) { 2635 // Create and check the types. 2636 auto *ValVTy = cast<VectorType>(Val->getType()); 2637 ElementCount VLen = ValVTy->getElementCount(); 2638 2639 Type *STy = Val->getType()->getScalarType(); 2640 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2641 "Induction Step must be an integer or FP"); 2642 assert(Step->getType() == STy && "Step has wrong type"); 2643 2644 SmallVector<Constant *, 8> Indices; 2645 2646 // Create a vector of consecutive numbers from zero to VF. 2647 VectorType *InitVecValVTy = ValVTy; 2648 Type *InitVecValSTy = STy; 2649 if (STy->isFloatingPointTy()) { 2650 InitVecValSTy = 2651 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2652 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2653 } 2654 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2655 2656 // Splat the StartIdx 2657 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2658 2659 if (STy->isIntegerTy()) { 2660 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2661 Step = Builder.CreateVectorSplat(VLen, Step); 2662 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2663 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2664 // which can be found from the original scalar operations. 
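// For illustration only (assuming a fixed VF of 4, an integer induction,
// StartIdx 0 and Step 2), the multiply and add below compute
//   %induction = %Val + <i32 0, i32 2, i32 4, i32 6>
// i.e. the step vector <0,1,2,3> scaled by the splatted step and added to the
// broadcast scalar value.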
2665 Step = Builder.CreateMul(InitVec, Step); 2666 return Builder.CreateAdd(Val, Step, "induction"); 2667 } 2668 2669 // Floating point induction. 2670 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2671 "Binary Opcode should be specified for FP induction"); 2672 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2673 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2674 2675 Step = Builder.CreateVectorSplat(VLen, Step); 2676 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2677 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2678 } 2679 2680 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2681 Instruction *EntryVal, 2682 const InductionDescriptor &ID, 2683 VPValue *Def, VPValue *CastDef, 2684 VPTransformState &State) { 2685 // We shouldn't have to build scalar steps if we aren't vectorizing. 2686 assert(VF.isVector() && "VF should be greater than one"); 2687 // Get the value type and ensure it and the step have the same integer type. 2688 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2689 assert(ScalarIVTy == Step->getType() && 2690 "Val and Step should have the same type"); 2691 2692 // We build scalar steps for both integer and floating-point induction 2693 // variables. Here, we determine the kind of arithmetic we will perform. 2694 Instruction::BinaryOps AddOp; 2695 Instruction::BinaryOps MulOp; 2696 if (ScalarIVTy->isIntegerTy()) { 2697 AddOp = Instruction::Add; 2698 MulOp = Instruction::Mul; 2699 } else { 2700 AddOp = ID.getInductionOpcode(); 2701 MulOp = Instruction::FMul; 2702 } 2703 2704 // Determine the number of scalars we need to generate for each unroll 2705 // iteration. If EntryVal is uniform, we only need to generate the first 2706 // lane. Otherwise, we generate all VF values. 2707 bool IsUniform = 2708 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); 2709 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); 2710 // Compute the scalar steps and save the results in State. 2711 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2712 ScalarIVTy->getScalarSizeInBits()); 2713 Type *VecIVTy = nullptr; 2714 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2715 if (!IsUniform && VF.isScalable()) { 2716 VecIVTy = VectorType::get(ScalarIVTy, VF); 2717 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); 2718 SplatStep = Builder.CreateVectorSplat(VF, Step); 2719 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); 2720 } 2721 2722 for (unsigned Part = 0; Part < UF; ++Part) { 2723 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, VF, Part); 2724 2725 if (!IsUniform && VF.isScalable()) { 2726 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); 2727 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2728 if (ScalarIVTy->isFloatingPointTy()) 2729 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2730 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2731 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2732 State.set(Def, Add, Part); 2733 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2734 Part); 2735 // It's useful to record the lane values too for the known minimum number 2736 // of elements so we do those below. This improves the code quality when 2737 // trying to extract the first element, for example. 
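// (Sketch of the per-lane values produced by the loop below, assuming a fixed
// VF of 4 and UF of 2: lane L of part P receives ScalarIV + (P * 4 + L) * Step,
// so e.g. part 1, lane 2 gets ScalarIV + 6 * Step.)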
2738 } 2739 2740 if (ScalarIVTy->isFloatingPointTy()) 2741 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2742 2743 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2744 Value *StartIdx = Builder.CreateBinOp( 2745 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2746 // The step returned by `createStepForVF` is a runtime-evaluated value 2747 // when VF is scalable. Otherwise, it should be folded into a Constant. 2748 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2749 "Expected StartIdx to be folded to a constant when VF is not " 2750 "scalable"); 2751 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2752 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2753 State.set(Def, Add, VPIteration(Part, Lane)); 2754 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2755 Part, Lane); 2756 } 2757 } 2758 } 2759 2760 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2761 const VPIteration &Instance, 2762 VPTransformState &State) { 2763 Value *ScalarInst = State.get(Def, Instance); 2764 Value *VectorValue = State.get(Def, Instance.Part); 2765 VectorValue = Builder.CreateInsertElement( 2766 VectorValue, ScalarInst, 2767 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2768 State.set(Def, VectorValue, Instance.Part); 2769 } 2770 2771 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2772 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2773 return Builder.CreateVectorReverse(Vec, "reverse"); 2774 } 2775 2776 // Return whether we allow using masked interleave-groups (for dealing with 2777 // strided loads/stores that reside in predicated blocks, or for dealing 2778 // with gaps). 2779 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2780 // If an override option has been passed in for interleaved accesses, use it. 2781 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2782 return EnableMaskedInterleavedMemAccesses; 2783 2784 return TTI.enableMaskedInterleavedAccessVectorization(); 2785 } 2786 2787 // Try to vectorize the interleave group that \p Instr belongs to. 2788 // 2789 // E.g. Translate following interleaved load group (factor = 3): 2790 // for (i = 0; i < N; i+=3) { 2791 // R = Pic[i]; // Member of index 0 2792 // G = Pic[i+1]; // Member of index 1 2793 // B = Pic[i+2]; // Member of index 2 2794 // ... // do something to R, G, B 2795 // } 2796 // To: 2797 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2798 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2799 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2800 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2801 // 2802 // Or translate following interleaved store group (factor = 3): 2803 // for (i = 0; i < N; i+=3) { 2804 // ... 
do something to R, G, B 2805 // Pic[i] = R; // Member of index 0 2806 // Pic[i+1] = G; // Member of index 1 2807 // Pic[i+2] = B; // Member of index 2 2808 // } 2809 // To: 2810 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2811 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2812 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2813 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2814 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2815 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2816 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2817 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2818 VPValue *BlockInMask) { 2819 Instruction *Instr = Group->getInsertPos(); 2820 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2821 2822 // Prepare for the vector type of the interleaved load/store. 2823 Type *ScalarTy = getLoadStoreType(Instr); 2824 unsigned InterleaveFactor = Group->getFactor(); 2825 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2826 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2827 2828 // Prepare for the new pointers. 2829 SmallVector<Value *, 2> AddrParts; 2830 unsigned Index = Group->getIndex(Instr); 2831 2832 // TODO: extend the masked interleaved-group support to reversed access. 2833 assert((!BlockInMask || !Group->isReverse()) && 2834 "Reversed masked interleave-group not supported."); 2835 2836 // If the group is reverse, adjust the index to refer to the last vector lane 2837 // instead of the first. We adjust the index from the first vector lane, 2838 // rather than directly getting the pointer for lane VF - 1, because the 2839 // pointer operand of the interleaved access is supposed to be uniform. For 2840 // uniform instructions, we're only required to generate a value for the 2841 // first vector lane in each unroll iteration. 2842 if (Group->isReverse()) 2843 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2844 2845 for (unsigned Part = 0; Part < UF; Part++) { 2846 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2847 setDebugLocFromInst(AddrPart); 2848 2849 // Notice current instruction could be any index. Need to adjust the address 2850 // to the member of index 0. 2851 // 2852 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2853 // b = A[i]; // Member of index 0 2854 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2855 // 2856 // E.g. A[i+1] = a; // Member of index 1 2857 // A[i] = b; // Member of index 0 2858 // A[i+2] = c; // Member of index 2 (Current instruction) 2859 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2860 2861 bool InBounds = false; 2862 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2863 InBounds = gep->isInBounds(); 2864 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2865 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2866 2867 // Cast to the vector pointer type. 
2868 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2869 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2870 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2871 } 2872 2873 setDebugLocFromInst(Instr); 2874 Value *PoisonVec = PoisonValue::get(VecTy); 2875 2876 Value *MaskForGaps = nullptr; 2877 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2878 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2879 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2880 } 2881 2882 // Vectorize the interleaved load group. 2883 if (isa<LoadInst>(Instr)) { 2884 // For each unroll part, create a wide load for the group. 2885 SmallVector<Value *, 2> NewLoads; 2886 for (unsigned Part = 0; Part < UF; Part++) { 2887 Instruction *NewLoad; 2888 if (BlockInMask || MaskForGaps) { 2889 assert(useMaskedInterleavedAccesses(*TTI) && 2890 "masked interleaved groups are not allowed."); 2891 Value *GroupMask = MaskForGaps; 2892 if (BlockInMask) { 2893 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2894 Value *ShuffledMask = Builder.CreateShuffleVector( 2895 BlockInMaskPart, 2896 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2897 "interleaved.mask"); 2898 GroupMask = MaskForGaps 2899 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2900 MaskForGaps) 2901 : ShuffledMask; 2902 } 2903 NewLoad = 2904 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2905 GroupMask, PoisonVec, "wide.masked.vec"); 2906 } 2907 else 2908 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2909 Group->getAlign(), "wide.vec"); 2910 Group->addMetadata(NewLoad); 2911 NewLoads.push_back(NewLoad); 2912 } 2913 2914 // For each member in the group, shuffle out the appropriate data from the 2915 // wide loads. 2916 unsigned J = 0; 2917 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2918 Instruction *Member = Group->getMember(I); 2919 2920 // Skip the gaps in the group. 2921 if (!Member) 2922 continue; 2923 2924 auto StrideMask = 2925 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2926 for (unsigned Part = 0; Part < UF; Part++) { 2927 Value *StridedVec = Builder.CreateShuffleVector( 2928 NewLoads[Part], StrideMask, "strided.vec"); 2929 2930 // If this member has different type, cast the result type. 2931 if (Member->getType() != ScalarTy) { 2932 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2933 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2934 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2935 } 2936 2937 if (Group->isReverse()) 2938 StridedVec = reverseVector(StridedVec); 2939 2940 State.set(VPDefs[J], StridedVec, Part); 2941 } 2942 ++J; 2943 } 2944 return; 2945 } 2946 2947 // The sub vector type for current instruction. 2948 auto *SubVT = VectorType::get(ScalarTy, VF); 2949 2950 // Vectorize the interleaved store group. 2951 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2952 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2953 "masked interleaved groups are not allowed."); 2954 assert((!MaskForGaps || !VF.isScalable()) && 2955 "masking gaps for scalable vectors is not yet supported."); 2956 for (unsigned Part = 0; Part < UF; Part++) { 2957 // Collect the stored vector from each member. 
2958 SmallVector<Value *, 4> StoredVecs; 2959 for (unsigned i = 0; i < InterleaveFactor; i++) { 2960 assert((Group->getMember(i) || MaskForGaps) && 2961 "Fail to get a member from an interleaved store group"); 2962 Instruction *Member = Group->getMember(i); 2963 2964 // Skip the gaps in the group. 2965 if (!Member) { 2966 Value *Undef = PoisonValue::get(SubVT); 2967 StoredVecs.push_back(Undef); 2968 continue; 2969 } 2970 2971 Value *StoredVec = State.get(StoredValues[i], Part); 2972 2973 if (Group->isReverse()) 2974 StoredVec = reverseVector(StoredVec); 2975 2976 // If this member has different type, cast it to a unified type. 2977 2978 if (StoredVec->getType() != SubVT) 2979 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2980 2981 StoredVecs.push_back(StoredVec); 2982 } 2983 2984 // Concatenate all vectors into a wide vector. 2985 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2986 2987 // Interleave the elements in the wide vector. 2988 Value *IVec = Builder.CreateShuffleVector( 2989 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2990 "interleaved.vec"); 2991 2992 Instruction *NewStoreInstr; 2993 if (BlockInMask || MaskForGaps) { 2994 Value *GroupMask = MaskForGaps; 2995 if (BlockInMask) { 2996 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2997 Value *ShuffledMask = Builder.CreateShuffleVector( 2998 BlockInMaskPart, 2999 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 3000 "interleaved.mask"); 3001 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 3002 ShuffledMask, MaskForGaps) 3003 : ShuffledMask; 3004 } 3005 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 3006 Group->getAlign(), GroupMask); 3007 } else 3008 NewStoreInstr = 3009 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 3010 3011 Group->addMetadata(NewStoreInstr); 3012 } 3013 } 3014 3015 void InnerLoopVectorizer::vectorizeMemoryInstruction( 3016 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 3017 VPValue *StoredValue, VPValue *BlockInMask, bool ConsecutiveStride, 3018 bool Reverse) { 3019 // Attempt to issue a wide load. 3020 LoadInst *LI = dyn_cast<LoadInst>(Instr); 3021 StoreInst *SI = dyn_cast<StoreInst>(Instr); 3022 3023 assert((LI || SI) && "Invalid Load/Store instruction"); 3024 assert((!SI || StoredValue) && "No stored value provided for widened store"); 3025 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 3026 3027 Type *ScalarDataTy = getLoadStoreType(Instr); 3028 3029 auto *DataTy = VectorType::get(ScalarDataTy, VF); 3030 const Align Alignment = getLoadStoreAlignment(Instr); 3031 bool CreateGatherScatter = !ConsecutiveStride; 3032 3033 VectorParts BlockInMaskParts(UF); 3034 bool isMaskRequired = BlockInMask; 3035 if (isMaskRequired) 3036 for (unsigned Part = 0; Part < UF; ++Part) 3037 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 3038 3039 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 3040 // Calculate the pointer for the specific unroll-part. 3041 GetElementPtrInst *PartPtr = nullptr; 3042 3043 bool InBounds = false; 3044 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 3045 InBounds = gep->isInBounds(); 3046 if (Reverse) { 3047 // If the address is consecutive but reversed, then the 3048 // wide store needs to start at the last vector element. 
3049 // RunTimeVF = VScale * VF.getKnownMinValue() 3050 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 3051 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 3052 // NumElt = -Part * RunTimeVF 3053 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 3054 // LastLane = 1 - RunTimeVF 3055 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 3056 PartPtr = 3057 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 3058 PartPtr->setIsInBounds(InBounds); 3059 PartPtr = cast<GetElementPtrInst>( 3060 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 3061 PartPtr->setIsInBounds(InBounds); 3062 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 3063 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 3064 } else { 3065 Value *Increment = 3066 createStepForVF(Builder, Builder.getInt32Ty(), VF, Part); 3067 PartPtr = cast<GetElementPtrInst>( 3068 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 3069 PartPtr->setIsInBounds(InBounds); 3070 } 3071 3072 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 3073 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 3074 }; 3075 3076 // Handle Stores: 3077 if (SI) { 3078 setDebugLocFromInst(SI); 3079 3080 for (unsigned Part = 0; Part < UF; ++Part) { 3081 Instruction *NewSI = nullptr; 3082 Value *StoredVal = State.get(StoredValue, Part); 3083 if (CreateGatherScatter) { 3084 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 3085 Value *VectorGep = State.get(Addr, Part); 3086 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 3087 MaskPart); 3088 } else { 3089 if (Reverse) { 3090 // If we store to reverse consecutive memory locations, then we need 3091 // to reverse the order of elements in the stored value. 3092 StoredVal = reverseVector(StoredVal); 3093 // We don't want to update the value in the map as it might be used in 3094 // another expression. So don't call resetVectorValue(StoredVal). 3095 } 3096 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 3097 if (isMaskRequired) 3098 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 3099 BlockInMaskParts[Part]); 3100 else 3101 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 3102 } 3103 addMetadata(NewSI, SI); 3104 } 3105 return; 3106 } 3107 3108 // Handle loads. 3109 assert(LI && "Must have a load instruction"); 3110 setDebugLocFromInst(LI); 3111 for (unsigned Part = 0; Part < UF; ++Part) { 3112 Value *NewLI; 3113 if (CreateGatherScatter) { 3114 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 3115 Value *VectorGep = State.get(Addr, Part); 3116 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 3117 nullptr, "wide.masked.gather"); 3118 addMetadata(NewLI, LI); 3119 } else { 3120 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 3121 if (isMaskRequired) 3122 NewLI = Builder.CreateMaskedLoad( 3123 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 3124 PoisonValue::get(DataTy), "wide.masked.load"); 3125 else 3126 NewLI = 3127 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 3128 3129 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
3130 addMetadata(NewLI, LI); 3131 if (Reverse) 3132 NewLI = reverseVector(NewLI); 3133 } 3134 3135 State.set(Def, NewLI, Part); 3136 } 3137 } 3138 3139 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 3140 VPReplicateRecipe *RepRecipe, 3141 const VPIteration &Instance, 3142 bool IfPredicateInstr, 3143 VPTransformState &State) { 3144 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 3145 3146 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 3147 // the first lane and part. 3148 if (isa<NoAliasScopeDeclInst>(Instr)) 3149 if (!Instance.isFirstIteration()) 3150 return; 3151 3152 setDebugLocFromInst(Instr); 3153 3154 // Does this instruction return a value ? 3155 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 3156 3157 Instruction *Cloned = Instr->clone(); 3158 if (!IsVoidRetTy) 3159 Cloned->setName(Instr->getName() + ".cloned"); 3160 3161 // If the scalarized instruction contributes to the address computation of a 3162 // widen masked load/store which was in a basic block that needed predication 3163 // and is not predicated after vectorization, we can't propagate 3164 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 3165 // instruction could feed a poison value to the base address of the widen 3166 // load/store. 3167 if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0) 3168 Cloned->dropPoisonGeneratingFlags(); 3169 3170 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 3171 Builder.GetInsertPoint()); 3172 // Replace the operands of the cloned instructions with their scalar 3173 // equivalents in the new loop. 3174 for (unsigned op = 0, e = RepRecipe->getNumOperands(); op != e; ++op) { 3175 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 3176 auto InputInstance = Instance; 3177 if (!Operand || !OrigLoop->contains(Operand) || 3178 (Cost->isUniformAfterVectorization(Operand, State.VF))) 3179 InputInstance.Lane = VPLane::getFirstLane(); 3180 auto *NewOp = State.get(RepRecipe->getOperand(op), InputInstance); 3181 Cloned->setOperand(op, NewOp); 3182 } 3183 addNewMetadata(Cloned, Instr); 3184 3185 // Place the cloned scalar in the new loop. 3186 Builder.Insert(Cloned); 3187 3188 State.set(RepRecipe, Cloned, Instance); 3189 3190 // If we just cloned a new assumption, add it the assumption cache. 3191 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3192 AC->registerAssumption(II); 3193 3194 // End if-block. 3195 if (IfPredicateInstr) 3196 PredicatedInstructions.push_back(Cloned); 3197 } 3198 3199 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 3200 Value *End, Value *Step, 3201 Instruction *DL) { 3202 BasicBlock *Header = L->getHeader(); 3203 BasicBlock *Latch = L->getLoopLatch(); 3204 // As we're just creating this loop, it's possible no latch exists 3205 // yet. If so, use the header as this will be a single block loop. 3206 if (!Latch) 3207 Latch = Header; 3208 3209 IRBuilder<> B(&*Header->getFirstInsertionPt()); 3210 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3211 setDebugLocFromInst(OldInst, &B); 3212 auto *Induction = B.CreatePHI(Start->getType(), 2, "index"); 3213 3214 B.SetInsertPoint(Latch->getTerminator()); 3215 setDebugLocFromInst(OldInst, &B); 3216 3217 // Create i+1 and fill the PHINode. 3218 // 3219 // If the tail is not folded, we know that End - Start >= Step (either 3220 // statically or through the minimum iteration checks). We also know that both 3221 // Start % Step == 0 and End % Step == 0. 
We exit the vector loop if %IV + 3222 // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned 3223 // overflows and we can mark the induction increment as NUW. 3224 Value *Next = B.CreateAdd(Induction, Step, "index.next", 3225 /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false); 3226 Induction->addIncoming(Start, L->getLoopPreheader()); 3227 Induction->addIncoming(Next, Latch); 3228 // Create the compare. 3229 Value *ICmp = B.CreateICmpEQ(Next, End); 3230 B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3231 3232 // Now we have two terminators. Remove the old one from the block. 3233 Latch->getTerminator()->eraseFromParent(); 3234 3235 return Induction; 3236 } 3237 3238 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3239 if (TripCount) 3240 return TripCount; 3241 3242 assert(L && "Create Trip Count for null loop."); 3243 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3244 // Find the loop boundaries. 3245 ScalarEvolution *SE = PSE.getSE(); 3246 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3247 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3248 "Invalid loop count"); 3249 3250 Type *IdxTy = Legal->getWidestInductionType(); 3251 assert(IdxTy && "No type for induction"); 3252 3253 // The exit count might have the type of i64 while the phi is i32. This can 3254 // happen if we have an induction variable that is sign extended before the 3255 // compare. The only way that we get a backedge taken count is that the 3256 // induction variable was signed and as such will not overflow. In such a case 3257 // truncation is legal. 3258 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3259 IdxTy->getPrimitiveSizeInBits()) 3260 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3261 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3262 3263 // Get the total trip count from the count by adding 1. 3264 const SCEV *ExitCount = SE->getAddExpr( 3265 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3266 3267 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3268 3269 // Expand the trip count and place the new instructions in the preheader. 3270 // Notice that the pre-header does not change, only the loop body. 3271 SCEVExpander Exp(*SE, DL, "induction"); 3272 3273 // Count holds the overall loop count (N). 3274 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3275 L->getLoopPreheader()->getTerminator()); 3276 3277 if (TripCount->getType()->isPointerTy()) 3278 TripCount = 3279 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3280 L->getLoopPreheader()->getTerminator()); 3281 3282 return TripCount; 3283 } 3284 3285 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3286 if (VectorTripCount) 3287 return VectorTripCount; 3288 3289 Value *TC = getOrCreateTripCount(L); 3290 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3291 3292 Type *Ty = TC->getType(); 3293 // This is where we can make the step a runtime constant. 3294 Value *Step = createStepForVF(Builder, Ty, VF, UF); 3295 3296 // If the tail is to be folded by masking, round the number of iterations N 3297 // up to a multiple of Step instead of rounding down. This is done by first 3298 // adding Step-1 and then rounding down. 
Note that it's ok if this addition 3299 // overflows: the vector induction variable will eventually wrap to zero given 3300 // that it starts at zero and its Step is a power of two; the loop will then 3301 // exit, with the last early-exit vector comparison also producing all-true. 3302 if (Cost->foldTailByMasking()) { 3303 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3304 "VF*UF must be a power of 2 when folding tail by masking"); 3305 assert(!VF.isScalable() && 3306 "Tail folding not yet supported for scalable vectors"); 3307 TC = Builder.CreateAdd( 3308 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3309 } 3310 3311 // Now we need to generate the expression for the part of the loop that the 3312 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3313 // iterations are not required for correctness, or N - Step, otherwise. Step 3314 // is equal to the vectorization factor (number of SIMD elements) times the 3315 // unroll factor (number of SIMD instructions). 3316 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3317 3318 // There are cases where we *must* run at least one iteration in the remainder 3319 // loop. See the cost model for when this can happen. If the step evenly 3320 // divides the trip count, we set the remainder to be equal to the step. If 3321 // the step does not evenly divide the trip count, no adjustment is necessary 3322 // since there will already be scalar iterations. Note that the minimum 3323 // iterations check ensures that N >= Step. 3324 if (Cost->requiresScalarEpilogue(VF)) { 3325 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3326 R = Builder.CreateSelect(IsZero, Step, R); 3327 } 3328 3329 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3330 3331 return VectorTripCount; 3332 } 3333 3334 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3335 const DataLayout &DL) { 3336 // Verify that V is a vector type with same number of elements as DstVTy. 3337 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3338 unsigned VF = DstFVTy->getNumElements(); 3339 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3340 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3341 Type *SrcElemTy = SrcVecTy->getElementType(); 3342 Type *DstElemTy = DstFVTy->getElementType(); 3343 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3344 "Vector elements must have same size"); 3345 3346 // Do a direct cast if element types are castable. 3347 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3348 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3349 } 3350 // V cannot be directly casted to desired vector type. 3351 // May happen when V is a floating point vector but DstVTy is a vector of 3352 // pointers or vice-versa. Handle this using a two-step bitcast using an 3353 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 
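// For example (purely illustrative, assuming 32-bit pointers): casting
// <4 x float> to <4 x i32*> is emitted as a bitcast to <4 x i32> followed by an
// inttoptr to <4 x i32*>.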
3354 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3355 "Only one type should be a pointer type"); 3356 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3357 "Only one type should be a floating point type"); 3358 Type *IntTy = 3359 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3360 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3361 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3362 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3363 } 3364 3365 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3366 BasicBlock *Bypass) { 3367 Value *Count = getOrCreateTripCount(L); 3368 // Reuse existing vector loop preheader for TC checks. 3369 // Note that new preheader block is generated for vector loop. 3370 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3371 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3372 3373 // Generate code to check if the loop's trip count is less than VF * UF, or 3374 // equal to it in case a scalar epilogue is required; this implies that the 3375 // vector trip count is zero. This check also covers the case where adding one 3376 // to the backedge-taken count overflowed leading to an incorrect trip count 3377 // of zero. In this case we will also jump to the scalar loop. 3378 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3379 : ICmpInst::ICMP_ULT; 3380 3381 // If tail is to be folded, vector loop takes care of all iterations. 3382 Value *CheckMinIters = Builder.getFalse(); 3383 if (!Cost->foldTailByMasking()) { 3384 Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); 3385 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3386 } 3387 // Create new preheader for vector loop. 3388 LoopVectorPreHeader = 3389 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3390 "vector.ph"); 3391 3392 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3393 DT->getNode(Bypass)->getIDom()) && 3394 "TC check is expected to dominate Bypass"); 3395 3396 // Update dominator for Bypass & LoopExit (if needed). 3397 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3398 if (!Cost->requiresScalarEpilogue(VF)) 3399 // If there is an epilogue which must run, there's no edge from the 3400 // middle block to exit blocks and thus no need to update the immediate 3401 // dominator of the exit blocks. 3402 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3403 3404 ReplaceInstWithInst( 3405 TCCheckBlock->getTerminator(), 3406 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3407 LoopBypassBlocks.push_back(TCCheckBlock); 3408 } 3409 3410 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3411 3412 BasicBlock *const SCEVCheckBlock = 3413 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3414 if (!SCEVCheckBlock) 3415 return nullptr; 3416 3417 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3418 (OptForSizeBasedOnProfile && 3419 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3420 "Cannot SCEV check stride or overflow when optimizing for size"); 3421 3422 3423 // Update dominator only if this is first RT check. 
3424 if (LoopBypassBlocks.empty()) { 3425 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3426 if (!Cost->requiresScalarEpilogue(VF)) 3427 // If there is an epilogue which must run, there's no edge from the 3428 // middle block to exit blocks and thus no need to update the immediate 3429 // dominator of the exit blocks. 3430 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3431 } 3432 3433 LoopBypassBlocks.push_back(SCEVCheckBlock); 3434 AddedSafetyChecks = true; 3435 return SCEVCheckBlock; 3436 } 3437 3438 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3439 BasicBlock *Bypass) { 3440 // VPlan-native path does not do any analysis for runtime checks currently. 3441 if (EnableVPlanNativePath) 3442 return nullptr; 3443 3444 BasicBlock *const MemCheckBlock = 3445 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3446 3447 // Check if we generated code that checks in runtime if arrays overlap. We put 3448 // the checks into a separate block to make the more common case of few 3449 // elements faster. 3450 if (!MemCheckBlock) 3451 return nullptr; 3452 3453 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3454 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3455 "Cannot emit memory checks when optimizing for size, unless forced " 3456 "to vectorize."); 3457 ORE->emit([&]() { 3458 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3459 L->getStartLoc(), L->getHeader()) 3460 << "Code-size may be reduced by not forcing " 3461 "vectorization, or by source-code modifications " 3462 "eliminating the need for runtime checks " 3463 "(e.g., adding 'restrict')."; 3464 }); 3465 } 3466 3467 LoopBypassBlocks.push_back(MemCheckBlock); 3468 3469 AddedSafetyChecks = true; 3470 3471 // We currently don't use LoopVersioning for the actual loop cloning but we 3472 // still use it to add the noalias metadata. 3473 LVer = std::make_unique<LoopVersioning>( 3474 *Legal->getLAI(), 3475 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3476 DT, PSE.getSE()); 3477 LVer->prepareNoAliasMetadata(); 3478 return MemCheckBlock; 3479 } 3480 3481 Value *InnerLoopVectorizer::emitTransformedIndex( 3482 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3483 const InductionDescriptor &ID) const { 3484 3485 SCEVExpander Exp(*SE, DL, "induction"); 3486 auto Step = ID.getStep(); 3487 auto StartValue = ID.getStartValue(); 3488 assert(Index->getType()->getScalarType() == Step->getType() && 3489 "Index scalar type does not match StepValue type"); 3490 3491 // Note: the IR at this point is broken. We cannot use SE to create any new 3492 // SCEV and then expand it, hoping that SCEV's simplification will give us 3493 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3494 // lead to various SCEV crashes. So all we can do is to use builder and rely 3495 // on InstCombine for future simplifications. Here we handle some trivial 3496 // cases only. 3497 auto CreateAdd = [&B](Value *X, Value *Y) { 3498 assert(X->getType() == Y->getType() && "Types don't match!"); 3499 if (auto *CX = dyn_cast<ConstantInt>(X)) 3500 if (CX->isZero()) 3501 return Y; 3502 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3503 if (CY->isZero()) 3504 return X; 3505 return B.CreateAdd(X, Y); 3506 }; 3507 3508 // We allow X to be a vector type, in which case Y will potentially be 3509 // splatted into a vector with the same element count. 
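// (For instance, CreateMul(<2 x i64> %idx, i64 4) would splat the scalar 4 to
// <2 x i64> before multiplying; the values are only a sketch of the behaviour
// of the helper below.)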
3510 auto CreateMul = [&B](Value *X, Value *Y) { 3511 assert(X->getType()->getScalarType() == Y->getType() && 3512 "Types don't match!"); 3513 if (auto *CX = dyn_cast<ConstantInt>(X)) 3514 if (CX->isOne()) 3515 return Y; 3516 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3517 if (CY->isOne()) 3518 return X; 3519 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 3520 if (XVTy && !isa<VectorType>(Y->getType())) 3521 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 3522 return B.CreateMul(X, Y); 3523 }; 3524 3525 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3526 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3527 // the DomTree is not kept up-to-date for additional blocks generated in the 3528 // vector loop. By using the header as insertion point, we guarantee that the 3529 // expanded instructions dominate all their uses. 3530 auto GetInsertPoint = [this, &B]() { 3531 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3532 if (InsertBB != LoopVectorBody && 3533 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3534 return LoopVectorBody->getTerminator(); 3535 return &*B.GetInsertPoint(); 3536 }; 3537 3538 switch (ID.getKind()) { 3539 case InductionDescriptor::IK_IntInduction: { 3540 assert(!isa<VectorType>(Index->getType()) && 3541 "Vector indices not supported for integer inductions yet"); 3542 assert(Index->getType() == StartValue->getType() && 3543 "Index type does not match StartValue type"); 3544 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3545 return B.CreateSub(StartValue, Index); 3546 auto *Offset = CreateMul( 3547 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3548 return CreateAdd(StartValue, Offset); 3549 } 3550 case InductionDescriptor::IK_PtrInduction: { 3551 assert(isa<SCEVConstant>(Step) && 3552 "Expected constant step for pointer induction"); 3553 return B.CreateGEP( 3554 ID.getElementType(), StartValue, 3555 CreateMul(Index, 3556 Exp.expandCodeFor(Step, Index->getType()->getScalarType(), 3557 GetInsertPoint()))); 3558 } 3559 case InductionDescriptor::IK_FpInduction: { 3560 assert(!isa<VectorType>(Index->getType()) && 3561 "Vector indices not supported for FP inductions yet"); 3562 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3563 auto InductionBinOp = ID.getInductionBinOp(); 3564 assert(InductionBinOp && 3565 (InductionBinOp->getOpcode() == Instruction::FAdd || 3566 InductionBinOp->getOpcode() == Instruction::FSub) && 3567 "Original bin op should be defined for FP induction"); 3568 3569 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3570 Value *MulExp = B.CreateFMul(StepValue, Index); 3571 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3572 "induction"); 3573 } 3574 case InductionDescriptor::IK_NoInduction: 3575 return nullptr; 3576 } 3577 llvm_unreachable("invalid enum"); 3578 } 3579 3580 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3581 LoopScalarBody = OrigLoop->getHeader(); 3582 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3583 assert(LoopVectorPreHeader && "Invalid loop structure"); 3584 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 3585 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && 3586 "multiple exit loop without required epilogue?"); 3587 3588 LoopMiddleBlock = 3589 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3590 LI, nullptr, Twine(Prefix) + "middle.block"); 3591 
LoopScalarPreHeader = 3592 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3593 nullptr, Twine(Prefix) + "scalar.ph"); 3594 3595 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3596 3597 // Set up the middle block terminator. Two cases: 3598 // 1) If we know that we must execute the scalar epilogue, emit an 3599 // unconditional branch. 3600 // 2) Otherwise, we must have a single unique exit block (due to how we 3601 // implement the multiple exit case). In this case, set up a conditional 3602 // branch from the middle block to the loop scalar preheader, and the 3603 // exit block. completeLoopSkeleton will update the condition to use an 3604 // iteration check, if required to decide whether to execute the remainder. 3605 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ? 3606 BranchInst::Create(LoopScalarPreHeader) : 3607 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, 3608 Builder.getTrue()); 3609 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3610 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3611 3612 // We intentionally don't let SplitBlock update LoopInfo since 3613 // LoopVectorBody should belong to a different loop than LoopVectorPreHeader. 3614 // LoopVectorBody is explicitly added to the correct place a few lines later. 3615 LoopVectorBody = 3616 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3617 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3618 3619 // Update dominator for loop exit. 3620 if (!Cost->requiresScalarEpilogue(VF)) 3621 // If there is an epilogue which must run, there's no edge from the 3622 // middle block to exit blocks and thus no need to update the immediate 3623 // dominator of the exit blocks. 3624 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3625 3626 // Create and register the new vector loop. 3627 Loop *Lp = LI->AllocateLoop(); 3628 Loop *ParentLoop = OrigLoop->getParentLoop(); 3629 3630 // Insert the new loop into the loop nest and register the new basic blocks 3631 // before calling any utilities such as SCEV that require valid LoopInfo. 3632 if (ParentLoop) { 3633 ParentLoop->addChildLoop(Lp); 3634 } else { 3635 LI->addTopLevelLoop(Lp); 3636 } 3637 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3638 return Lp; 3639 } 3640 3641 void InnerLoopVectorizer::createInductionResumeValues( 3642 Loop *L, Value *VectorTripCount, 3643 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3644 assert(VectorTripCount && L && "Expected valid arguments"); 3645 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3646 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3647 "Inconsistent information about additional bypass."); 3648 // We are going to resume the execution of the scalar loop. 3649 // Go over all of the induction variables that we found and fix the 3650 // PHIs that are left in the scalar version of the loop. 3651 // The starting values of PHI nodes depend on the counter of the last 3652 // iteration in the vectorized loop. 3653 // If we come from a bypass edge then we need to start from the original 3654 // start value. 3655 for (auto &InductionEntry : Legal->getInductionVars()) { 3656 PHINode *OrigPhi = InductionEntry.first; 3657 InductionDescriptor II = InductionEntry.second; 3658 3659 // Create phi nodes to merge from the backedge-taken check block.
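// (Illustrative sketch of the resume phi built below for the primary
// induction, assuming a start value of 0: %bc.resume.val receives %n.vec from
// %middle.block and 0 from every loop-bypass block, e.g. the minimum iteration
// count and runtime check blocks.)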
3660 PHINode *BCResumeVal = 3661 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3662 LoopScalarPreHeader->getTerminator()); 3663 // Copy original phi DL over to the new one. 3664 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3665 Value *&EndValue = IVEndValues[OrigPhi]; 3666 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3667 if (OrigPhi == OldInduction) { 3668 // We know what the end value is. 3669 EndValue = VectorTripCount; 3670 } else { 3671 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3672 3673 // Fast-math-flags propagate from the original induction instruction. 3674 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3675 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3676 3677 Type *StepType = II.getStep()->getType(); 3678 Instruction::CastOps CastOp = 3679 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3680 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3681 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3682 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3683 EndValue->setName("ind.end"); 3684 3685 // Compute the end value for the additional bypass (if applicable). 3686 if (AdditionalBypass.first) { 3687 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3688 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3689 StepType, true); 3690 CRD = 3691 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3692 EndValueFromAdditionalBypass = 3693 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3694 EndValueFromAdditionalBypass->setName("ind.end"); 3695 } 3696 } 3697 // The new PHI merges the original incoming value, in case of a bypass, 3698 // or the value at the end of the vectorized loop. 3699 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3700 3701 // Fix the scalar body counter (PHI node). 3702 // The old induction's phi node in the scalar body needs the truncated 3703 // value. 3704 for (BasicBlock *BB : LoopBypassBlocks) 3705 BCResumeVal->addIncoming(II.getStartValue(), BB); 3706 3707 if (AdditionalBypass.first) 3708 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3709 EndValueFromAdditionalBypass); 3710 3711 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3712 } 3713 } 3714 3715 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3716 MDNode *OrigLoopID) { 3717 assert(L && "Expected valid loop."); 3718 3719 // The trip counts should be cached by now. 3720 Value *Count = getOrCreateTripCount(L); 3721 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3722 3723 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3724 3725 // Add a check in the middle block to see if we have completed 3726 // all of the iterations in the first vector loop. Three cases: 3727 // 1) If we require a scalar epilogue, there is no conditional branch as 3728 // we unconditionally branch to the scalar preheader. Do nothing. 3729 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3730 // Thus if tail is to be folded, we know we don't need to run the 3731 // remainder and we can use the previous value for the condition (true). 3732 // 3) Otherwise, construct a runtime check. 
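// For case 3 the check constructed below is conceptually
//   %cmp.n = icmp eq i64 %trip.count, %n.vec
// and it becomes the condition of the middle-block branch between the exit
// block and the scalar preheader (the names here are illustrative).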
3733 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3734 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3735 Count, VectorTripCount, "cmp.n", 3736 LoopMiddleBlock->getTerminator()); 3737 3738 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3739 // of the corresponding compare because they may have ended up with 3740 // different line numbers and we want to avoid awkward line stepping while 3741 // debugging. Eg. if the compare has got a line number inside the loop. 3742 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3743 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3744 } 3745 3746 // Get ready to start creating new instructions into the vectorized body. 3747 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3748 "Inconsistent vector loop preheader"); 3749 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3750 3751 Optional<MDNode *> VectorizedLoopID = 3752 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3753 LLVMLoopVectorizeFollowupVectorized}); 3754 if (VectorizedLoopID.hasValue()) { 3755 L->setLoopID(VectorizedLoopID.getValue()); 3756 3757 // Do not setAlreadyVectorized if loop attributes have been defined 3758 // explicitly. 3759 return LoopVectorPreHeader; 3760 } 3761 3762 // Keep all loop hints from the original loop on the vector loop (we'll 3763 // replace the vectorizer-specific hints below). 3764 if (MDNode *LID = OrigLoop->getLoopID()) 3765 L->setLoopID(LID); 3766 3767 LoopVectorizeHints Hints(L, true, *ORE); 3768 Hints.setAlreadyVectorized(); 3769 3770 #ifdef EXPENSIVE_CHECKS 3771 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3772 LI->verify(*DT); 3773 #endif 3774 3775 return LoopVectorPreHeader; 3776 } 3777 3778 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3779 /* 3780 In this function we generate a new loop. The new loop will contain 3781 the vectorized instructions while the old loop will continue to run the 3782 scalar remainder. 3783 3784 [ ] <-- loop iteration number check. 3785 / | 3786 / v 3787 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3788 | / | 3789 | / v 3790 || [ ] <-- vector pre header. 3791 |/ | 3792 | v 3793 | [ ] \ 3794 | [ ]_| <-- vector loop. 3795 | | 3796 | v 3797 \ -[ ] <--- middle-block. 3798 \/ | 3799 /\ v 3800 | ->[ ] <--- new preheader. 3801 | | 3802 (opt) v <-- edge from middle to exit iff epilogue is not required. 3803 | [ ] \ 3804 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3805 \ | 3806 \ v 3807 >[ ] <-- exit block(s). 3808 ... 3809 */ 3810 3811 // Get the metadata of the original loop before it gets modified. 3812 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3813 3814 // Workaround! Compute the trip count of the original loop and cache it 3815 // before we start modifying the CFG. This code has a systemic problem 3816 // wherein it tries to run analysis over partially constructed IR; this is 3817 // wrong, and not simply for SCEV. The trip count of the original loop 3818 // simply happens to be prone to hitting this in practice. In theory, we 3819 // can hit the same issue for any SCEV, or ValueTracking query done during 3820 // mutation. See PR49900. 3821 getOrCreateTripCount(OrigLoop); 3822 3823 // Create an empty vector loop, and prepare basic blocks for the runtime 3824 // checks. 3825 Loop *Lp = createVectorLoopSkeleton(""); 3826 3827 // Now, compare the new count to zero. 
If it is zero skip the vector loop and 3828 // jump to the scalar loop. This check also covers the case where the 3829 // backedge-taken count is uint##_max: adding one to it will overflow leading 3830 // to an incorrect trip count of zero. In this (rare) case we will also jump 3831 // to the scalar loop. 3832 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3833 3834 // Generate the code to check any assumptions that we've made for SCEV 3835 // expressions. 3836 emitSCEVChecks(Lp, LoopScalarPreHeader); 3837 3838 // Generate the code that checks in runtime if arrays overlap. We put the 3839 // checks into a separate block to make the more common case of few elements 3840 // faster. 3841 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3842 3843 // Some loops have a single integer induction variable, while other loops 3844 // don't. One example is c++ iterators that often have multiple pointer 3845 // induction variables. In the code below we also support a case where we 3846 // don't have a single induction variable. 3847 // 3848 // We try to obtain an induction variable from the original loop as hard 3849 // as possible. However if we don't find one that: 3850 // - is an integer 3851 // - counts from zero, stepping by one 3852 // - is the size of the widest induction variable type 3853 // then we create a new one. 3854 OldInduction = Legal->getPrimaryInduction(); 3855 Type *IdxTy = Legal->getWidestInductionType(); 3856 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3857 // The loop step is equal to the vectorization factor (num of SIMD elements) 3858 // times the unroll factor (num of SIMD instructions). 3859 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3860 Value *Step = createStepForVF(Builder, IdxTy, VF, UF); 3861 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3862 Induction = 3863 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3864 getDebugLocFromInstOrOperands(OldInduction)); 3865 3866 // Emit phis for the new starting index of the scalar loop. 3867 createInductionResumeValues(Lp, CountRoundDown); 3868 3869 return completeLoopSkeleton(Lp, OrigLoopID); 3870 } 3871 3872 // Fix up external users of the induction variable. At this point, we are 3873 // in LCSSA form, with all external PHIs that use the IV having one input value, 3874 // coming from the remainder loop. We need those PHIs to also have a correct 3875 // value for the IV when arriving directly from the middle block. 3876 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3877 const InductionDescriptor &II, 3878 Value *CountRoundDown, Value *EndValue, 3879 BasicBlock *MiddleBlock) { 3880 // There are two kinds of external IV usages - those that use the value 3881 // computed in the last iteration (the PHI) and those that use the penultimate 3882 // value (the value that feeds into the phi from the loop latch). 3883 // We allow both, but they, obviously, have different values. 3884 3885 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3886 3887 DenseMap<Value *, Value *> MissingVals; 3888 3889 // An external user of the last iteration's value should see the value that 3890 // the remainder loop uses to initialize its own IV. 
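// (Example for intuition, not taken from the source: for 'for (i = 0; i < n;
// ++i)', an LCSSA phi outside the loop that uses 'i.next' is given the vector
// trip count, while a use of the 'i' phi itself is given
// Start + Step * (CRD - 1), computed below.)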
3891 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3892 for (User *U : PostInc->users()) { 3893 Instruction *UI = cast<Instruction>(U); 3894 if (!OrigLoop->contains(UI)) { 3895 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3896 MissingVals[UI] = EndValue; 3897 } 3898 } 3899 3900 // An external user of the penultimate value needs to see EndValue - Step. 3901 // The simplest way to get this is to recompute it from the constituent SCEVs, 3902 // that is Start + (Step * (CRD - 1)). 3903 for (User *U : OrigPhi->users()) { 3904 auto *UI = cast<Instruction>(U); 3905 if (!OrigLoop->contains(UI)) { 3906 const DataLayout &DL = 3907 OrigLoop->getHeader()->getModule()->getDataLayout(); 3908 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3909 3910 IRBuilder<> B(MiddleBlock->getTerminator()); 3911 3912 // Fast-math-flags propagate from the original induction instruction. 3913 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3914 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3915 3916 Value *CountMinusOne = B.CreateSub( 3917 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3918 Value *CMO = 3919 !II.getStep()->getType()->isIntegerTy() 3920 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3921 II.getStep()->getType()) 3922 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3923 CMO->setName("cast.cmo"); 3924 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3925 Escape->setName("ind.escape"); 3926 MissingVals[UI] = Escape; 3927 } 3928 } 3929 3930 for (auto &I : MissingVals) { 3931 PHINode *PHI = cast<PHINode>(I.first); 3932 // One corner case we have to handle is two IVs "chasing" each other, 3933 // that is %IV2 = phi [...], [ %IV1, %latch ] 3934 // In this case, if IV1 has an external use, we need to avoid adding both 3935 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3936 // don't already have an incoming value for the middle block. 3937 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3938 PHI->addIncoming(I.second, MiddleBlock); 3939 } 3940 } 3941 3942 namespace { 3943 3944 struct CSEDenseMapInfo { 3945 static bool canHandle(const Instruction *I) { 3946 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3947 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3948 } 3949 3950 static inline Instruction *getEmptyKey() { 3951 return DenseMapInfo<Instruction *>::getEmptyKey(); 3952 } 3953 3954 static inline Instruction *getTombstoneKey() { 3955 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3956 } 3957 3958 static unsigned getHashValue(const Instruction *I) { 3959 assert(canHandle(I) && "Unknown instruction!"); 3960 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3961 I->value_op_end())); 3962 } 3963 3964 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3965 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3966 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3967 return LHS == RHS; 3968 return LHS->isIdenticalTo(RHS); 3969 } 3970 }; 3971 3972 } // end anonymous namespace 3973 3974 /// Perform CSE of induction variable instructions. 3975 static void cse(BasicBlock *BB) { 3976 // Perform simple CSE.
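// (E.g. two identical 'extractelement <4 x i32> %v, i32 0' instructions in the
// block are folded into one: the later one is RAUW'd to the earlier one and
// erased. The example values are illustrative only.)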
3977 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3978 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3979 if (!CSEDenseMapInfo::canHandle(&In)) 3980 continue; 3981 3982 // Check if we can replace this instruction with any of the 3983 // visited instructions. 3984 if (Instruction *V = CSEMap.lookup(&In)) { 3985 In.replaceAllUsesWith(V); 3986 In.eraseFromParent(); 3987 continue; 3988 } 3989 3990 CSEMap[&In] = &In; 3991 } 3992 } 3993 3994 InstructionCost 3995 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3996 bool &NeedToScalarize) const { 3997 Function *F = CI->getCalledFunction(); 3998 Type *ScalarRetTy = CI->getType(); 3999 SmallVector<Type *, 4> Tys, ScalarTys; 4000 for (auto &ArgOp : CI->args()) 4001 ScalarTys.push_back(ArgOp->getType()); 4002 4003 // Estimate cost of scalarized vector call. The source operands are assumed 4004 // to be vectors, so we need to extract individual elements from there, 4005 // execute VF scalar calls, and then gather the result into the vector return 4006 // value. 4007 InstructionCost ScalarCallCost = 4008 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 4009 if (VF.isScalar()) 4010 return ScalarCallCost; 4011 4012 // Compute corresponding vector type for return value and arguments. 4013 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 4014 for (Type *ScalarTy : ScalarTys) 4015 Tys.push_back(ToVectorTy(ScalarTy, VF)); 4016 4017 // Compute costs of unpacking argument values for the scalar calls and 4018 // packing the return values to a vector. 4019 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 4020 4021 InstructionCost Cost = 4022 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 4023 4024 // If we can't emit a vector call for this function, then the currently found 4025 // cost is the cost we need to return. 4026 NeedToScalarize = true; 4027 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4028 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 4029 4030 if (!TLI || CI->isNoBuiltin() || !VecFunc) 4031 return Cost; 4032 4033 // If the corresponding vector cost is cheaper, return its cost. 
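// Rough worked example (all numbers hypothetical): with VF = 4, a scalar call
// cost of 10 and a scalarization overhead of 6, the scalarized estimate is
// 4 * 10 + 6 = 46; if the target provides a vector variant costing 20, the
// code below returns 20 and clears NeedToScalarize.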
4034 InstructionCost VectorCallCost = 4035 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 4036 if (VectorCallCost < Cost) { 4037 NeedToScalarize = false; 4038 Cost = VectorCallCost; 4039 } 4040 return Cost; 4041 } 4042 4043 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 4044 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 4045 return Elt; 4046 return VectorType::get(Elt, VF); 4047 } 4048 4049 InstructionCost 4050 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 4051 ElementCount VF) const { 4052 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4053 assert(ID && "Expected intrinsic call!"); 4054 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 4055 FastMathFlags FMF; 4056 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 4057 FMF = FPMO->getFastMathFlags(); 4058 4059 SmallVector<const Value *> Arguments(CI->args()); 4060 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 4061 SmallVector<Type *> ParamTys; 4062 std::transform(FTy->param_begin(), FTy->param_end(), 4063 std::back_inserter(ParamTys), 4064 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 4065 4066 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 4067 dyn_cast<IntrinsicInst>(CI)); 4068 return TTI.getIntrinsicInstrCost(CostAttrs, 4069 TargetTransformInfo::TCK_RecipThroughput); 4070 } 4071 4072 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 4073 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 4074 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 4075 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 4076 } 4077 4078 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 4079 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 4080 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 4081 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 4082 } 4083 4084 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 4085 // For every instruction `I` in MinBWs, truncate the operands, create a 4086 // truncated version of `I` and reextend its result. InstCombine runs 4087 // later and will remove any ext/trunc pairs. 4088 SmallPtrSet<Value *, 4> Erased; 4089 for (const auto &KV : Cost->getMinimalBitwidths()) { 4090 // If the value wasn't vectorized, we must maintain the original scalar 4091 // type. The absence of the value from State indicates that it 4092 // wasn't vectorized. 4093 // FIXME: Should not rely on getVPValue at this point. 4094 VPValue *Def = State.Plan->getVPValue(KV.first, true); 4095 if (!State.hasAnyVectorValue(Def)) 4096 continue; 4097 for (unsigned Part = 0; Part < UF; ++Part) { 4098 Value *I = State.get(Def, Part); 4099 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 4100 continue; 4101 Type *OriginalTy = I->getType(); 4102 Type *ScalarTruncatedTy = 4103 IntegerType::get(OriginalTy->getContext(), KV.second); 4104 auto *TruncatedTy = VectorType::get( 4105 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 4106 if (TruncatedTy == OriginalTy) 4107 continue; 4108 4109 IRBuilder<> B(cast<Instruction>(I)); 4110 auto ShrinkOperand = [&](Value *V) -> Value * { 4111 if (auto *ZI = dyn_cast<ZExtInst>(V)) 4112 if (ZI->getSrcTy() == TruncatedTy) 4113 return ZI->getOperand(0); 4114 return B.CreateZExtOrTrunc(V, TruncatedTy); 4115 }; 4116 4117 // The actual instruction modification depends on the instruction type, 4118 // unfortunately. 
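// Illustrative sketch (hypothetical, assuming a MinBWs entry of 8 bits for a
// 32-bit add with VF = 4):
//   %a = add <4 x i32> %x, %y
// is conceptually rewritten as
//   %x.tr  = trunc <4 x i32> %x to <4 x i8>
//   %y.tr  = trunc <4 x i32> %y to <4 x i8>
//   %a.tr  = add <4 x i8> %x.tr, %y.tr
//   %a.ext = zext <4 x i8> %a.tr to <4 x i32>
// with InstCombine expected to remove the redundant ext/trunc pairs later.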
4119 Value *NewI = nullptr; 4120 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 4121 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 4122 ShrinkOperand(BO->getOperand(1))); 4123 4124 // Any wrapping introduced by shrinking this operation shouldn't be 4125 // considered undefined behavior. So, we can't unconditionally copy 4126 // arithmetic wrapping flags to NewI. 4127 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 4128 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 4129 NewI = 4130 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 4131 ShrinkOperand(CI->getOperand(1))); 4132 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 4133 NewI = B.CreateSelect(SI->getCondition(), 4134 ShrinkOperand(SI->getTrueValue()), 4135 ShrinkOperand(SI->getFalseValue())); 4136 } else if (auto *CI = dyn_cast<CastInst>(I)) { 4137 switch (CI->getOpcode()) { 4138 default: 4139 llvm_unreachable("Unhandled cast!"); 4140 case Instruction::Trunc: 4141 NewI = ShrinkOperand(CI->getOperand(0)); 4142 break; 4143 case Instruction::SExt: 4144 NewI = B.CreateSExtOrTrunc( 4145 CI->getOperand(0), 4146 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 4147 break; 4148 case Instruction::ZExt: 4149 NewI = B.CreateZExtOrTrunc( 4150 CI->getOperand(0), 4151 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 4152 break; 4153 } 4154 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 4155 auto Elements0 = 4156 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 4157 auto *O0 = B.CreateZExtOrTrunc( 4158 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 4159 auto Elements1 = 4160 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 4161 auto *O1 = B.CreateZExtOrTrunc( 4162 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 4163 4164 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 4165 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 4166 // Don't do anything with the operands, just extend the result. 4167 continue; 4168 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 4169 auto Elements = 4170 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 4171 auto *O0 = B.CreateZExtOrTrunc( 4172 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 4173 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 4174 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 4175 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 4176 auto Elements = 4177 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 4178 auto *O0 = B.CreateZExtOrTrunc( 4179 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 4180 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 4181 } else { 4182 // If we don't know what to do, be conservative and don't do anything. 4183 continue; 4184 } 4185 4186 // Lastly, extend the result. 4187 NewI->takeName(cast<Instruction>(I)); 4188 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 4189 I->replaceAllUsesWith(Res); 4190 cast<Instruction>(I)->eraseFromParent(); 4191 Erased.insert(I); 4192 State.reset(Def, Res, Part); 4193 } 4194 } 4195 4196 // We'll have created a bunch of ZExts that are now parentless. Clean up. 4197 for (const auto &KV : Cost->getMinimalBitwidths()) { 4198 // If the value wasn't vectorized, we must maintain the original scalar 4199 // type. The absence of the value from State indicates that it 4200 // wasn't vectorized. 4201 // FIXME: Should not rely on getVPValue at this point. 
4202 VPValue *Def = State.Plan->getVPValue(KV.first, true); 4203 if (!State.hasAnyVectorValue(Def)) 4204 continue; 4205 for (unsigned Part = 0; Part < UF; ++Part) { 4206 Value *I = State.get(Def, Part); 4207 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 4208 if (Inst && Inst->use_empty()) { 4209 Value *NewI = Inst->getOperand(0); 4210 Inst->eraseFromParent(); 4211 State.reset(Def, NewI, Part); 4212 } 4213 } 4214 } 4215 } 4216 4217 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 4218 // Insert truncates and extends for any truncated instructions as hints to 4219 // InstCombine. 4220 if (VF.isVector()) 4221 truncateToMinimalBitwidths(State); 4222 4223 // Fix widened non-induction PHIs by setting up the PHI operands. 4224 if (OrigPHIsToFix.size()) { 4225 assert(EnableVPlanNativePath && 4226 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 4227 fixNonInductionPHIs(State); 4228 } 4229 4230 // At this point every instruction in the original loop is widened to a 4231 // vector form. Now we need to fix the recurrences in the loop. These PHI 4232 // nodes are currently empty because we did not want to introduce cycles. 4233 // This is the second stage of vectorizing recurrences. 4234 fixCrossIterationPHIs(State); 4235 4236 // Forget the original basic block. 4237 PSE.getSE()->forgetLoop(OrigLoop); 4238 4239 // If we inserted an edge from the middle block to the unique exit block, 4240 // update uses outside the loop (phis) to account for the newly inserted 4241 // edge. 4242 if (!Cost->requiresScalarEpilogue(VF)) { 4243 // Fix-up external users of the induction variables. 4244 for (auto &Entry : Legal->getInductionVars()) 4245 fixupIVUsers(Entry.first, Entry.second, 4246 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4247 IVEndValues[Entry.first], LoopMiddleBlock); 4248 4249 fixLCSSAPHIs(State); 4250 } 4251 4252 for (Instruction *PI : PredicatedInstructions) 4253 sinkScalarOperands(&*PI); 4254 4255 // Remove redundant induction instructions. 4256 cse(LoopVectorBody); 4257 4258 // Set/update profile weights for the vector and remainder loops as the 4259 // original loop iterations are now distributed among them. Note that the 4260 // original loop, represented by LoopScalarBody, becomes the remainder loop after vectorization. 4261 // 4262 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may 4263 // end up getting a slightly roughened result, but that should be OK since 4264 // the profile is not inherently precise anyway. Note also that a possible bypass of 4265 // the vector code caused by legality checks is ignored, optimistically assigning all the weight 4266 // to the vector loop. 4267 // 4268 // For scalable vectorization we can't know at compile time how many iterations 4269 // of the loop are handled in one vector iteration, so instead assume a pessimistic 4270 // vscale of '1'. 4271 setProfileInfoAfterUnrolling( 4272 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 4273 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 4274 } 4275 4276 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 4277 // In order to support recurrences we need to be able to vectorize Phi nodes. 4278 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4279 // stage #2: We now need to fix the recurrences by adding incoming edges to 4280 // the currently empty PHI nodes.
At this point every instruction in the 4281 // original loop is widened to a vector form so we can use them to construct 4282 // the incoming edges. 4283 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock(); 4284 for (VPRecipeBase &R : Header->phis()) { 4285 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 4286 fixReduction(ReductionPhi, State); 4287 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 4288 fixFirstOrderRecurrence(FOR, State); 4289 } 4290 } 4291 4292 void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, 4293 VPTransformState &State) { 4294 // This is the second phase of vectorizing first-order recurrences. An 4295 // overview of the transformation is described below. Suppose we have the 4296 // following loop. 4297 // 4298 // for (int i = 0; i < n; ++i) 4299 // b[i] = a[i] - a[i - 1]; 4300 // 4301 // There is a first-order recurrence on "a". For this loop, the shorthand 4302 // scalar IR looks like: 4303 // 4304 // scalar.ph: 4305 // s_init = a[-1] 4306 // br scalar.body 4307 // 4308 // scalar.body: 4309 // i = phi [0, scalar.ph], [i+1, scalar.body] 4310 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4311 // s2 = a[i] 4312 // b[i] = s2 - s1 4313 // br cond, scalar.body, ... 4314 // 4315 // In this example, s1 is a recurrence because it's value depends on the 4316 // previous iteration. In the first phase of vectorization, we created a 4317 // vector phi v1 for s1. We now complete the vectorization and produce the 4318 // shorthand vector IR shown below (for VF = 4, UF = 1). 4319 // 4320 // vector.ph: 4321 // v_init = vector(..., ..., ..., a[-1]) 4322 // br vector.body 4323 // 4324 // vector.body 4325 // i = phi [0, vector.ph], [i+4, vector.body] 4326 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4327 // v2 = a[i, i+1, i+2, i+3]; 4328 // v3 = vector(v1(3), v2(0, 1, 2)) 4329 // b[i, i+1, i+2, i+3] = v2 - v3 4330 // br cond, vector.body, middle.block 4331 // 4332 // middle.block: 4333 // x = v2(3) 4334 // br scalar.ph 4335 // 4336 // scalar.ph: 4337 // s_init = phi [x, middle.block], [a[-1], otherwise] 4338 // br scalar.body 4339 // 4340 // After execution completes the vector loop, we extract the next value of 4341 // the recurrence (x) to use as the initial value in the scalar loop. 4342 4343 // Extract the last vector element in the middle block. This will be the 4344 // initial value for the recurrence when jumping to the scalar loop. 4345 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4346 Value *Incoming = State.get(PreviousDef, UF - 1); 4347 auto *ExtractForScalar = Incoming; 4348 auto *IdxTy = Builder.getInt32Ty(); 4349 if (VF.isVector()) { 4350 auto *One = ConstantInt::get(IdxTy, 1); 4351 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4352 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4353 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4354 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4355 "vector.recur.extract"); 4356 } 4357 // Extract the second last element in the middle block if the 4358 // Phi is used outside the loop. We need to extract the phi itself 4359 // and not the last element (the phi update in the current iteration). This 4360 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4361 // when the scalar loop is not run at all. 
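// For illustration (hypothetical values, VF = 4, UF = 1): if the final vector
// of the recurrence holds <s4, s5, s6, s7>, lane 3 (s7) becomes
// 'vector.recur.extract' and seeds the scalar loop's phi, while lane 2 (s6)
// is the value the phi itself carried in the last vector iteration and is
// what an LCSSA phi outside the loop must receive when the scalar loop is
// skipped.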
4362 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4363 if (VF.isVector()) { 4364 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4365 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4366 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4367 Incoming, Idx, "vector.recur.extract.for.phi"); 4368 } else if (UF > 1) 4369 // When the loop is unrolled without vectorizing, initialize 4370 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value 4371 // of `Incoming`. This is analogous to the vectorized case above: extracting 4372 // the second last element when VF > 1. 4373 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4374 4375 // Fix the initial value of the original recurrence in the scalar loop. 4376 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4377 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 4378 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4379 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 4380 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4381 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4382 Start->addIncoming(Incoming, BB); 4383 } 4384 4385 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4386 Phi->setName("scalar.recur"); 4387 4388 // Finally, fix users of the recurrence outside the loop. The users will need 4389 // either the last value of the scalar recurrence or the last value of the 4390 // vector recurrence we extracted in the middle block. Since the loop is in 4391 // LCSSA form, we just need to find all the phi nodes for the original scalar 4392 // recurrence in the exit block, and then add an edge for the middle block. 4393 // Note that LCSSA does not imply single entry when the original scalar loop 4394 // had multiple exiting edges (as we always run the last iteration in the 4395 // scalar epilogue); in that case, there is no edge from middle to exit and 4396 // thus no phis which need to be updated. 4397 if (!Cost->requiresScalarEpilogue(VF)) 4398 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4399 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) 4400 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4401 } 4402 4403 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, 4404 VPTransformState &State) { 4405 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 4406 // Get its reduction variable descriptor. 4407 assert(Legal->isReductionVariable(OrigPhi) && 4408 "Unable to find the reduction variable"); 4409 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 4410 4411 RecurKind RK = RdxDesc.getRecurrenceKind(); 4412 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4413 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4414 setDebugLocFromInst(ReductionStartValue); 4415 4416 VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); 4417 // This is the vector-clone of the value that leaves the loop. 4418 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 4419 4420 // Wrap flags are in general invalid after vectorization, clear them. 4421 clearReductionWrapFlags(RdxDesc, State); 4422 4423 // Before each round, move the insertion point right between 4424 // the PHIs and the values we are going to write. 4425 // This allows us to write both PHINodes and the extractelement 4426 // instructions.
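// High-level illustration (hypothetical, integer add reduction, VF = 4,
// UF = 2, names assumed for exposition): the two unrolled partial sums
// %rdx.part0 and %rdx.part1 are combined below with a 'bin.rdx' add in the
// middle block, reduced to a scalar via a target reduction, and then merged
// with the start value in 'bc.merge.rdx' so that both the bypass blocks and
// the scalar remainder resume from a correct running sum.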
4427 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4428 4429 setDebugLocFromInst(LoopExitInst); 4430 4431 Type *PhiTy = OrigPhi->getType(); 4432 // If tail is folded by masking, the vector value to leave the loop should be 4433 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4434 // instead of the former. For an inloop reduction the reduction will already 4435 // be predicated, and does not need to be handled here. 4436 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 4437 for (unsigned Part = 0; Part < UF; ++Part) { 4438 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4439 Value *Sel = nullptr; 4440 for (User *U : VecLoopExitInst->users()) { 4441 if (isa<SelectInst>(U)) { 4442 assert(!Sel && "Reduction exit feeding two selects"); 4443 Sel = U; 4444 } else 4445 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4446 } 4447 assert(Sel && "Reduction exit feeds no select"); 4448 State.reset(LoopExitInstDef, Sel, Part); 4449 4450 // If the target can create a predicated operator for the reduction at no 4451 // extra cost in the loop (for example a predicated vadd), it can be 4452 // cheaper for the select to remain in the loop than be sunk out of it, 4453 // and so use the select value for the phi instead of the old 4454 // LoopExitValue. 4455 if (PreferPredicatedReductionSelect || 4456 TTI->preferPredicatedReductionSelect( 4457 RdxDesc.getOpcode(), PhiTy, 4458 TargetTransformInfo::ReductionFlags())) { 4459 auto *VecRdxPhi = 4460 cast<PHINode>(State.get(PhiR, Part)); 4461 VecRdxPhi->setIncomingValueForBlock( 4462 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4463 } 4464 } 4465 } 4466 4467 // If the vector reduction can be performed in a smaller type, we truncate 4468 // then extend the loop exit value to enable InstCombine to evaluate the 4469 // entire expression in the smaller type. 4470 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4471 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4472 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4473 Builder.SetInsertPoint( 4474 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4475 VectorParts RdxParts(UF); 4476 for (unsigned Part = 0; Part < UF; ++Part) { 4477 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4478 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4479 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4480 : Builder.CreateZExt(Trunc, VecTy); 4481 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 4482 if (U != Trunc) { 4483 U->replaceUsesOfWith(RdxParts[Part], Extnd); 4484 RdxParts[Part] = Extnd; 4485 } 4486 } 4487 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4488 for (unsigned Part = 0; Part < UF; ++Part) { 4489 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4490 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4491 } 4492 } 4493 4494 // Reduce all of the unrolled parts into a single vector. 4495 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4496 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4497 4498 // The middle block terminator has already been assigned a DebugLoc here (the 4499 // OrigLoop's single latch terminator). 
We want the whole middle block to 4500 // appear to execute on this line because: (a) it is all compiler generated, 4501 // (b) these instructions are always executed after evaluating the latch 4502 // conditional branch, and (c) other passes may add new predecessors which 4503 // terminate on this line. This is the easiest way to ensure we don't 4504 // accidentally cause an extra step back into the loop while debugging. 4505 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4506 if (PhiR->isOrdered()) 4507 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4508 else { 4509 // Floating-point operations should have some FMF to enable the reduction. 4510 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4511 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4512 for (unsigned Part = 1; Part < UF; ++Part) { 4513 Value *RdxPart = State.get(LoopExitInstDef, Part); 4514 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4515 ReducedPartRdx = Builder.CreateBinOp( 4516 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4517 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4518 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4519 ReducedPartRdx, RdxPart); 4520 else 4521 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4522 } 4523 } 4524 4525 // Create the reduction after the loop. Note that inloop reductions create the 4526 // target reduction in the loop using a Reduction recipe. 4527 if (VF.isVector() && !PhiR->isInLoop()) { 4528 ReducedPartRdx = 4529 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 4530 // If the reduction can be performed in a smaller type, we need to extend 4531 // the reduction to the wider type before we branch to the original loop. 4532 if (PhiTy != RdxDesc.getRecurrenceType()) 4533 ReducedPartRdx = RdxDesc.isSigned() 4534 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4535 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4536 } 4537 4538 // Create a phi node that merges control-flow from the backedge-taken check 4539 // block and the middle block. 4540 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4541 LoopScalarPreHeader->getTerminator()); 4542 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4543 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4544 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4545 4546 // Now, we need to fix the users of the reduction variable 4547 // inside and outside of the scalar remainder loop. 4548 4549 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4550 // in the exit blocks. See comment on analogous loop in 4551 // fixFirstOrderRecurrence for a more complete explanation of the logic. 4552 if (!Cost->requiresScalarEpilogue(VF)) 4553 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4554 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) 4555 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4556 4557 // Fix the scalar loop reduction variable with the incoming reduction sum 4558 // from the vector body and from the backedge value. 4559 int IncomingEdgeBlockIdx = 4560 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4561 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4562 // Pick the other block. 4563 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ?
0 : 1); 4564 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4565 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4566 } 4567 4568 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 4569 VPTransformState &State) { 4570 RecurKind RK = RdxDesc.getRecurrenceKind(); 4571 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4572 return; 4573 4574 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4575 assert(LoopExitInstr && "null loop exit instruction"); 4576 SmallVector<Instruction *, 8> Worklist; 4577 SmallPtrSet<Instruction *, 8> Visited; 4578 Worklist.push_back(LoopExitInstr); 4579 Visited.insert(LoopExitInstr); 4580 4581 while (!Worklist.empty()) { 4582 Instruction *Cur = Worklist.pop_back_val(); 4583 if (isa<OverflowingBinaryOperator>(Cur)) 4584 for (unsigned Part = 0; Part < UF; ++Part) { 4585 // FIXME: Should not rely on getVPValue at this point. 4586 Value *V = State.get(State.Plan->getVPValue(Cur, true), Part); 4587 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4588 } 4589 4590 for (User *U : Cur->users()) { 4591 Instruction *UI = cast<Instruction>(U); 4592 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4593 Visited.insert(UI).second) 4594 Worklist.push_back(UI); 4595 } 4596 } 4597 } 4598 4599 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4600 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4601 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4602 // Some phis were already hand updated by the reduction and recurrence 4603 // code above, leave them alone. 4604 continue; 4605 4606 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4607 // Non-instruction incoming values will have only one value. 4608 4609 VPLane Lane = VPLane::getFirstLane(); 4610 if (isa<Instruction>(IncomingValue) && 4611 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4612 VF)) 4613 Lane = VPLane::getLastLaneForVF(VF); 4614 4615 // Can be a loop invariant incoming value or the last scalar value to be 4616 // extracted from the vectorized loop. 4617 // FIXME: Should not rely on getVPValue at this point. 4618 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4619 Value *lastIncomingValue = 4620 OrigLoop->isLoopInvariant(IncomingValue) 4621 ? IncomingValue 4622 : State.get(State.Plan->getVPValue(IncomingValue, true), 4623 VPIteration(UF - 1, Lane)); 4624 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4625 } 4626 } 4627 4628 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4629 // The basic block and loop containing the predicated instruction. 4630 auto *PredBB = PredInst->getParent(); 4631 auto *VectorLoop = LI->getLoopFor(PredBB); 4632 4633 // Initialize a worklist with the operands of the predicated instruction. 4634 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4635 4636 // Holds instructions that we need to analyze again. An instruction may be 4637 // reanalyzed if we don't yet know if we can sink it or not. 4638 SmallVector<Instruction *, 8> InstsToReanalyze; 4639 4640 // Returns true if a given use occurs in the predicated block. Phi nodes use 4641 // their operands in their corresponding predecessor blocks. 
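// For example (illustrative): given
//   %phi = phi i32 [ %v, %pred.block ], [ 0, %other.block ]
// the use of %v is attributed to %pred.block (its incoming block), not to the
// block containing the phi, which is what the lambda below checks.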
4642 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4643 auto *I = cast<Instruction>(U.getUser()); 4644 BasicBlock *BB = I->getParent(); 4645 if (auto *Phi = dyn_cast<PHINode>(I)) 4646 BB = Phi->getIncomingBlock( 4647 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4648 return BB == PredBB; 4649 }; 4650 4651 // Iteratively sink the scalarized operands of the predicated instruction 4652 // into the block we created for it. When an instruction is sunk, it's 4653 // operands are then added to the worklist. The algorithm ends after one pass 4654 // through the worklist doesn't sink a single instruction. 4655 bool Changed; 4656 do { 4657 // Add the instructions that need to be reanalyzed to the worklist, and 4658 // reset the changed indicator. 4659 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4660 InstsToReanalyze.clear(); 4661 Changed = false; 4662 4663 while (!Worklist.empty()) { 4664 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4665 4666 // We can't sink an instruction if it is a phi node, is not in the loop, 4667 // or may have side effects. 4668 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4669 I->mayHaveSideEffects()) 4670 continue; 4671 4672 // If the instruction is already in PredBB, check if we can sink its 4673 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4674 // sinking the scalar instruction I, hence it appears in PredBB; but it 4675 // may have failed to sink I's operands (recursively), which we try 4676 // (again) here. 4677 if (I->getParent() == PredBB) { 4678 Worklist.insert(I->op_begin(), I->op_end()); 4679 continue; 4680 } 4681 4682 // It's legal to sink the instruction if all its uses occur in the 4683 // predicated block. Otherwise, there's nothing to do yet, and we may 4684 // need to reanalyze the instruction. 4685 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4686 InstsToReanalyze.push_back(I); 4687 continue; 4688 } 4689 4690 // Move the instruction to the beginning of the predicated block, and add 4691 // it's operands to the worklist. 4692 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4693 Worklist.insert(I->op_begin(), I->op_end()); 4694 4695 // The sinking may have enabled other instructions to be sunk, so we will 4696 // need to iterate. 4697 Changed = true; 4698 } 4699 } while (Changed); 4700 } 4701 4702 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4703 for (PHINode *OrigPhi : OrigPHIsToFix) { 4704 VPWidenPHIRecipe *VPPhi = 4705 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4706 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4707 // Make sure the builder has a valid insert point. 4708 Builder.SetInsertPoint(NewPhi); 4709 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4710 VPValue *Inc = VPPhi->getIncomingValue(i); 4711 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4712 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4713 } 4714 } 4715 } 4716 4717 bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) { 4718 return Cost->useOrderedReductions(RdxDesc); 4719 } 4720 4721 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4722 VPWidenPHIRecipe *PhiR, 4723 VPTransformState &State) { 4724 PHINode *P = cast<PHINode>(PN); 4725 if (EnableVPlanNativePath) { 4726 // Currently we enter here in the VPlan-native path for non-induction 4727 // PHIs where all control flow is uniform. We simply widen these PHIs. 
4728 // Create a vector phi with no operands - the vector phi operands will be 4729 // set at the end of vector code generation. 4730 Type *VecTy = (State.VF.isScalar()) 4731 ? PN->getType() 4732 : VectorType::get(PN->getType(), State.VF); 4733 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4734 State.set(PhiR, VecPhi, 0); 4735 OrigPHIsToFix.push_back(P); 4736 4737 return; 4738 } 4739 4740 assert(PN->getParent() == OrigLoop->getHeader() && 4741 "Non-header phis should have been handled elsewhere"); 4742 4743 // In order to support recurrences we need to be able to vectorize Phi nodes. 4744 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4745 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4746 // this value when we vectorize all of the instructions that use the PHI. 4747 4748 assert(!Legal->isReductionVariable(P) && 4749 "reductions should be handled elsewhere"); 4750 4751 setDebugLocFromInst(P); 4752 4753 // This PHINode must be an induction variable. 4754 // Make sure that we know about it. 4755 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4756 4757 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4758 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4759 4760 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4761 // which can be found from the original scalar operations. 4762 switch (II.getKind()) { 4763 case InductionDescriptor::IK_NoInduction: 4764 llvm_unreachable("Unknown induction"); 4765 case InductionDescriptor::IK_IntInduction: 4766 case InductionDescriptor::IK_FpInduction: 4767 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4768 case InductionDescriptor::IK_PtrInduction: { 4769 // Handle the pointer induction variable case. 4770 assert(P->getType()->isPointerTy() && "Unexpected type."); 4771 4772 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4773 // This is the normalized GEP that starts counting at zero. 4774 Value *PtrInd = 4775 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4776 // Determine the number of scalars we need to generate for each unroll 4777 // iteration. If the instruction is uniform, we only need to generate the 4778 // first lane. Otherwise, we generate all VF values. 4779 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4780 assert((IsUniform || !State.VF.isScalable()) && 4781 "Cannot scalarize a scalable VF"); 4782 unsigned Lanes = IsUniform ? 
1 : State.VF.getFixedValue(); 4783 4784 for (unsigned Part = 0; Part < UF; ++Part) { 4785 Value *PartStart = 4786 createStepForVF(Builder, PtrInd->getType(), VF, Part); 4787 4788 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4789 Value *Idx = Builder.CreateAdd( 4790 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4791 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4792 Value *SclrGep = 4793 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4794 SclrGep->setName("next.gep"); 4795 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4796 } 4797 } 4798 return; 4799 } 4800 assert(isa<SCEVConstant>(II.getStep()) && 4801 "Induction step not a SCEV constant!"); 4802 Type *PhiType = II.getStep()->getType(); 4803 4804 // Build a pointer phi 4805 Value *ScalarStartValue = II.getStartValue(); 4806 Type *ScStValueType = ScalarStartValue->getType(); 4807 PHINode *NewPointerPhi = 4808 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4809 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4810 4811 // A pointer induction, performed by using a gep 4812 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4813 Instruction *InductionLoc = LoopLatch->getTerminator(); 4814 const SCEV *ScalarStep = II.getStep(); 4815 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4816 Value *ScalarStepValue = 4817 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4818 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4819 Value *NumUnrolledElems = 4820 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4821 Value *InductionGEP = GetElementPtrInst::Create( 4822 II.getElementType(), NewPointerPhi, 4823 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4824 InductionLoc); 4825 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4826 4827 // Create UF many actual address geps that use the pointer 4828 // phi as base and a vectorized version of the step value 4829 // (<step*0, ..., step*N>) as offset. 4830 for (unsigned Part = 0; Part < State.UF; ++Part) { 4831 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4832 Value *StartOffsetScalar = 4833 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4834 Value *StartOffset = 4835 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4836 // Create a vector of consecutive numbers from zero to VF. 4837 StartOffset = 4838 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4839 4840 Value *GEP = Builder.CreateGEP( 4841 II.getElementType(), NewPointerPhi, 4842 Builder.CreateMul( 4843 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4844 "vector.gep")); 4845 State.set(PhiR, GEP, Part); 4846 } 4847 } 4848 } 4849 } 4850 4851 /// A helper function for checking whether an integer division-related 4852 /// instruction may divide by zero (in which case it must be predicated if 4853 /// executed conditionally in the scalar code). 4854 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4855 /// Non-zero divisors that are non compile-time constants will not be 4856 /// converted into multiplication, so we will still end up scalarizing 4857 /// the division, but can do so w/o predication. 
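/// For example (illustrative): 'udiv i32 %x, 7' can never divide by zero and
/// needs no predication, whereas 'udiv i32 %x, %y' (or a literal zero divisor)
/// conservatively may.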
4858 static bool mayDivideByZero(Instruction &I) { 4859 assert((I.getOpcode() == Instruction::UDiv || 4860 I.getOpcode() == Instruction::SDiv || 4861 I.getOpcode() == Instruction::URem || 4862 I.getOpcode() == Instruction::SRem) && 4863 "Unexpected instruction"); 4864 Value *Divisor = I.getOperand(1); 4865 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4866 return !CInt || CInt->isZero(); 4867 } 4868 4869 void InnerLoopVectorizer::widenInstruction(Instruction &I, 4870 VPWidenRecipe *WidenRec, 4871 VPTransformState &State) { 4872 switch (I.getOpcode()) { 4873 case Instruction::Call: 4874 case Instruction::Br: 4875 case Instruction::PHI: 4876 case Instruction::GetElementPtr: 4877 case Instruction::Select: 4878 llvm_unreachable("This instruction is handled by a different recipe."); 4879 case Instruction::UDiv: 4880 case Instruction::SDiv: 4881 case Instruction::SRem: 4882 case Instruction::URem: 4883 case Instruction::Add: 4884 case Instruction::FAdd: 4885 case Instruction::Sub: 4886 case Instruction::FSub: 4887 case Instruction::FNeg: 4888 case Instruction::Mul: 4889 case Instruction::FMul: 4890 case Instruction::FDiv: 4891 case Instruction::FRem: 4892 case Instruction::Shl: 4893 case Instruction::LShr: 4894 case Instruction::AShr: 4895 case Instruction::And: 4896 case Instruction::Or: 4897 case Instruction::Xor: { 4898 // Just widen unops and binops. 4899 setDebugLocFromInst(&I); 4900 4901 for (unsigned Part = 0; Part < UF; ++Part) { 4902 SmallVector<Value *, 2> Ops; 4903 for (VPValue *VPOp : WidenRec->operands()) 4904 Ops.push_back(State.get(VPOp, Part)); 4905 4906 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4907 4908 if (auto *VecOp = dyn_cast<Instruction>(V)) { 4909 VecOp->copyIRFlags(&I); 4910 4911 // If the instruction is vectorized and was in a basic block that needed 4912 // predication, we can't propagate poison-generating flags (nuw/nsw, 4913 // exact, etc.). The control flow has been linearized and the 4914 // instruction is no longer guarded by the predicate, which could make 4915 // the flag properties to no longer hold. 4916 if (State.MayGeneratePoisonRecipes.count(WidenRec) > 0) 4917 VecOp->dropPoisonGeneratingFlags(); 4918 } 4919 4920 // Use this vector value for all users of the original instruction. 4921 State.set(WidenRec, V, Part); 4922 addMetadata(V, &I); 4923 } 4924 4925 break; 4926 } 4927 case Instruction::ICmp: 4928 case Instruction::FCmp: { 4929 // Widen compares. Generate vector compares. 4930 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4931 auto *Cmp = cast<CmpInst>(&I); 4932 setDebugLocFromInst(Cmp); 4933 for (unsigned Part = 0; Part < UF; ++Part) { 4934 Value *A = State.get(WidenRec->getOperand(0), Part); 4935 Value *B = State.get(WidenRec->getOperand(1), Part); 4936 Value *C = nullptr; 4937 if (FCmp) { 4938 // Propagate fast math flags. 
4939 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4940 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4941 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4942 } else { 4943 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4944 } 4945 State.set(WidenRec, C, Part); 4946 addMetadata(C, &I); 4947 } 4948 4949 break; 4950 } 4951 4952 case Instruction::ZExt: 4953 case Instruction::SExt: 4954 case Instruction::FPToUI: 4955 case Instruction::FPToSI: 4956 case Instruction::FPExt: 4957 case Instruction::PtrToInt: 4958 case Instruction::IntToPtr: 4959 case Instruction::SIToFP: 4960 case Instruction::UIToFP: 4961 case Instruction::Trunc: 4962 case Instruction::FPTrunc: 4963 case Instruction::BitCast: { 4964 auto *CI = cast<CastInst>(&I); 4965 setDebugLocFromInst(CI); 4966 4967 /// Vectorize casts. 4968 Type *DestTy = 4969 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4970 4971 for (unsigned Part = 0; Part < UF; ++Part) { 4972 Value *A = State.get(WidenRec->getOperand(0), Part); 4973 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4974 State.set(WidenRec, Cast, Part); 4975 addMetadata(Cast, &I); 4976 } 4977 break; 4978 } 4979 default: 4980 // This instruction is not vectorized by simple widening. 4981 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4982 llvm_unreachable("Unhandled instruction!"); 4983 } // end of switch. 4984 } 4985 4986 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4987 VPUser &ArgOperands, 4988 VPTransformState &State) { 4989 assert(!isa<DbgInfoIntrinsic>(I) && 4990 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4991 setDebugLocFromInst(&I); 4992 4993 Module *M = I.getParent()->getParent()->getParent(); 4994 auto *CI = cast<CallInst>(&I); 4995 4996 SmallVector<Type *, 4> Tys; 4997 for (Value *ArgOperand : CI->args()) 4998 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4999 5000 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 5001 5002 // The flag shows whether we use Intrinsic or a usual Call for vectorized 5003 // version of the instruction. 5004 // Is it beneficial to perform intrinsic call compared to lib call? 5005 bool NeedToScalarize = false; 5006 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 5007 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 5008 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 5009 assert((UseVectorIntrinsic || !NeedToScalarize) && 5010 "Instruction should be scalarized elsewhere."); 5011 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 5012 "Either the intrinsic cost or vector call cost must be valid"); 5013 5014 for (unsigned Part = 0; Part < UF; ++Part) { 5015 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 5016 SmallVector<Value *, 4> Args; 5017 for (auto &I : enumerate(ArgOperands.operands())) { 5018 // Some intrinsics have a scalar argument - don't replace it with a 5019 // vector. 5020 Value *Arg; 5021 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 5022 Arg = State.get(I.value(), Part); 5023 else { 5024 Arg = State.get(I.value(), VPIteration(0, 0)); 5025 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 5026 TysForDecl.push_back(Arg->getType()); 5027 } 5028 Args.push_back(Arg); 5029 } 5030 5031 Function *VectorF; 5032 if (UseVectorIntrinsic) { 5033 // Use vector version of the intrinsic. 
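// E.g. (illustrative): for VF = 4 a scalar call to llvm.fabs.f32 is widened
// to llvm.fabs.v4f32; TysForDecl[0] is rewritten to the vector return type
// below so that Intrinsic::getDeclaration picks the right overload.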
5034 if (VF.isVector()) 5035 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 5036 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 5037 assert(VectorF && "Can't retrieve vector intrinsic."); 5038 } else { 5039 // Use vector version of the function call. 5040 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 5041 #ifndef NDEBUG 5042 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 5043 "Can't create vector function."); 5044 #endif 5045 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 5046 } 5047 SmallVector<OperandBundleDef, 1> OpBundles; 5048 CI->getOperandBundlesAsDefs(OpBundles); 5049 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 5050 5051 if (isa<FPMathOperator>(V)) 5052 V->copyFastMathFlags(CI); 5053 5054 State.set(Def, V, Part); 5055 addMetadata(V, &I); 5056 } 5057 } 5058 5059 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 5060 VPUser &Operands, 5061 bool InvariantCond, 5062 VPTransformState &State) { 5063 setDebugLocFromInst(&I); 5064 5065 // The condition can be loop invariant but still defined inside the 5066 // loop. This means that we can't just use the original 'cond' value. 5067 // We have to take the 'vectorized' value and pick the first lane. 5068 // Instcombine will make this a no-op. 5069 auto *InvarCond = InvariantCond 5070 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5071 : nullptr; 5072 5073 for (unsigned Part = 0; Part < UF; ++Part) { 5074 Value *Cond = 5075 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5076 Value *Op0 = State.get(Operands.getOperand(1), Part); 5077 Value *Op1 = State.get(Operands.getOperand(2), Part); 5078 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5079 State.set(VPDef, Sel, Part); 5080 addMetadata(Sel, &I); 5081 } 5082 } 5083 5084 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5085 // We should not collect Scalars more than once per VF. Right now, this 5086 // function is called from collectUniformsAndScalars(), which already does 5087 // this check. Collecting Scalars for VF=1 does not make any sense. 5088 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5089 "This function should not be visited twice for the same VF"); 5090 5091 SmallSetVector<Instruction *, 8> Worklist; 5092 5093 // These sets are used to seed the analysis with pointers used by memory 5094 // accesses that will remain scalar. 5095 SmallSetVector<Instruction *, 8> ScalarPtrs; 5096 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5097 auto *Latch = TheLoop->getLoopLatch(); 5098 5099 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5100 // The pointer operands of loads and stores will be scalar as long as the 5101 // memory access is not a gather or scatter operation. The value operand of a 5102 // store will remain scalar if the store is scalarized. 
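// Illustrative example (hypothetical): for a consecutive
//   store i32 %val, i32* %gep
// that is widened to a single vector store, %gep is still a scalar use (one
// base pointer per part), whereas a gather/scatter needs a vector of
// pointers, so its address operand is not a scalar use.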
5103 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 5104 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 5105 assert(WideningDecision != CM_Unknown && 5106 "Widening decision should be ready at this moment"); 5107 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 5108 if (Ptr == Store->getValueOperand()) 5109 return WideningDecision == CM_Scalarize; 5110 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 5111 "Ptr is neither a value nor a pointer operand"); 5112 return WideningDecision != CM_GatherScatter; 5113 }; 5114 5115 // A helper that returns true if the given value is a bitcast or 5116 // getelementptr instruction contained in the loop. 5117 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 5118 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 5119 isa<GetElementPtrInst>(V)) && 5120 !TheLoop->isLoopInvariant(V); 5121 }; 5122 5123 // A helper that evaluates a memory access's use of a pointer. If the use will 5124 // be a scalar use and the pointer is only used by memory accesses, we place 5125 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 5126 // PossibleNonScalarPtrs. 5127 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5128 // We only care about bitcast and getelementptr instructions contained in 5129 // the loop. 5130 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5131 return; 5132 5133 // If the pointer has already been identified as scalar (e.g., if it was 5134 // also identified as uniform), there's nothing to do. 5135 auto *I = cast<Instruction>(Ptr); 5136 if (Worklist.count(I)) 5137 return; 5138 5139 // If the use of the pointer will be a scalar use, and all users of the 5140 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5141 // place the pointer in PossibleNonScalarPtrs. 5142 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5143 return isa<LoadInst>(U) || isa<StoreInst>(U); 5144 })) 5145 ScalarPtrs.insert(I); 5146 else 5147 PossibleNonScalarPtrs.insert(I); 5148 }; 5149 5150 // We seed the scalars analysis with two classes of instructions: (1) 5151 // instructions marked uniform-after-vectorization and (2) bitcast, 5152 // getelementptr and (pointer) phi instructions used by memory accesses 5153 // requiring a scalar use. 5154 // 5155 // (1) Add to the worklist all instructions that have been identified as 5156 // uniform-after-vectorization. 5157 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5158 5159 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5160 // memory accesses requiring a scalar use. The pointer operands of loads and 5161 // stores will be scalar as long as the memory access is not a gather or 5162 // scatter operation. The value operand of a store will remain scalar if the 5163 // store is scalarized. 5164 for (auto *BB : TheLoop->blocks()) 5165 for (auto &I : *BB) { 5166 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5167 evaluatePtrUse(Load, Load->getPointerOperand()); 5168 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5169 evaluatePtrUse(Store, Store->getPointerOperand()); 5170 evaluatePtrUse(Store, Store->getValueOperand()); 5171 } 5172 } 5173 for (auto *I : ScalarPtrs) 5174 if (!PossibleNonScalarPtrs.count(I)) { 5175 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5176 Worklist.insert(I); 5177 } 5178 5179 // Insert the forced scalars.
5180 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5181 // induction variable when the PHI user is scalarized. 5182 auto ForcedScalar = ForcedScalars.find(VF); 5183 if (ForcedScalar != ForcedScalars.end()) 5184 for (auto *I : ForcedScalar->second) 5185 Worklist.insert(I); 5186 5187 // Expand the worklist by looking through any bitcasts and getelementptr 5188 // instructions we've already identified as scalar. This is similar to the 5189 // expansion step in collectLoopUniforms(); however, here we're only 5190 // expanding to include additional bitcasts and getelementptr instructions. 5191 unsigned Idx = 0; 5192 while (Idx != Worklist.size()) { 5193 Instruction *Dst = Worklist[Idx++]; 5194 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5195 continue; 5196 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5197 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5198 auto *J = cast<Instruction>(U); 5199 return !TheLoop->contains(J) || Worklist.count(J) || 5200 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5201 isScalarUse(J, Src)); 5202 })) { 5203 Worklist.insert(Src); 5204 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5205 } 5206 } 5207 5208 // An induction variable will remain scalar if all users of the induction 5209 // variable and induction variable update remain scalar. 5210 for (auto &Induction : Legal->getInductionVars()) { 5211 auto *Ind = Induction.first; 5212 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5213 5214 // If tail-folding is applied, the primary induction variable will be used 5215 // to feed a vector compare. 5216 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5217 continue; 5218 5219 // Returns true if \p Indvar is a pointer induction that is used directly by 5220 // load/store instruction \p I. 5221 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 5222 Instruction *I) { 5223 return Induction.second.getKind() == 5224 InductionDescriptor::IK_PtrInduction && 5225 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 5226 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 5227 }; 5228 5229 // Determine if all users of the induction variable are scalar after 5230 // vectorization. 5231 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5232 auto *I = cast<Instruction>(U); 5233 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5234 IsDirectLoadStoreFromPtrIndvar(Ind, I); 5235 }); 5236 if (!ScalarInd) 5237 continue; 5238 5239 // Determine if all users of the induction variable update instruction are 5240 // scalar after vectorization. 5241 auto ScalarIndUpdate = 5242 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5243 auto *I = cast<Instruction>(U); 5244 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5245 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 5246 }); 5247 if (!ScalarIndUpdate) 5248 continue; 5249 5250 // The induction variable and its update instruction will remain scalar. 
5251 Worklist.insert(Ind); 5252 Worklist.insert(IndUpdate); 5253 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5254 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5255 << "\n"); 5256 } 5257 5258 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5259 } 5260 5261 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { 5262 if (!blockNeedsPredicationForAnyReason(I->getParent())) 5263 return false; 5264 switch(I->getOpcode()) { 5265 default: 5266 break; 5267 case Instruction::Load: 5268 case Instruction::Store: { 5269 if (!Legal->isMaskRequired(I)) 5270 return false; 5271 auto *Ptr = getLoadStorePointerOperand(I); 5272 auto *Ty = getLoadStoreType(I); 5273 const Align Alignment = getLoadStoreAlignment(I); 5274 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5275 TTI.isLegalMaskedGather(Ty, Alignment)) 5276 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5277 TTI.isLegalMaskedScatter(Ty, Alignment)); 5278 } 5279 case Instruction::UDiv: 5280 case Instruction::SDiv: 5281 case Instruction::SRem: 5282 case Instruction::URem: 5283 return mayDivideByZero(*I); 5284 } 5285 return false; 5286 } 5287 5288 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5289 Instruction *I, ElementCount VF) { 5290 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5291 assert(getWideningDecision(I, VF) == CM_Unknown && 5292 "Decision should not be set yet."); 5293 auto *Group = getInterleavedAccessGroup(I); 5294 assert(Group && "Must have a group."); 5295 5296 // If the instruction's allocated size doesn't equal it's type size, it 5297 // requires padding and will be scalarized. 5298 auto &DL = I->getModule()->getDataLayout(); 5299 auto *ScalarTy = getLoadStoreType(I); 5300 if (hasIrregularType(ScalarTy, DL)) 5301 return false; 5302 5303 // Check if masking is required. 5304 // A Group may need masking for one of two reasons: it resides in a block that 5305 // needs predication, or it was decided to use masking to deal with gaps 5306 // (either a gap at the end of a load-access that may result in a speculative 5307 // load, or any gaps in a store-access). 5308 bool PredicatedAccessRequiresMasking = 5309 blockNeedsPredicationForAnyReason(I->getParent()) && 5310 Legal->isMaskRequired(I); 5311 bool LoadAccessWithGapsRequiresEpilogMasking = 5312 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 5313 !isScalarEpilogueAllowed(); 5314 bool StoreAccessWithGapsRequiresMasking = 5315 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 5316 if (!PredicatedAccessRequiresMasking && 5317 !LoadAccessWithGapsRequiresEpilogMasking && 5318 !StoreAccessWithGapsRequiresMasking) 5319 return true; 5320 5321 // If masked interleaving is required, we expect that the user/target had 5322 // enabled it, because otherwise it either wouldn't have been created or 5323 // it should have been invalidated by the CostModel. 5324 assert(useMaskedInterleavedAccesses(TTI) && 5325 "Masked interleave-groups for predicated accesses are not enabled."); 5326 5327 if (Group->isReverse()) 5328 return false; 5329 5330 auto *Ty = getLoadStoreType(I); 5331 const Align Alignment = getLoadStoreAlignment(I); 5332 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5333 : TTI.isLegalMaskedStore(Ty, Alignment); 5334 } 5335 5336 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5337 Instruction *I, ElementCount VF) { 5338 // Get and ensure we have a valid memory instruction. 
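// For instance (illustrative): a load whose address advances by exactly one
// element per iteration is consecutive and can become a single wide load,
// while a predicated store or an access with an irregularly sized (padded)
// element type is rejected below and will be scalarized instead.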
5339 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 5340 5341 auto *Ptr = getLoadStorePointerOperand(I); 5342 auto *ScalarTy = getLoadStoreType(I); 5343 5344 // In order to be widened, the pointer should be consecutive, first of all. 5345 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 5346 return false; 5347 5348 // If the instruction is a store located in a predicated block, it will be 5349 // scalarized. 5350 if (isScalarWithPredication(I)) 5351 return false; 5352 5353 // If the instruction's allocated size doesn't equal it's type size, it 5354 // requires padding and will be scalarized. 5355 auto &DL = I->getModule()->getDataLayout(); 5356 if (hasIrregularType(ScalarTy, DL)) 5357 return false; 5358 5359 return true; 5360 } 5361 5362 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5363 // We should not collect Uniforms more than once per VF. Right now, 5364 // this function is called from collectUniformsAndScalars(), which 5365 // already does this check. Collecting Uniforms for VF=1 does not make any 5366 // sense. 5367 5368 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5369 "This function should not be visited twice for the same VF"); 5370 5371 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5372 // not analyze again. Uniforms.count(VF) will return 1. 5373 Uniforms[VF].clear(); 5374 5375 // We now know that the loop is vectorizable! 5376 // Collect instructions inside the loop that will remain uniform after 5377 // vectorization. 5378 5379 // Global values, params and instructions outside of current loop are out of 5380 // scope. 5381 auto isOutOfScope = [&](Value *V) -> bool { 5382 Instruction *I = dyn_cast<Instruction>(V); 5383 return (!I || !TheLoop->contains(I)); 5384 }; 5385 5386 // Worklist containing uniform instructions demanding lane 0. 5387 SetVector<Instruction *> Worklist; 5388 BasicBlock *Latch = TheLoop->getLoopLatch(); 5389 5390 // Add uniform instructions demanding lane 0 to the worklist. Instructions 5391 // that are scalar with predication must not be considered uniform after 5392 // vectorization, because that would create an erroneous replicating region 5393 // where only a single instance out of VF should be formed. 5394 // TODO: optimize such seldom cases if found important, see PR40816. 5395 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5396 if (isOutOfScope(I)) { 5397 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5398 << *I << "\n"); 5399 return; 5400 } 5401 if (isScalarWithPredication(I)) { 5402 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5403 << *I << "\n"); 5404 return; 5405 } 5406 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5407 Worklist.insert(I); 5408 }; 5409 5410 // Start with the conditional branch. If the branch condition is an 5411 // instruction contained in the loop that is only used by the branch, it is 5412 // uniform. 5413 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5414 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5415 addToWorklistIfAllowed(Cmp); 5416 5417 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5418 InstWidening WideningDecision = getWideningDecision(I, VF); 5419 assert(WideningDecision != CM_Unknown && 5420 "Widening decision should be ready at this moment"); 5421 5422 // A uniform memory op is itself uniform. We exclude uniform stores 5423 // here as they demand the last lane, not the first one. 
5424 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5425 assert(WideningDecision == CM_Scalarize); 5426 return true; 5427 } 5428 5429 return (WideningDecision == CM_Widen || 5430 WideningDecision == CM_Widen_Reverse || 5431 WideningDecision == CM_Interleave); 5432 }; 5433 5434 5435 // Returns true if Ptr is the pointer operand of a memory access instruction 5436 // I, and I is known to not require scalarization. 5437 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5438 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5439 }; 5440 5441 // Holds a list of values which are known to have at least one uniform use. 5442 // Note that there may be other uses which aren't uniform. A "uniform use" 5443 // here is something which only demands lane 0 of the unrolled iterations; 5444 // it does not imply that all lanes produce the same value (e.g. this is not 5445 // the usual meaning of uniform) 5446 SetVector<Value *> HasUniformUse; 5447 5448 // Scan the loop for instructions which are either a) known to have only 5449 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5450 for (auto *BB : TheLoop->blocks()) 5451 for (auto &I : *BB) { 5452 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5453 switch (II->getIntrinsicID()) { 5454 case Intrinsic::sideeffect: 5455 case Intrinsic::experimental_noalias_scope_decl: 5456 case Intrinsic::assume: 5457 case Intrinsic::lifetime_start: 5458 case Intrinsic::lifetime_end: 5459 if (TheLoop->hasLoopInvariantOperands(&I)) 5460 addToWorklistIfAllowed(&I); 5461 break; 5462 default: 5463 break; 5464 } 5465 } 5466 5467 // ExtractValue instructions must be uniform, because the operands are 5468 // known to be loop-invariant. 5469 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5470 assert(isOutOfScope(EVI->getAggregateOperand()) && 5471 "Expected aggregate value to be loop invariant"); 5472 addToWorklistIfAllowed(EVI); 5473 continue; 5474 } 5475 5476 // If there's no pointer operand, there's nothing to do. 5477 auto *Ptr = getLoadStorePointerOperand(&I); 5478 if (!Ptr) 5479 continue; 5480 5481 // A uniform memory op is itself uniform. We exclude uniform stores 5482 // here as they demand the last lane, not the first one. 5483 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5484 addToWorklistIfAllowed(&I); 5485 5486 if (isUniformDecision(&I, VF)) { 5487 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5488 HasUniformUse.insert(Ptr); 5489 } 5490 } 5491 5492 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5493 // demanding) users. Since loops are assumed to be in LCSSA form, this 5494 // disallows uses outside the loop as well. 5495 for (auto *V : HasUniformUse) { 5496 if (isOutOfScope(V)) 5497 continue; 5498 auto *I = cast<Instruction>(V); 5499 auto UsersAreMemAccesses = 5500 llvm::all_of(I->users(), [&](User *U) -> bool { 5501 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5502 }); 5503 if (UsersAreMemAccesses) 5504 addToWorklistIfAllowed(I); 5505 } 5506 5507 // Expand Worklist in topological order: whenever a new instruction 5508 // is added , its users should be already inside Worklist. It ensures 5509 // a uniform instruction will only be used by uniform instructions. 5510 unsigned idx = 0; 5511 while (idx != Worklist.size()) { 5512 Instruction *I = Worklist[idx++]; 5513 5514 for (auto OV : I->operand_values()) { 5515 // isOutOfScope operands cannot be uniform instructions. 
5516 if (isOutOfScope(OV)) 5517 continue; 5518 // First order recurrence Phi's should typically be considered 5519 // non-uniform. 5520 auto *OP = dyn_cast<PHINode>(OV); 5521 if (OP && Legal->isFirstOrderRecurrence(OP)) 5522 continue; 5523 // If all the users of the operand are uniform, then add the 5524 // operand into the uniform worklist. 5525 auto *OI = cast<Instruction>(OV); 5526 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5527 auto *J = cast<Instruction>(U); 5528 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5529 })) 5530 addToWorklistIfAllowed(OI); 5531 } 5532 } 5533 5534 // For an instruction to be added into Worklist above, all its users inside 5535 // the loop should also be in Worklist. However, this condition cannot be 5536 // true for phi nodes that form a cyclic dependence. We must process phi 5537 // nodes separately. An induction variable will remain uniform if all users 5538 // of the induction variable and induction variable update remain uniform. 5539 // The code below handles both pointer and non-pointer induction variables. 5540 for (auto &Induction : Legal->getInductionVars()) { 5541 auto *Ind = Induction.first; 5542 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5543 5544 // Determine if all users of the induction variable are uniform after 5545 // vectorization. 5546 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5547 auto *I = cast<Instruction>(U); 5548 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5549 isVectorizedMemAccessUse(I, Ind); 5550 }); 5551 if (!UniformInd) 5552 continue; 5553 5554 // Determine if all users of the induction variable update instruction are 5555 // uniform after vectorization. 5556 auto UniformIndUpdate = 5557 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5558 auto *I = cast<Instruction>(U); 5559 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5560 isVectorizedMemAccessUse(I, IndUpdate); 5561 }); 5562 if (!UniformIndUpdate) 5563 continue; 5564 5565 // The induction variable and its update instruction will remain uniform. 5566 addToWorklistIfAllowed(Ind); 5567 addToWorklistIfAllowed(IndUpdate); 5568 } 5569 5570 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5571 } 5572 5573 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5574 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5575 5576 if (Legal->getRuntimePointerChecking()->Need) { 5577 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5578 "runtime pointer checks needed. Enable vectorization of this " 5579 "loop with '#pragma clang loop vectorize(enable)' when " 5580 "compiling with -Os/-Oz", 5581 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5582 return true; 5583 } 5584 5585 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5586 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5587 "runtime SCEV checks needed. Enable vectorization of this " 5588 "loop with '#pragma clang loop vectorize(enable)' when " 5589 "compiling with -Os/-Oz", 5590 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5591 return true; 5592 } 5593 5594 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5595 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5596 reportVectorizationFailure("Runtime stride check for small trip count", 5597 "runtime stride == 1 checks needed. 
Enable vectorization of " 5598 "this loop without such check by compiling with -Os/-Oz", 5599 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5600 return true; 5601 } 5602 5603 return false; 5604 } 5605 5606 ElementCount 5607 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5608 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 5609 return ElementCount::getScalable(0); 5610 5611 if (Hints->isScalableVectorizationDisabled()) { 5612 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5613 "ScalableVectorizationDisabled", ORE, TheLoop); 5614 return ElementCount::getScalable(0); 5615 } 5616 5617 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 5618 5619 auto MaxScalableVF = ElementCount::getScalable( 5620 std::numeric_limits<ElementCount::ScalarTy>::max()); 5621 5622 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5623 // FIXME: While for scalable vectors this is currently sufficient, this should 5624 // be replaced by a more detailed mechanism that filters out specific VFs, 5625 // instead of invalidating vectorization for a whole set of VFs based on the 5626 // MaxVF. 5627 5628 // Disable scalable vectorization if the loop contains unsupported reductions. 5629 if (!canVectorizeReductions(MaxScalableVF)) { 5630 reportVectorizationInfo( 5631 "Scalable vectorization not supported for the reduction " 5632 "operations found in this loop.", 5633 "ScalableVFUnfeasible", ORE, TheLoop); 5634 return ElementCount::getScalable(0); 5635 } 5636 5637 // Disable scalable vectorization if the loop contains any instructions 5638 // with element types not supported for scalable vectors. 5639 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5640 return !Ty->isVoidTy() && 5641 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5642 })) { 5643 reportVectorizationInfo("Scalable vectorization is not supported " 5644 "for all element types found in this loop.", 5645 "ScalableVFUnfeasible", ORE, TheLoop); 5646 return ElementCount::getScalable(0); 5647 } 5648 5649 if (Legal->isSafeForAnyVectorWidth()) 5650 return MaxScalableVF; 5651 5652 // Limit MaxScalableVF by the maximum safe dependence distance. 5653 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5654 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5655 unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange) 5656 .getVScaleRangeArgs() 5657 .second; 5658 if (VScaleMax > 0) 5659 MaxVScale = VScaleMax; 5660 } 5661 MaxScalableVF = ElementCount::getScalable( 5662 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5663 if (!MaxScalableVF) 5664 reportVectorizationInfo( 5665 "Max legal vector width too small, scalable vectorization " 5666 "unfeasible.", 5667 "ScalableVFUnfeasible", ORE, TheLoop); 5668 5669 return MaxScalableVF; 5670 } 5671 5672 FixedScalableVFPair 5673 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5674 ElementCount UserVF) { 5675 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5676 unsigned SmallestType, WidestType; 5677 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5678 5679 // Get the maximum safe dependence distance in bits computed by LAA. 5680 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5681 // the memory accesses that is most restrictive (involved in the smallest 5682 // dependence distance). 
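// A rough worked example of the computation below (hypothetical numbers, not
// from any particular target): if LAA reports a maximum safe vector width of
// 512 bits and the widest scalar type in the loop is 32 bits, then
//   MaxSafeElements = PowerOf2Floor(512 / 32) = 16,
// i.e. MaxSafeFixedVF becomes a fixed VF of 16 elements.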
5683 unsigned MaxSafeElements = 5684 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5685 5686 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5687 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5688 5689 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5690 << ".\n"); 5691 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5692 << ".\n"); 5693 5694 // First analyze the UserVF, fall back if the UserVF should be ignored. 5695 if (UserVF) { 5696 auto MaxSafeUserVF = 5697 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5698 5699 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5700 // If `VF=vscale x N` is safe, then so is `VF=N` 5701 if (UserVF.isScalable()) 5702 return FixedScalableVFPair( 5703 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5704 else 5705 return UserVF; 5706 } 5707 5708 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5709 5710 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5711 // is better to ignore the hint and let the compiler choose a suitable VF. 5712 if (!UserVF.isScalable()) { 5713 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5714 << " is unsafe, clamping to max safe VF=" 5715 << MaxSafeFixedVF << ".\n"); 5716 ORE->emit([&]() { 5717 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5718 TheLoop->getStartLoc(), 5719 TheLoop->getHeader()) 5720 << "User-specified vectorization factor " 5721 << ore::NV("UserVectorizationFactor", UserVF) 5722 << " is unsafe, clamping to maximum safe vectorization factor " 5723 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5724 }); 5725 return MaxSafeFixedVF; 5726 } 5727 5728 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5729 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5730 << " is ignored because scalable vectors are not " 5731 "available.\n"); 5732 ORE->emit([&]() { 5733 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5734 TheLoop->getStartLoc(), 5735 TheLoop->getHeader()) 5736 << "User-specified vectorization factor " 5737 << ore::NV("UserVectorizationFactor", UserVF) 5738 << " is ignored because the target does not support scalable " 5739 "vectors. The compiler will pick a more suitable value."; 5740 }); 5741 } else { 5742 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5743 << " is unsafe. Ignoring scalable UserVF.\n"); 5744 ORE->emit([&]() { 5745 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5746 TheLoop->getStartLoc(), 5747 TheLoop->getHeader()) 5748 << "User-specified vectorization factor " 5749 << ore::NV("UserVectorizationFactor", UserVF) 5750 << " is unsafe. 
Ignoring the hint to let the compiler pick a " 5751 "more suitable value."; 5752 }); 5753 } 5754 } 5755 5756 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5757 << " / " << WidestType << " bits.\n"); 5758 5759 FixedScalableVFPair Result(ElementCount::getFixed(1), 5760 ElementCount::getScalable(0)); 5761 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5762 WidestType, MaxSafeFixedVF)) 5763 Result.FixedVF = MaxVF; 5764 5765 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5766 WidestType, MaxSafeScalableVF)) 5767 if (MaxVF.isScalable()) { 5768 Result.ScalableVF = MaxVF; 5769 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5770 << "\n"); 5771 } 5772 5773 return Result; 5774 } 5775 5776 FixedScalableVFPair 5777 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5778 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5779 // TODO: It may by useful to do since it's still likely to be dynamically 5780 // uniform if the target can skip. 5781 reportVectorizationFailure( 5782 "Not inserting runtime ptr check for divergent target", 5783 "runtime pointer checks needed. Not enabled for divergent target", 5784 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5785 return FixedScalableVFPair::getNone(); 5786 } 5787 5788 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5789 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5790 if (TC == 1) { 5791 reportVectorizationFailure("Single iteration (non) loop", 5792 "loop trip count is one, irrelevant for vectorization", 5793 "SingleIterationLoop", ORE, TheLoop); 5794 return FixedScalableVFPair::getNone(); 5795 } 5796 5797 switch (ScalarEpilogueStatus) { 5798 case CM_ScalarEpilogueAllowed: 5799 return computeFeasibleMaxVF(TC, UserVF); 5800 case CM_ScalarEpilogueNotAllowedUsePredicate: 5801 LLVM_FALLTHROUGH; 5802 case CM_ScalarEpilogueNotNeededUsePredicate: 5803 LLVM_DEBUG( 5804 dbgs() << "LV: vector predicate hint/switch found.\n" 5805 << "LV: Not allowing scalar epilogue, creating predicated " 5806 << "vector loop.\n"); 5807 break; 5808 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5809 // fallthrough as a special case of OptForSize 5810 case CM_ScalarEpilogueNotAllowedOptSize: 5811 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5812 LLVM_DEBUG( 5813 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5814 else 5815 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5816 << "count.\n"); 5817 5818 // Bail if runtime checks are required, which are not good when optimising 5819 // for size. 5820 if (runtimeChecksRequired()) 5821 return FixedScalableVFPair::getNone(); 5822 5823 break; 5824 } 5825 5826 // The only loops we can vectorize without a scalar epilogue, are loops with 5827 // a bottom-test and a single exiting block. We'd have to handle the fact 5828 // that not every instruction executes on the last iteration. This will 5829 // require a lane mask which varies through the vector loop body. (TODO) 5830 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5831 // If there was a tail-folding hint/switch, but we can't fold the tail by 5832 // masking, fallback to a vectorization with a scalar epilogue. 
5833 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5834 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5835 "scalar epilogue instead.\n"); 5836 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5837 return computeFeasibleMaxVF(TC, UserVF); 5838 } 5839 return FixedScalableVFPair::getNone(); 5840 } 5841 5842 // Now try the tail folding 5843 5844 // Invalidate interleave groups that require an epilogue if we can't mask 5845 // the interleave-group. 5846 if (!useMaskedInterleavedAccesses(TTI)) { 5847 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5848 "No decisions should have been taken at this point"); 5849 // Note: There is no need to invalidate any cost modeling decisions here, as 5850 // non where taken so far. 5851 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5852 } 5853 5854 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF); 5855 // Avoid tail folding if the trip count is known to be a multiple of any VF 5856 // we chose. 5857 // FIXME: The condition below pessimises the case for fixed-width vectors, 5858 // when scalable VFs are also candidates for vectorization. 5859 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5860 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5861 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5862 "MaxFixedVF must be a power of 2"); 5863 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5864 : MaxFixedVF.getFixedValue(); 5865 ScalarEvolution *SE = PSE.getSE(); 5866 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5867 const SCEV *ExitCount = SE->getAddExpr( 5868 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5869 const SCEV *Rem = SE->getURemExpr( 5870 SE->applyLoopGuards(ExitCount, TheLoop), 5871 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5872 if (Rem->isZero()) { 5873 // Accept MaxFixedVF if we do not have a tail. 5874 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5875 return MaxFactors; 5876 } 5877 } 5878 5879 // For scalable vectors, don't use tail folding as this is currently not yet 5880 // supported. The code is likely to have ended up here if the tripcount is 5881 // low, in which case it makes sense not to use scalable vectors. 5882 if (MaxFactors.ScalableVF.isVector()) 5883 MaxFactors.ScalableVF = ElementCount::getScalable(0); 5884 5885 // If we don't know the precise trip count, or if the trip count that we 5886 // found modulo the vectorization factor is not zero, try to fold the tail 5887 // by masking. 5888 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5889 if (Legal->prepareToFoldTailByMasking()) { 5890 FoldTailByMasking = true; 5891 return MaxFactors; 5892 } 5893 5894 // If there was a tail-folding hint/switch, but we can't fold the tail by 5895 // masking, fallback to a vectorization with a scalar epilogue. 
5896 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5897 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5898 "scalar epilogue instead.\n"); 5899 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5900 return MaxFactors; 5901 } 5902 5903 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5904 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5905 return FixedScalableVFPair::getNone(); 5906 } 5907 5908 if (TC == 0) { 5909 reportVectorizationFailure( 5910 "Unable to calculate the loop count due to complex control flow", 5911 "unable to calculate the loop count due to complex control flow", 5912 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5913 return FixedScalableVFPair::getNone(); 5914 } 5915 5916 reportVectorizationFailure( 5917 "Cannot optimize for size and vectorize at the same time.", 5918 "cannot optimize for size and vectorize at the same time. " 5919 "Enable vectorization of this loop with '#pragma clang loop " 5920 "vectorize(enable)' when compiling with -Os/-Oz", 5921 "NoTailLoopWithOptForSize", ORE, TheLoop); 5922 return FixedScalableVFPair::getNone(); 5923 } 5924 5925 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5926 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5927 const ElementCount &MaxSafeVF) { 5928 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5929 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5930 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5931 : TargetTransformInfo::RGK_FixedWidthVector); 5932 5933 // Convenience function to return the minimum of two ElementCounts. 5934 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5935 assert((LHS.isScalable() == RHS.isScalable()) && 5936 "Scalable flags must match"); 5937 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5938 }; 5939 5940 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5941 // Note that both WidestRegister and WidestType may not be a powers of 2. 5942 auto MaxVectorElementCount = ElementCount::get( 5943 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5944 ComputeScalableMaxVF); 5945 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5946 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5947 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5948 5949 if (!MaxVectorElementCount) { 5950 LLVM_DEBUG(dbgs() << "LV: The target has no " 5951 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5952 << " vector registers.\n"); 5953 return ElementCount::getFixed(1); 5954 } 5955 5956 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5957 if (ConstTripCount && 5958 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5959 isPowerOf2_32(ConstTripCount)) { 5960 // We need to clamp the VF to be the ConstTripCount. There is no point in 5961 // choosing a higher viable VF as done in the loop below. If 5962 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when 5963 // the TC is less than or equal to the known number of lanes. 
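// For example (hypothetical numbers): with ConstTripCount == 8 and a fixed
// MaxVectorElementCount of 16, the VF is clamped to 8, so a single vector
// iteration covers the entire loop and no scalar tail remains.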
5964 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5965 << ConstTripCount << "\n"); 5966 return TripCountEC; 5967 } 5968 5969 ElementCount MaxVF = MaxVectorElementCount; 5970 if (TTI.shouldMaximizeVectorBandwidth() || 5971 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5972 auto MaxVectorElementCountMaxBW = ElementCount::get( 5973 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5974 ComputeScalableMaxVF); 5975 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5976 5977 // Collect all viable vectorization factors larger than the default MaxVF 5978 // (i.e. MaxVectorElementCount). 5979 SmallVector<ElementCount, 8> VFs; 5980 for (ElementCount VS = MaxVectorElementCount * 2; 5981 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5982 VFs.push_back(VS); 5983 5984 // For each VF calculate its register usage. 5985 auto RUs = calculateRegisterUsage(VFs); 5986 5987 // Select the largest VF which doesn't require more registers than existing 5988 // ones. 5989 for (int i = RUs.size() - 1; i >= 0; --i) { 5990 bool Selected = true; 5991 for (auto &pair : RUs[i].MaxLocalUsers) { 5992 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5993 if (pair.second > TargetNumRegisters) 5994 Selected = false; 5995 } 5996 if (Selected) { 5997 MaxVF = VFs[i]; 5998 break; 5999 } 6000 } 6001 if (ElementCount MinVF = 6002 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 6003 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 6004 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 6005 << ") with target's minimum: " << MinVF << '\n'); 6006 MaxVF = MinVF; 6007 } 6008 } 6009 } 6010 return MaxVF; 6011 } 6012 6013 bool LoopVectorizationCostModel::isMoreProfitable( 6014 const VectorizationFactor &A, const VectorizationFactor &B) const { 6015 InstructionCost CostA = A.Cost; 6016 InstructionCost CostB = B.Cost; 6017 6018 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 6019 6020 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 6021 MaxTripCount) { 6022 // If we are folding the tail and the trip count is a known (possibly small) 6023 // constant, the trip count will be rounded up to an integer number of 6024 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 6025 // which we compare directly. When not folding the tail, the total cost will 6026 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 6027 // approximated with the per-lane cost below instead of using the tripcount 6028 // as here. 6029 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 6030 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 6031 return RTCostA < RTCostB; 6032 } 6033 6034 // Improve estimate for the vector width if it is scalable. 6035 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 6036 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 6037 if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) { 6038 if (A.Width.isScalable()) 6039 EstimatedWidthA *= VScale.getValue(); 6040 if (B.Width.isScalable()) 6041 EstimatedWidthB *= VScale.getValue(); 6042 } 6043 6044 // When set to preferred, for now assume vscale may be larger than 1 (or the 6045 // one being tuned for), so that scalable vectorization is slightly favorable 6046 // over fixed-width vectorization. 
6047 if (Hints->isScalableVectorizationPreferred()) 6048 if (A.Width.isScalable() && !B.Width.isScalable()) 6049 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 6050 6051 // To avoid the need for FP division: 6052 // (CostA / A.Width) < (CostB / B.Width) 6053 // <=> (CostA * B.Width) < (CostB * A.Width) 6054 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 6055 } 6056 6057 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 6058 const ElementCountSet &VFCandidates) { 6059 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 6060 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 6061 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 6062 assert(VFCandidates.count(ElementCount::getFixed(1)) && 6063 "Expected Scalar VF to be a candidate"); 6064 6065 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 6066 VectorizationFactor ChosenFactor = ScalarCost; 6067 6068 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 6069 if (ForceVectorization && VFCandidates.size() > 1) { 6070 // Ignore scalar width, because the user explicitly wants vectorization. 6071 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 6072 // evaluation. 6073 ChosenFactor.Cost = InstructionCost::getMax(); 6074 } 6075 6076 SmallVector<InstructionVFPair> InvalidCosts; 6077 for (const auto &i : VFCandidates) { 6078 // The cost for scalar VF=1 is already calculated, so ignore it. 6079 if (i.isScalar()) 6080 continue; 6081 6082 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 6083 VectorizationFactor Candidate(i, C.first); 6084 6085 #ifndef NDEBUG 6086 unsigned AssumedMinimumVscale = 1; 6087 if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) 6088 AssumedMinimumVscale = VScale.getValue(); 6089 unsigned Width = 6090 Candidate.Width.isScalable() 6091 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 6092 : Candidate.Width.getFixedValue(); 6093 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 6094 << " costs: " << (Candidate.Cost / Width)); 6095 if (i.isScalable()) 6096 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 6097 << AssumedMinimumVscale << ")"); 6098 LLVM_DEBUG(dbgs() << ".\n"); 6099 #endif 6100 6101 if (!C.second && !ForceVectorization) { 6102 LLVM_DEBUG( 6103 dbgs() << "LV: Not considering vector loop of width " << i 6104 << " because it will not generate any vector instructions.\n"); 6105 continue; 6106 } 6107 6108 // If profitable add it to ProfitableVF list. 6109 if (isMoreProfitable(Candidate, ScalarCost)) 6110 ProfitableVFs.push_back(Candidate); 6111 6112 if (isMoreProfitable(Candidate, ChosenFactor)) 6113 ChosenFactor = Candidate; 6114 } 6115 6116 // Emit a report of VFs with invalid costs in the loop. 6117 if (!InvalidCosts.empty()) { 6118 // Group the remarks per instruction, keeping the instruction order from 6119 // InvalidCosts. 6120 std::map<Instruction *, unsigned> Numbering; 6121 unsigned I = 0; 6122 for (auto &Pair : InvalidCosts) 6123 if (!Numbering.count(Pair.first)) 6124 Numbering[Pair.first] = I++; 6125 6126 // Sort the list, first on instruction(number) then on VF. 
6127 llvm::sort(InvalidCosts, 6128 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 6129 if (Numbering[A.first] != Numbering[B.first]) 6130 return Numbering[A.first] < Numbering[B.first]; 6131 ElementCountComparator ECC; 6132 return ECC(A.second, B.second); 6133 }); 6134 6135 // For a list of ordered instruction-vf pairs: 6136 // [(load, vf1), (load, vf2), (store, vf1)] 6137 // Group the instructions together to emit separate remarks for: 6138 // load (vf1, vf2) 6139 // store (vf1) 6140 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 6141 auto Subset = ArrayRef<InstructionVFPair>(); 6142 do { 6143 if (Subset.empty()) 6144 Subset = Tail.take_front(1); 6145 6146 Instruction *I = Subset.front().first; 6147 6148 // If the next instruction is different, or if there are no other pairs, 6149 // emit a remark for the collated subset. e.g. 6150 // [(load, vf1), (load, vf2))] 6151 // to emit: 6152 // remark: invalid costs for 'load' at VF=(vf, vf2) 6153 if (Subset == Tail || Tail[Subset.size()].first != I) { 6154 std::string OutString; 6155 raw_string_ostream OS(OutString); 6156 assert(!Subset.empty() && "Unexpected empty range"); 6157 OS << "Instruction with invalid costs prevented vectorization at VF=("; 6158 for (auto &Pair : Subset) 6159 OS << (Pair.second == Subset.front().second ? "" : ", ") 6160 << Pair.second; 6161 OS << "):"; 6162 if (auto *CI = dyn_cast<CallInst>(I)) 6163 OS << " call to " << CI->getCalledFunction()->getName(); 6164 else 6165 OS << " " << I->getOpcodeName(); 6166 OS.flush(); 6167 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 6168 Tail = Tail.drop_front(Subset.size()); 6169 Subset = {}; 6170 } else 6171 // Grow the subset by one element 6172 Subset = Tail.take_front(Subset.size() + 1); 6173 } while (!Tail.empty()); 6174 } 6175 6176 if (!EnableCondStoresVectorization && NumPredStores) { 6177 reportVectorizationFailure("There are conditional stores.", 6178 "store that is conditionally executed prevents vectorization", 6179 "ConditionalStore", ORE, TheLoop); 6180 ChosenFactor = ScalarCost; 6181 } 6182 6183 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 6184 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 6185 << "LV: Vectorization seems to be not beneficial, " 6186 << "but was forced by a user.\n"); 6187 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 6188 return ChosenFactor; 6189 } 6190 6191 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 6192 const Loop &L, ElementCount VF) const { 6193 // Cross iteration phis such as reductions need special handling and are 6194 // currently unsupported. 6195 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 6196 return Legal->isFirstOrderRecurrence(&Phi) || 6197 Legal->isReductionVariable(&Phi); 6198 })) 6199 return false; 6200 6201 // Phis with uses outside of the loop require special handling and are 6202 // currently unsupported. 6203 for (auto &Entry : Legal->getInductionVars()) { 6204 // Look for uses of the value of the induction at the last iteration. 6205 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 6206 for (User *U : PostInc->users()) 6207 if (!L.contains(cast<Instruction>(U))) 6208 return false; 6209 // Look for uses of penultimate value of the induction. 6210 for (User *U : Entry.first->users()) 6211 if (!L.contains(cast<Instruction>(U))) 6212 return false; 6213 } 6214 6215 // Induction variables that are widened require special handling that is 6216 // currently not supported. 
6217 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 6218 return !(this->isScalarAfterVectorization(Entry.first, VF) || 6219 this->isProfitableToScalarize(Entry.first, VF)); 6220 })) 6221 return false; 6222 6223 // Epilogue vectorization code has not been auditted to ensure it handles 6224 // non-latch exits properly. It may be fine, but it needs auditted and 6225 // tested. 6226 if (L.getExitingBlock() != L.getLoopLatch()) 6227 return false; 6228 6229 return true; 6230 } 6231 6232 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 6233 const ElementCount VF) const { 6234 // FIXME: We need a much better cost-model to take different parameters such 6235 // as register pressure, code size increase and cost of extra branches into 6236 // account. For now we apply a very crude heuristic and only consider loops 6237 // with vectorization factors larger than a certain value. 6238 // We also consider epilogue vectorization unprofitable for targets that don't 6239 // consider interleaving beneficial (eg. MVE). 6240 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 6241 return false; 6242 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 6243 return true; 6244 return false; 6245 } 6246 6247 VectorizationFactor 6248 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 6249 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 6250 VectorizationFactor Result = VectorizationFactor::Disabled(); 6251 if (!EnableEpilogueVectorization) { 6252 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 6253 return Result; 6254 } 6255 6256 if (!isScalarEpilogueAllowed()) { 6257 LLVM_DEBUG( 6258 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 6259 "allowed.\n";); 6260 return Result; 6261 } 6262 6263 // Not really a cost consideration, but check for unsupported cases here to 6264 // simplify the logic. 6265 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 6266 LLVM_DEBUG( 6267 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 6268 "not a supported candidate.\n";); 6269 return Result; 6270 } 6271 6272 if (EpilogueVectorizationForceVF > 1) { 6273 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 6274 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 6275 if (LVP.hasPlanWithVF(ForcedEC)) 6276 return {ForcedEC, 0}; 6277 else { 6278 LLVM_DEBUG( 6279 dbgs() 6280 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 6281 return Result; 6282 } 6283 } 6284 6285 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6286 TheLoop->getHeader()->getParent()->hasMinSize()) { 6287 LLVM_DEBUG( 6288 dbgs() 6289 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6290 return Result; 6291 } 6292 6293 auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 6294 if (MainLoopVF.isScalable()) 6295 LLVM_DEBUG( 6296 dbgs() << "LEV: Epilogue vectorization using scalable vectors not " 6297 "yet supported. 
Converting to fixed-width (VF=" 6298 << FixedMainLoopVF << ") instead\n"); 6299 6300 if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) { 6301 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 6302 "this loop\n"); 6303 return Result; 6304 } 6305 6306 for (auto &NextVF : ProfitableVFs) 6307 if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) && 6308 (Result.Width.getFixedValue() == 1 || 6309 isMoreProfitable(NextVF, Result)) && 6310 LVP.hasPlanWithVF(NextVF.Width)) 6311 Result = NextVF; 6312 6313 if (Result != VectorizationFactor::Disabled()) 6314 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6315 << Result.Width.getFixedValue() << "\n";); 6316 return Result; 6317 } 6318 6319 std::pair<unsigned, unsigned> 6320 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6321 unsigned MinWidth = -1U; 6322 unsigned MaxWidth = 8; 6323 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6324 for (Type *T : ElementTypesInLoop) { 6325 MinWidth = std::min<unsigned>( 6326 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 6327 MaxWidth = std::max<unsigned>( 6328 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 6329 } 6330 return {MinWidth, MaxWidth}; 6331 } 6332 6333 void LoopVectorizationCostModel::collectElementTypesForWidening() { 6334 ElementTypesInLoop.clear(); 6335 // For each block. 6336 for (BasicBlock *BB : TheLoop->blocks()) { 6337 // For each instruction in the loop. 6338 for (Instruction &I : BB->instructionsWithoutDebug()) { 6339 Type *T = I.getType(); 6340 6341 // Skip ignored values. 6342 if (ValuesToIgnore.count(&I)) 6343 continue; 6344 6345 // Only examine Loads, Stores and PHINodes. 6346 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6347 continue; 6348 6349 // Examine PHI nodes that are reduction variables. Update the type to 6350 // account for the recurrence type. 6351 if (auto *PN = dyn_cast<PHINode>(&I)) { 6352 if (!Legal->isReductionVariable(PN)) 6353 continue; 6354 const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN]; 6355 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 6356 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6357 RdxDesc.getRecurrenceType(), 6358 TargetTransformInfo::ReductionFlags())) 6359 continue; 6360 T = RdxDesc.getRecurrenceType(); 6361 } 6362 6363 // Examine the stored values. 6364 if (auto *ST = dyn_cast<StoreInst>(&I)) 6365 T = ST->getValueOperand()->getType(); 6366 6367 // Ignore loaded pointer types and stored pointer types that are not 6368 // vectorizable. 6369 // 6370 // FIXME: The check here attempts to predict whether a load or store will 6371 // be vectorized. We only know this for certain after a VF has 6372 // been selected. Here, we assume that if an access can be 6373 // vectorized, it will be. We should also look at extending this 6374 // optimization to non-pointer types. 6375 // 6376 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6377 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6378 continue; 6379 6380 ElementTypesInLoop.insert(T); 6381 } 6382 } 6383 } 6384 6385 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6386 unsigned LoopCost) { 6387 // -- The interleave heuristics -- 6388 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6389 // There are many micro-architectural considerations that we can't predict 6390 // at this level. 
For example, frontend pressure (on decode or fetch) due to 6391 // code size, or the number and capabilities of the execution ports. 6392 // 6393 // We use the following heuristics to select the interleave count: 6394 // 1. If the code has reductions, then we interleave to break the cross 6395 // iteration dependency. 6396 // 2. If the loop is really small, then we interleave to reduce the loop 6397 // overhead. 6398 // 3. We don't interleave if we think that we will spill registers to memory 6399 // due to the increased register pressure. 6400 6401 if (!isScalarEpilogueAllowed()) 6402 return 1; 6403 6404 // We used the distance for the interleave count. 6405 if (Legal->getMaxSafeDepDistBytes() != -1U) 6406 return 1; 6407 6408 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6409 const bool HasReductions = !Legal->getReductionVars().empty(); 6410 // Do not interleave loops with a relatively small known or estimated trip 6411 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6412 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6413 // because with the above conditions interleaving can expose ILP and break 6414 // cross iteration dependences for reductions. 6415 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6416 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6417 return 1; 6418 6419 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6420 // We divide by these constants so assume that we have at least one 6421 // instruction that uses at least one register. 6422 for (auto& pair : R.MaxLocalUsers) { 6423 pair.second = std::max(pair.second, 1U); 6424 } 6425 6426 // We calculate the interleave count using the following formula. 6427 // Subtract the number of loop invariants from the number of available 6428 // registers. These registers are used by all of the interleaved instances. 6429 // Next, divide the remaining registers by the number of registers that is 6430 // required by the loop, in order to estimate how many parallel instances 6431 // fit without causing spills. All of this is rounded down if necessary to be 6432 // a power of two. We want power of two interleave count to simplify any 6433 // addressing operations or alignment considerations. 6434 // We also want power of two interleave counts to ensure that the induction 6435 // variable of the vector loop wraps to zero, when tail is folded by masking; 6436 // this currently happens when OptForSize, in which case IC is set to 1 above. 6437 unsigned IC = UINT_MAX; 6438 6439 for (auto& pair : R.MaxLocalUsers) { 6440 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6441 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6442 << " registers of " 6443 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6444 if (VF.isScalar()) { 6445 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6446 TargetNumRegisters = ForceTargetNumScalarRegs; 6447 } else { 6448 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6449 TargetNumRegisters = ForceTargetNumVectorRegs; 6450 } 6451 unsigned MaxLocalUsers = pair.second; 6452 unsigned LoopInvariantRegs = 0; 6453 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6454 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6455 6456 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6457 // Don't count the induction variable as interleaved. 
6458 if (EnableIndVarRegisterHeur) { 6459 TmpIC = 6460 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6461 std::max(1U, (MaxLocalUsers - 1))); 6462 } 6463 6464 IC = std::min(IC, TmpIC); 6465 } 6466 6467 // Clamp the interleave ranges to reasonable counts. 6468 unsigned MaxInterleaveCount = 6469 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6470 6471 // Check if the user has overridden the max. 6472 if (VF.isScalar()) { 6473 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6474 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6475 } else { 6476 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6477 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6478 } 6479 6480 // If trip count is known or estimated compile time constant, limit the 6481 // interleave count to be less than the trip count divided by VF, provided it 6482 // is at least 1. 6483 // 6484 // For scalable vectors we can't know if interleaving is beneficial. It may 6485 // not be beneficial for small loops if none of the lanes in the second vector 6486 // iterations is enabled. However, for larger loops, there is likely to be a 6487 // similar benefit as for fixed-width vectors. For now, we choose to leave 6488 // the InterleaveCount as if vscale is '1', although if some information about 6489 // the vector is known (e.g. min vector size), we can make a better decision. 6490 if (BestKnownTC) { 6491 MaxInterleaveCount = 6492 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6493 // Make sure MaxInterleaveCount is greater than 0. 6494 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6495 } 6496 6497 assert(MaxInterleaveCount > 0 && 6498 "Maximum interleave count must be greater than 0"); 6499 6500 // Clamp the calculated IC to be between the 1 and the max interleave count 6501 // that the target and trip count allows. 6502 if (IC > MaxInterleaveCount) 6503 IC = MaxInterleaveCount; 6504 else 6505 // Make sure IC is greater than 0. 6506 IC = std::max(1u, IC); 6507 6508 assert(IC > 0 && "Interleave count must be greater than 0."); 6509 6510 // If we did not calculate the cost for VF (because the user selected the VF) 6511 // then we calculate the cost of VF here. 6512 if (LoopCost == 0) { 6513 InstructionCost C = expectedCost(VF).first; 6514 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 6515 LoopCost = *C.getValue(); 6516 } 6517 6518 assert(LoopCost && "Non-zero loop cost expected"); 6519 6520 // Interleave if we vectorized this loop and there is a reduction that could 6521 // benefit from interleaving. 6522 if (VF.isVector() && HasReductions) { 6523 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6524 return IC; 6525 } 6526 6527 // Note that if we've already vectorized the loop we will have done the 6528 // runtime check and so interleaving won't require further checks. 6529 bool InterleavingRequiresRuntimePointerCheck = 6530 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6531 6532 // We want to interleave small loops in order to reduce the loop overhead and 6533 // potentially expose ILP opportunities. 
6534 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6535 << "LV: IC is " << IC << '\n' 6536 << "LV: VF is " << VF << '\n'); 6537 const bool AggressivelyInterleaveReductions = 6538 TTI.enableAggressiveInterleaving(HasReductions); 6539 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6540 // We assume that the cost overhead is 1 and we use the cost model 6541 // to estimate the cost of the loop and interleave until the cost of the 6542 // loop overhead is about 5% of the cost of the loop. 6543 unsigned SmallIC = 6544 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6545 6546 // Interleave until store/load ports (estimated by max interleave count) are 6547 // saturated. 6548 unsigned NumStores = Legal->getNumStores(); 6549 unsigned NumLoads = Legal->getNumLoads(); 6550 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6551 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6552 6553 // There is little point in interleaving for reductions containing selects 6554 // and compares when VF=1 since it may just create more overhead than it's 6555 // worth for loops with small trip counts. This is because we still have to 6556 // do the final reduction after the loop. 6557 bool HasSelectCmpReductions = 6558 HasReductions && 6559 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6560 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6561 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 6562 RdxDesc.getRecurrenceKind()); 6563 }); 6564 if (HasSelectCmpReductions) { 6565 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 6566 return 1; 6567 } 6568 6569 // If we have a scalar reduction (vector reductions are already dealt with 6570 // by this point), we can increase the critical path length if the loop 6571 // we're interleaving is inside another loop. For tree-wise reductions 6572 // set the limit to 2, and for ordered reductions it's best to disable 6573 // interleaving entirely. 6574 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6575 bool HasOrderedReductions = 6576 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6577 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6578 return RdxDesc.isOrdered(); 6579 }); 6580 if (HasOrderedReductions) { 6581 LLVM_DEBUG( 6582 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 6583 return 1; 6584 } 6585 6586 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6587 SmallIC = std::min(SmallIC, F); 6588 StoresIC = std::min(StoresIC, F); 6589 LoadsIC = std::min(LoadsIC, F); 6590 } 6591 6592 if (EnableLoadStoreRuntimeInterleave && 6593 std::max(StoresIC, LoadsIC) > SmallIC) { 6594 LLVM_DEBUG( 6595 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6596 return std::max(StoresIC, LoadsIC); 6597 } 6598 6599 // If there are scalar reductions and TTI has enabled aggressive 6600 // interleaving for reductions, we will interleave to expose ILP. 6601 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6602 AggressivelyInterleaveReductions) { 6603 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6604 // Interleave no less than SmallIC but not as aggressive as the normal IC 6605 // to satisfy the rare situation when resources are too limited. 
6606 return std::max(IC / 2, SmallIC); 6607 } else { 6608 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6609 return SmallIC; 6610 } 6611 } 6612 6613 // Interleave if this is a large loop (small loops are already dealt with by 6614 // this point) that could benefit from interleaving. 6615 if (AggressivelyInterleaveReductions) { 6616 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6617 return IC; 6618 } 6619 6620 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6621 return 1; 6622 } 6623 6624 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6625 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6626 // This function calculates the register usage by measuring the highest number 6627 // of values that are alive at a single location. Obviously, this is a very 6628 // rough estimation. We scan the loop in a topological order in order and 6629 // assign a number to each instruction. We use RPO to ensure that defs are 6630 // met before their users. We assume that each instruction that has in-loop 6631 // users starts an interval. We record every time that an in-loop value is 6632 // used, so we have a list of the first and last occurrences of each 6633 // instruction. Next, we transpose this data structure into a multi map that 6634 // holds the list of intervals that *end* at a specific location. This multi 6635 // map allows us to perform a linear search. We scan the instructions linearly 6636 // and record each time that a new interval starts, by placing it in a set. 6637 // If we find this value in the multi-map then we remove it from the set. 6638 // The max register usage is the maximum size of the set. 6639 // We also search for instructions that are defined outside the loop, but are 6640 // used inside the loop. We need this number separately from the max-interval 6641 // usage number because when we unroll, loop-invariant values do not take 6642 // more register. 6643 LoopBlocksDFS DFS(TheLoop); 6644 DFS.perform(LI); 6645 6646 RegisterUsage RU; 6647 6648 // Each 'key' in the map opens a new interval. The values 6649 // of the map are the index of the 'last seen' usage of the 6650 // instruction that is the key. 6651 using IntervalMap = DenseMap<Instruction *, unsigned>; 6652 6653 // Maps instruction to its index. 6654 SmallVector<Instruction *, 64> IdxToInstr; 6655 // Marks the end of each interval. 6656 IntervalMap EndPoint; 6657 // Saves the list of instruction indices that are used in the loop. 6658 SmallPtrSet<Instruction *, 8> Ends; 6659 // Saves the list of values that are used in the loop but are 6660 // defined outside the loop, such as arguments and constants. 6661 SmallPtrSet<Value *, 8> LoopInvariants; 6662 6663 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6664 for (Instruction &I : BB->instructionsWithoutDebug()) { 6665 IdxToInstr.push_back(&I); 6666 6667 // Save the end location of each USE. 6668 for (Value *U : I.operands()) { 6669 auto *Instr = dyn_cast<Instruction>(U); 6670 6671 // Ignore non-instruction values such as arguments, constants, etc. 6672 if (!Instr) 6673 continue; 6674 6675 // If this instruction is outside the loop then record it and continue. 6676 if (!TheLoop->contains(Instr)) { 6677 LoopInvariants.insert(Instr); 6678 continue; 6679 } 6680 6681 // Overwrite previous end points. 6682 EndPoint[Instr] = IdxToInstr.size(); 6683 Ends.insert(Instr); 6684 } 6685 } 6686 } 6687 6688 // Saves the list of intervals that end with the index in 'key'. 
6689 using InstrList = SmallVector<Instruction *, 2>; 6690 DenseMap<unsigned, InstrList> TransposeEnds; 6691 6692 // Transpose the EndPoints to a list of values that end at each index. 6693 for (auto &Interval : EndPoint) 6694 TransposeEnds[Interval.second].push_back(Interval.first); 6695 6696 SmallPtrSet<Instruction *, 8> OpenIntervals; 6697 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6698 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6699 6700 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6701 6702 // A lambda that gets the register usage for the given type and VF. 6703 const auto &TTICapture = TTI; 6704 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6705 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6706 return 0; 6707 InstructionCost::CostType RegUsage = 6708 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6709 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6710 "Nonsensical values for register usage."); 6711 return RegUsage; 6712 }; 6713 6714 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6715 Instruction *I = IdxToInstr[i]; 6716 6717 // Remove all of the instructions that end at this location. 6718 InstrList &List = TransposeEnds[i]; 6719 for (Instruction *ToRemove : List) 6720 OpenIntervals.erase(ToRemove); 6721 6722 // Ignore instructions that are never used within the loop. 6723 if (!Ends.count(I)) 6724 continue; 6725 6726 // Skip ignored values. 6727 if (ValuesToIgnore.count(I)) 6728 continue; 6729 6730 // For each VF find the maximum usage of registers. 6731 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6732 // Count the number of live intervals. 6733 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6734 6735 if (VFs[j].isScalar()) { 6736 for (auto Inst : OpenIntervals) { 6737 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6738 if (RegUsage.find(ClassID) == RegUsage.end()) 6739 RegUsage[ClassID] = 1; 6740 else 6741 RegUsage[ClassID] += 1; 6742 } 6743 } else { 6744 collectUniformsAndScalars(VFs[j]); 6745 for (auto Inst : OpenIntervals) { 6746 // Skip ignored values for VF > 1. 6747 if (VecValuesToIgnore.count(Inst)) 6748 continue; 6749 if (isScalarAfterVectorization(Inst, VFs[j])) { 6750 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6751 if (RegUsage.find(ClassID) == RegUsage.end()) 6752 RegUsage[ClassID] = 1; 6753 else 6754 RegUsage[ClassID] += 1; 6755 } else { 6756 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6757 if (RegUsage.find(ClassID) == RegUsage.end()) 6758 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6759 else 6760 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6761 } 6762 } 6763 } 6764 6765 for (auto& pair : RegUsage) { 6766 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6767 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6768 else 6769 MaxUsages[j][pair.first] = pair.second; 6770 } 6771 } 6772 6773 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6774 << OpenIntervals.size() << '\n'); 6775 6776 // Add the current instruction to the list of open intervals. 6777 OpenIntervals.insert(I); 6778 } 6779 6780 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6781 SmallMapVector<unsigned, unsigned, 4> Invariant; 6782 6783 for (auto Inst : LoopInvariants) { 6784 unsigned Usage = 6785 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6786 unsigned ClassID = 6787 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6788 if (Invariant.find(ClassID) == Invariant.end()) 6789 Invariant[ClassID] = Usage; 6790 else 6791 Invariant[ClassID] += Usage; 6792 } 6793 6794 LLVM_DEBUG({ 6795 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6796 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6797 << " item\n"; 6798 for (const auto &pair : MaxUsages[i]) { 6799 dbgs() << "LV(REG): RegisterClass: " 6800 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6801 << " registers\n"; 6802 } 6803 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6804 << " item\n"; 6805 for (const auto &pair : Invariant) { 6806 dbgs() << "LV(REG): RegisterClass: " 6807 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6808 << " registers\n"; 6809 } 6810 }); 6811 6812 RU.LoopInvariantRegs = Invariant; 6813 RU.MaxLocalUsers = MaxUsages[i]; 6814 RUs[i] = RU; 6815 } 6816 6817 return RUs; 6818 } 6819 6820 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6821 // TODO: Cost model for emulated masked load/store is completely 6822 // broken. This hack guides the cost model to use an artificially 6823 // high enough value to practically disable vectorization with such 6824 // operations, except where previously deployed legality hack allowed 6825 // using very low cost values. This is to avoid regressions coming simply 6826 // from moving "masked load/store" check from legality to cost model. 6827 // Masked Load/Gather emulation was previously never allowed. 6828 // Limited number of Masked Store/Scatter emulation was allowed. 6829 assert(isPredicatedInst(I) && 6830 "Expecting a scalar emulated instruction"); 6831 return isa<LoadInst>(I) || 6832 (isa<StoreInst>(I) && 6833 NumPredStores > NumberOfStoresToPredicate); 6834 } 6835 6836 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6837 // If we aren't vectorizing the loop, or if we've already collected the 6838 // instructions to scalarize, there's nothing to do. Collection may already 6839 // have occurred if we have a user-selected VF and are now computing the 6840 // expected cost for interleaving. 6841 if (VF.isScalar() || VF.isZero() || 6842 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6843 return; 6844 6845 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6846 // not profitable to scalarize any instructions, the presence of VF in the 6847 // map will indicate that we've analyzed it already. 6848 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6849 6850 // Find all the instructions that are scalar with predication in the loop and 6851 // determine if it would be better to not if-convert the blocks they are in. 6852 // If so, we also record the instructions to scalarize. 6853 for (BasicBlock *BB : TheLoop->blocks()) { 6854 if (!blockNeedsPredicationForAnyReason(BB)) 6855 continue; 6856 for (Instruction &I : *BB) 6857 if (isScalarWithPredication(&I)) { 6858 ScalarCostsTy ScalarCosts; 6859 // Do not apply discount if scalable, because that would lead to 6860 // invalid scalarization costs. 6861 // Do not apply discount logic if hacked cost is needed 6862 // for emulated masked memrefs. 6863 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) && 6864 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6865 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6866 // Remember that BB will remain after vectorization. 
6867 PredicatedBBsAfterVectorization.insert(BB); 6868 } 6869 } 6870 } 6871 6872 int LoopVectorizationCostModel::computePredInstDiscount( 6873 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6874 assert(!isUniformAfterVectorization(PredInst, VF) && 6875 "Instruction marked uniform-after-vectorization will be predicated"); 6876 6877 // Initialize the discount to zero, meaning that the scalar version and the 6878 // vector version cost the same. 6879 InstructionCost Discount = 0; 6880 6881 // Holds instructions to analyze. The instructions we visit are mapped in 6882 // ScalarCosts. Those instructions are the ones that would be scalarized if 6883 // we find that the scalar version costs less. 6884 SmallVector<Instruction *, 8> Worklist; 6885 6886 // Returns true if the given instruction can be scalarized. 6887 auto canBeScalarized = [&](Instruction *I) -> bool { 6888 // We only attempt to scalarize instructions forming a single-use chain 6889 // from the original predicated block that would otherwise be vectorized. 6890 // Although not strictly necessary, we give up on instructions we know will 6891 // already be scalar to avoid traversing chains that are unlikely to be 6892 // beneficial. 6893 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6894 isScalarAfterVectorization(I, VF)) 6895 return false; 6896 6897 // If the instruction is scalar with predication, it will be analyzed 6898 // separately. We ignore it within the context of PredInst. 6899 if (isScalarWithPredication(I)) 6900 return false; 6901 6902 // If any of the instruction's operands are uniform after vectorization, 6903 // the instruction cannot be scalarized. This prevents, for example, a 6904 // masked load from being scalarized. 6905 // 6906 // We assume we will only emit a value for lane zero of an instruction 6907 // marked uniform after vectorization, rather than VF identical values. 6908 // Thus, if we scalarize an instruction that uses a uniform, we would 6909 // create uses of values corresponding to the lanes we aren't emitting code 6910 // for. This behavior can be changed by allowing getScalarValue to clone 6911 // the lane zero values for uniforms rather than asserting. 6912 for (Use &U : I->operands()) 6913 if (auto *J = dyn_cast<Instruction>(U.get())) 6914 if (isUniformAfterVectorization(J, VF)) 6915 return false; 6916 6917 // Otherwise, we can scalarize the instruction. 6918 return true; 6919 }; 6920 6921 // Compute the expected cost discount from scalarizing the entire expression 6922 // feeding the predicated instruction. We currently only consider expressions 6923 // that are single-use instruction chains. 6924 Worklist.push_back(PredInst); 6925 while (!Worklist.empty()) { 6926 Instruction *I = Worklist.pop_back_val(); 6927 6928 // If we've already analyzed the instruction, there's nothing to do. 6929 if (ScalarCosts.find(I) != ScalarCosts.end()) 6930 continue; 6931 6932 // Compute the cost of the vector instruction. Note that this cost already 6933 // includes the scalarization overhead of the predicated instruction. 6934 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6935 6936 // Compute the cost of the scalarized instruction. This cost is the cost of 6937 // the instruction as if it wasn't if-converted and instead remained in the 6938 // predicated block. We will scale this cost by block probability after 6939 // computing the scalarization overhead. 
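// For illustration only (hypothetical costs): with VF = 4 and a per-lane
// scalar cost of 1, ScalarCost starts at 4 below; the insertelement/phi and
// extractelement overheads are then added before ScalarCost is divided by
// the block probability further down.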
6940 InstructionCost ScalarCost = 6941 VF.getFixedValue() * 6942 getInstructionCost(I, ElementCount::getFixed(1)).first; 6943 6944 // Compute the scalarization overhead of needed insertelement instructions 6945 // and phi nodes. 6946 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6947 ScalarCost += TTI.getScalarizationOverhead( 6948 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6949 APInt::getAllOnes(VF.getFixedValue()), true, false); 6950 ScalarCost += 6951 VF.getFixedValue() * 6952 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6953 } 6954 6955 // Compute the scalarization overhead of needed extractelement 6956 // instructions. For each of the instruction's operands, if the operand can 6957 // be scalarized, add it to the worklist; otherwise, account for the 6958 // overhead. 6959 for (Use &U : I->operands()) 6960 if (auto *J = dyn_cast<Instruction>(U.get())) { 6961 assert(VectorType::isValidElementType(J->getType()) && 6962 "Instruction has non-scalar type"); 6963 if (canBeScalarized(J)) 6964 Worklist.push_back(J); 6965 else if (needsExtract(J, VF)) { 6966 ScalarCost += TTI.getScalarizationOverhead( 6967 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6968 APInt::getAllOnes(VF.getFixedValue()), false, true); 6969 } 6970 } 6971 6972 // Scale the total scalar cost by block probability. 6973 ScalarCost /= getReciprocalPredBlockProb(); 6974 6975 // Compute the discount. A non-negative discount means the vector version 6976 // of the instruction costs more, and scalarizing would be beneficial. 6977 Discount += VectorCost - ScalarCost; 6978 ScalarCosts[I] = ScalarCost; 6979 } 6980 6981 return *Discount.getValue(); 6982 } 6983 6984 LoopVectorizationCostModel::VectorizationCostTy 6985 LoopVectorizationCostModel::expectedCost( 6986 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6987 VectorizationCostTy Cost; 6988 6989 // For each block. 6990 for (BasicBlock *BB : TheLoop->blocks()) { 6991 VectorizationCostTy BlockCost; 6992 6993 // For each instruction in the old loop. 6994 for (Instruction &I : BB->instructionsWithoutDebug()) { 6995 // Skip ignored values. 6996 if (ValuesToIgnore.count(&I) || 6997 (VF.isVector() && VecValuesToIgnore.count(&I))) 6998 continue; 6999 7000 VectorizationCostTy C = getInstructionCost(&I, VF); 7001 7002 // Check if we should override the cost. 7003 if (C.first.isValid() && 7004 ForceTargetInstructionCost.getNumOccurrences() > 0) 7005 C.first = InstructionCost(ForceTargetInstructionCost); 7006 7007 // Keep a list of instructions with invalid costs. 7008 if (Invalid && !C.first.isValid()) 7009 Invalid->emplace_back(&I, VF); 7010 7011 BlockCost.first += C.first; 7012 BlockCost.second |= C.second; 7013 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 7014 << " for VF " << VF << " For instruction: " << I 7015 << '\n'); 7016 } 7017 7018 // If we are vectorizing a predicated block, it will have been 7019 // if-converted. This means that the block's instructions (aside from 7020 // stores and instructions that may divide by zero) will now be 7021 // unconditionally executed. For the scalar case, we may not always execute 7022 // the predicated block, if it is an if-else block. Thus, scale the block's 7023 // cost by the probability of executing it. blockNeedsPredication from 7024 // Legal is used so as to not include all blocks in tail folded loops. 
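// As a worked example (illustrative numbers only): getReciprocalPredBlockProb()
// returns 2 by default, so a predicated block whose instructions sum to a
// scalar cost of 10 contributes 10 / 2 = 5 to the scalar loop cost,
// modelling that the block executes on roughly half of the iterations.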
7025 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 7026 BlockCost.first /= getReciprocalPredBlockProb(); 7027 7028 Cost.first += BlockCost.first; 7029 Cost.second |= BlockCost.second; 7030 } 7031 7032 return Cost; 7033 } 7034 7035 /// Gets Address Access SCEV after verifying that the access pattern 7036 /// is loop invariant except the induction variable dependence. 7037 /// 7038 /// This SCEV can be sent to the Target in order to estimate the address 7039 /// calculation cost. 7040 static const SCEV *getAddressAccessSCEV( 7041 Value *Ptr, 7042 LoopVectorizationLegality *Legal, 7043 PredicatedScalarEvolution &PSE, 7044 const Loop *TheLoop) { 7045 7046 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 7047 if (!Gep) 7048 return nullptr; 7049 7050 // We are looking for a gep with all loop invariant indices except for one 7051 // which should be an induction variable. 7052 auto SE = PSE.getSE(); 7053 unsigned NumOperands = Gep->getNumOperands(); 7054 for (unsigned i = 1; i < NumOperands; ++i) { 7055 Value *Opd = Gep->getOperand(i); 7056 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 7057 !Legal->isInductionVariable(Opd)) 7058 return nullptr; 7059 } 7060 7061 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 7062 return PSE.getSCEV(Ptr); 7063 } 7064 7065 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 7066 return Legal->hasStride(I->getOperand(0)) || 7067 Legal->hasStride(I->getOperand(1)); 7068 } 7069 7070 InstructionCost 7071 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 7072 ElementCount VF) { 7073 assert(VF.isVector() && 7074 "Scalarization cost of instruction implies vectorization."); 7075 if (VF.isScalable()) 7076 return InstructionCost::getInvalid(); 7077 7078 Type *ValTy = getLoadStoreType(I); 7079 auto SE = PSE.getSE(); 7080 7081 unsigned AS = getLoadStoreAddressSpace(I); 7082 Value *Ptr = getLoadStorePointerOperand(I); 7083 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 7084 7085 // Figure out whether the access is strided and get the stride value 7086 // if it's known in compile time 7087 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 7088 7089 // Get the cost of the scalar memory instruction and address computation. 7090 InstructionCost Cost = 7091 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 7092 7093 // Don't pass *I here, since it is scalar but will actually be part of a 7094 // vectorized loop where the user of it is a vectorized instruction. 7095 const Align Alignment = getLoadStoreAlignment(I); 7096 Cost += VF.getKnownMinValue() * 7097 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 7098 AS, TTI::TCK_RecipThroughput); 7099 7100 // Get the overhead of the extractelement and insertelement instructions 7101 // we might create due to scalarization. 7102 Cost += getScalarizationOverhead(I, VF); 7103 7104 // If we have a predicated load/store, it will need extra i1 extracts and 7105 // conditional branches, but may not be executed for each vector lane. Scale 7106 // the cost by the probability of executing the predicated block. 
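// Note the ordering below (illustrative arithmetic only): the per-lane
// address/memory/scalarization cost accumulated above is divided by the
// block probability first, and the i1 extract and branch costs are added
// afterwards, e.g. a cost of 40 becomes 40 / 2 plus the extract and branch
// terms.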
7107 if (isPredicatedInst(I)) { 7108 Cost /= getReciprocalPredBlockProb(); 7109 7110 // Add the cost of an i1 extract and a branch 7111 auto *Vec_i1Ty = 7112 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 7113 Cost += TTI.getScalarizationOverhead( 7114 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 7115 /*Insert=*/false, /*Extract=*/true); 7116 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 7117 7118 if (useEmulatedMaskMemRefHack(I)) 7119 // Artificially setting to a high enough value to practically disable 7120 // vectorization with such operations. 7121 Cost = 3000000; 7122 } 7123 7124 return Cost; 7125 } 7126 7127 InstructionCost 7128 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 7129 ElementCount VF) { 7130 Type *ValTy = getLoadStoreType(I); 7131 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7132 Value *Ptr = getLoadStorePointerOperand(I); 7133 unsigned AS = getLoadStoreAddressSpace(I); 7134 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 7135 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7136 7137 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7138 "Stride should be 1 or -1 for consecutive memory access"); 7139 const Align Alignment = getLoadStoreAlignment(I); 7140 InstructionCost Cost = 0; 7141 if (Legal->isMaskRequired(I)) 7142 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7143 CostKind); 7144 else 7145 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7146 CostKind, I); 7147 7148 bool Reverse = ConsecutiveStride < 0; 7149 if (Reverse) 7150 Cost += 7151 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7152 return Cost; 7153 } 7154 7155 InstructionCost 7156 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 7157 ElementCount VF) { 7158 assert(Legal->isUniformMemOp(*I)); 7159 7160 Type *ValTy = getLoadStoreType(I); 7161 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7162 const Align Alignment = getLoadStoreAlignment(I); 7163 unsigned AS = getLoadStoreAddressSpace(I); 7164 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7165 if (isa<LoadInst>(I)) { 7166 return TTI.getAddressComputationCost(ValTy) + 7167 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 7168 CostKind) + 7169 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 7170 } 7171 StoreInst *SI = cast<StoreInst>(I); 7172 7173 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 7174 return TTI.getAddressComputationCost(ValTy) + 7175 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 7176 CostKind) + 7177 (isLoopInvariantStoreValue 7178 ? 
0
7179 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
7180 VF.getKnownMinValue() - 1));
7181 }
7182
7183 InstructionCost
7184 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
7185 ElementCount VF) {
7186 Type *ValTy = getLoadStoreType(I);
7187 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7188 const Align Alignment = getLoadStoreAlignment(I);
7189 const Value *Ptr = getLoadStorePointerOperand(I);
7190
7191 return TTI.getAddressComputationCost(VectorTy) +
7192 TTI.getGatherScatterOpCost(
7193 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
7194 TargetTransformInfo::TCK_RecipThroughput, I);
7195 }
7196
7197 InstructionCost
7198 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
7199 ElementCount VF) {
7200 // TODO: Once we have support for interleaving with scalable vectors
7201 // we can calculate the cost properly here.
7202 if (VF.isScalable())
7203 return InstructionCost::getInvalid();
7204
7205 Type *ValTy = getLoadStoreType(I);
7206 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7207 unsigned AS = getLoadStoreAddressSpace(I);
7208
7209 auto Group = getInterleavedAccessGroup(I);
7210 assert(Group && "Fail to get an interleaved access group.");
7211
7212 unsigned InterleaveFactor = Group->getFactor();
7213 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
7214
7215 // Holds the indices of existing members in the interleaved group.
7216 SmallVector<unsigned, 4> Indices;
7217 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
7218 if (Group->getMember(IF))
7219 Indices.push_back(IF);
7220
7221 // Calculate the cost of the whole interleaved group.
7222 bool UseMaskForGaps =
7223 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
7224 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
7225 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
7226 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
7227 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
7228
7229 if (Group->isReverse()) {
7230 // TODO: Add support for reversed masked interleaved access.
7231 assert(!Legal->isMaskRequired(I) &&
7232 "Reverse masked interleaved access not supported.");
7233 Cost +=
7234 Group->getNumMembers() *
7235 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
7236 }
7237 return Cost;
7238 }
7239
7240 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
7241 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
7242 using namespace llvm::PatternMatch;
7243 // Early exit for no inloop reductions
7244 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
7245 return None;
7246 auto *VectorTy = cast<VectorType>(Ty);
7247
7248 // We are looking for one of the following patterns and the minimal
7249 // acceptable cost for the match:
7250 // reduce(mul(ext(A), ext(B))) or
7251 // reduce(mul(A, B)) or
7252 // reduce(ext(A)) or
7253 // reduce(A).
7254 // The basic idea is that we walk down the tree, finding the root reduction
7255 // instruction in InLoopReductionImmediateChains. From there we find the
7256 // pattern of mul/ext and test the cost of the entire pattern vs the cost of
7257 // the components. If the reduction cost is lower, we return it for the
7258 // reduction instruction and 0 for the other instructions in the pattern. If
7259 // it is not, we return an invalid cost specifying that the original cost model should be used.
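// For illustration, the reduce(mul(ext(A), ext(B))) form corresponds to
// scalar IR along these lines (hypothetical names and types):
//   %a.ext    = sext i8 %a to i32
//   %b.ext    = sext i8 %b to i32
//   %mul      = mul i32 %a.ext, %b.ext
//   %sum.next = add i32 %sum, %mul
// where %sum.next feeds the reduction phi.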
7260 Instruction *RetI = I; 7261 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 7262 if (!RetI->hasOneUser()) 7263 return None; 7264 RetI = RetI->user_back(); 7265 } 7266 if (match(RetI, m_Mul(m_Value(), m_Value())) && 7267 RetI->user_back()->getOpcode() == Instruction::Add) { 7268 if (!RetI->hasOneUser()) 7269 return None; 7270 RetI = RetI->user_back(); 7271 } 7272 7273 // Test if the found instruction is a reduction, and if not return an invalid 7274 // cost specifying the parent to use the original cost modelling. 7275 if (!InLoopReductionImmediateChains.count(RetI)) 7276 return None; 7277 7278 // Find the reduction this chain is a part of and calculate the basic cost of 7279 // the reduction on its own. 7280 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 7281 Instruction *ReductionPhi = LastChain; 7282 while (!isa<PHINode>(ReductionPhi)) 7283 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 7284 7285 const RecurrenceDescriptor &RdxDesc = 7286 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 7287 7288 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 7289 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 7290 7291 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 7292 // normal fmul instruction to the cost of the fadd reduction. 7293 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 7294 BaseCost += 7295 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 7296 7297 // If we're using ordered reductions then we can just return the base cost 7298 // here, since getArithmeticReductionCost calculates the full ordered 7299 // reduction cost when FP reassociation is not allowed. 7300 if (useOrderedReductions(RdxDesc)) 7301 return BaseCost; 7302 7303 // Get the operand that was not the reduction chain and match it to one of the 7304 // patterns, returning the better cost if it is found. 7305 Instruction *RedOp = RetI->getOperand(1) == LastChain 7306 ? dyn_cast<Instruction>(RetI->getOperand(0)) 7307 : dyn_cast<Instruction>(RetI->getOperand(1)); 7308 7309 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 7310 7311 Instruction *Op0, *Op1; 7312 if (RedOp && 7313 match(RedOp, 7314 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 7315 match(Op0, m_ZExtOrSExt(m_Value())) && 7316 Op0->getOpcode() == Op1->getOpcode() && 7317 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7318 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 7319 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 7320 7321 // Matched reduce(ext(mul(ext(A), ext(B))) 7322 // Note that the extend opcodes need to all match, or if A==B they will have 7323 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 7324 // which is equally fine. 
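// Illustrative numbers only: if ExtCost = 1, MulCost = 1, Ext2Cost = 1 and
// BaseCost = 4, the fused cost computed below is preferred only when the
// target reports an extended-add-reduction cost below 2 * 1 + 1 + 1 + 4 = 8.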
7325 bool IsUnsigned = isa<ZExtInst>(Op0); 7326 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7327 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 7328 7329 InstructionCost ExtCost = 7330 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 7331 TTI::CastContextHint::None, CostKind, Op0); 7332 InstructionCost MulCost = 7333 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 7334 InstructionCost Ext2Cost = 7335 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 7336 TTI::CastContextHint::None, CostKind, RedOp); 7337 7338 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7339 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7340 CostKind); 7341 7342 if (RedCost.isValid() && 7343 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 7344 return I == RetI ? RedCost : 0; 7345 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 7346 !TheLoop->isLoopInvariant(RedOp)) { 7347 // Matched reduce(ext(A)) 7348 bool IsUnsigned = isa<ZExtInst>(RedOp); 7349 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7350 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7351 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7352 CostKind); 7353 7354 InstructionCost ExtCost = 7355 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7356 TTI::CastContextHint::None, CostKind, RedOp); 7357 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7358 return I == RetI ? RedCost : 0; 7359 } else if (RedOp && 7360 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 7361 if (match(Op0, m_ZExtOrSExt(m_Value())) && 7362 Op0->getOpcode() == Op1->getOpcode() && 7363 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7364 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7365 bool IsUnsigned = isa<ZExtInst>(Op0); 7366 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7367 // Matched reduce(mul(ext, ext)) 7368 InstructionCost ExtCost = 7369 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 7370 TTI::CastContextHint::None, CostKind, Op0); 7371 InstructionCost MulCost = 7372 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7373 7374 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7375 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7376 CostKind); 7377 7378 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 7379 return I == RetI ? RedCost : 0; 7380 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 7381 // Matched reduce(mul()) 7382 InstructionCost MulCost = 7383 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7384 7385 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7386 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7387 CostKind); 7388 7389 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7390 return I == RetI ? RedCost : 0; 7391 } 7392 } 7393 7394 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 7395 } 7396 7397 InstructionCost 7398 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7399 ElementCount VF) { 7400 // Calculate scalar cost only. Vectorization cost should be ready at this 7401 // moment. 
7402 if (VF.isScalar()) { 7403 Type *ValTy = getLoadStoreType(I); 7404 const Align Alignment = getLoadStoreAlignment(I); 7405 unsigned AS = getLoadStoreAddressSpace(I); 7406 7407 return TTI.getAddressComputationCost(ValTy) + 7408 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7409 TTI::TCK_RecipThroughput, I); 7410 } 7411 return getWideningCost(I, VF); 7412 } 7413 7414 LoopVectorizationCostModel::VectorizationCostTy 7415 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7416 ElementCount VF) { 7417 // If we know that this instruction will remain uniform, check the cost of 7418 // the scalar version. 7419 if (isUniformAfterVectorization(I, VF)) 7420 VF = ElementCount::getFixed(1); 7421 7422 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7423 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7424 7425 // Forced scalars do not have any scalarization overhead. 7426 auto ForcedScalar = ForcedScalars.find(VF); 7427 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7428 auto InstSet = ForcedScalar->second; 7429 if (InstSet.count(I)) 7430 return VectorizationCostTy( 7431 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7432 VF.getKnownMinValue()), 7433 false); 7434 } 7435 7436 Type *VectorTy; 7437 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7438 7439 bool TypeNotScalarized = false; 7440 if (VF.isVector() && VectorTy->isVectorTy()) { 7441 unsigned NumParts = TTI.getNumberOfParts(VectorTy); 7442 if (NumParts) 7443 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 7444 else 7445 C = InstructionCost::getInvalid(); 7446 } 7447 return VectorizationCostTy(C, TypeNotScalarized); 7448 } 7449 7450 InstructionCost 7451 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7452 ElementCount VF) const { 7453 7454 // There is no mechanism yet to create a scalable scalarization loop, 7455 // so this is currently Invalid. 7456 if (VF.isScalable()) 7457 return InstructionCost::getInvalid(); 7458 7459 if (VF.isScalar()) 7460 return 0; 7461 7462 InstructionCost Cost = 0; 7463 Type *RetTy = ToVectorTy(I->getType(), VF); 7464 if (!RetTy->isVoidTy() && 7465 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7466 Cost += TTI.getScalarizationOverhead( 7467 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 7468 false); 7469 7470 // Some targets keep addresses scalar. 7471 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7472 return Cost; 7473 7474 // Some targets support efficient element stores. 7475 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7476 return Cost; 7477 7478 // Collect operands to consider. 7479 CallInst *CI = dyn_cast<CallInst>(I); 7480 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 7481 7482 // Skip operands that do not require extraction/scalarization and do not incur 7483 // any overhead. 7484 SmallVector<Type *> Tys; 7485 for (auto *V : filterExtractingOperands(Ops, VF)) 7486 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7487 return Cost + TTI.getOperandsScalarizationOverhead( 7488 filterExtractingOperands(Ops, VF), Tys); 7489 } 7490 7491 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7492 if (VF.isScalar()) 7493 return; 7494 NumPredStores = 0; 7495 for (BasicBlock *BB : TheLoop->blocks()) { 7496 // For each instruction in the old loop. 
7497 for (Instruction &I : *BB) { 7498 Value *Ptr = getLoadStorePointerOperand(&I); 7499 if (!Ptr) 7500 continue; 7501 7502 // TODO: We should generate better code and update the cost model for 7503 // predicated uniform stores. Today they are treated as any other 7504 // predicated store (see added test cases in 7505 // invariant-store-vectorization.ll). 7506 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7507 NumPredStores++; 7508 7509 if (Legal->isUniformMemOp(I)) { 7510 // TODO: Avoid replicating loads and stores instead of 7511 // relying on instcombine to remove them. 7512 // Load: Scalar load + broadcast 7513 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7514 InstructionCost Cost; 7515 if (isa<StoreInst>(&I) && VF.isScalable() && 7516 isLegalGatherOrScatter(&I)) { 7517 Cost = getGatherScatterCost(&I, VF); 7518 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 7519 } else { 7520 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 7521 "Cannot yet scalarize uniform stores"); 7522 Cost = getUniformMemOpCost(&I, VF); 7523 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7524 } 7525 continue; 7526 } 7527 7528 // We assume that widening is the best solution when possible. 7529 if (memoryInstructionCanBeWidened(&I, VF)) { 7530 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7531 int ConsecutiveStride = Legal->isConsecutivePtr( 7532 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 7533 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7534 "Expected consecutive stride."); 7535 InstWidening Decision = 7536 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7537 setWideningDecision(&I, VF, Decision, Cost); 7538 continue; 7539 } 7540 7541 // Choose between Interleaving, Gather/Scatter or Scalarization. 7542 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7543 unsigned NumAccesses = 1; 7544 if (isAccessInterleaved(&I)) { 7545 auto Group = getInterleavedAccessGroup(&I); 7546 assert(Group && "Fail to get an interleaved access group."); 7547 7548 // Make one decision for the whole group. 7549 if (getWideningDecision(&I, VF) != CM_Unknown) 7550 continue; 7551 7552 NumAccesses = Group->getNumMembers(); 7553 if (interleavedAccessCanBeWidened(&I, VF)) 7554 InterleaveCost = getInterleaveGroupCost(&I, VF); 7555 } 7556 7557 InstructionCost GatherScatterCost = 7558 isLegalGatherOrScatter(&I) 7559 ? getGatherScatterCost(&I, VF) * NumAccesses 7560 : InstructionCost::getInvalid(); 7561 7562 InstructionCost ScalarizationCost = 7563 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7564 7565 // Choose better solution for the current VF, 7566 // write down this decision and use it during vectorization. 7567 InstructionCost Cost; 7568 InstWidening Decision; 7569 if (InterleaveCost <= GatherScatterCost && 7570 InterleaveCost < ScalarizationCost) { 7571 Decision = CM_Interleave; 7572 Cost = InterleaveCost; 7573 } else if (GatherScatterCost < ScalarizationCost) { 7574 Decision = CM_GatherScatter; 7575 Cost = GatherScatterCost; 7576 } else { 7577 Decision = CM_Scalarize; 7578 Cost = ScalarizationCost; 7579 } 7580 // If the instructions belongs to an interleave group, the whole group 7581 // receives the same decision. The whole group receives the cost, but 7582 // the cost will actually be assigned to one instruction. 
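// Illustrative numbers only: for a two-member group with InterleaveCost 8,
// GatherScatterCost 12 (2 x 6 per member) and ScalarizationCost 20
// (2 x 10 per member), CM_Interleave wins above and both members are
// recorded with that decision here.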
7583 if (auto Group = getInterleavedAccessGroup(&I)) 7584 setWideningDecision(Group, VF, Decision, Cost); 7585 else 7586 setWideningDecision(&I, VF, Decision, Cost); 7587 } 7588 } 7589 7590 // Make sure that any load of address and any other address computation 7591 // remains scalar unless there is gather/scatter support. This avoids 7592 // inevitable extracts into address registers, and also has the benefit of 7593 // activating LSR more, since that pass can't optimize vectorized 7594 // addresses. 7595 if (TTI.prefersVectorizedAddressing()) 7596 return; 7597 7598 // Start with all scalar pointer uses. 7599 SmallPtrSet<Instruction *, 8> AddrDefs; 7600 for (BasicBlock *BB : TheLoop->blocks()) 7601 for (Instruction &I : *BB) { 7602 Instruction *PtrDef = 7603 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7604 if (PtrDef && TheLoop->contains(PtrDef) && 7605 getWideningDecision(&I, VF) != CM_GatherScatter) 7606 AddrDefs.insert(PtrDef); 7607 } 7608 7609 // Add all instructions used to generate the addresses. 7610 SmallVector<Instruction *, 4> Worklist; 7611 append_range(Worklist, AddrDefs); 7612 while (!Worklist.empty()) { 7613 Instruction *I = Worklist.pop_back_val(); 7614 for (auto &Op : I->operands()) 7615 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7616 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7617 AddrDefs.insert(InstOp).second) 7618 Worklist.push_back(InstOp); 7619 } 7620 7621 for (auto *I : AddrDefs) { 7622 if (isa<LoadInst>(I)) { 7623 // Setting the desired widening decision should ideally be handled in 7624 // by cost functions, but since this involves the task of finding out 7625 // if the loaded register is involved in an address computation, it is 7626 // instead changed here when we know this is the case. 7627 InstWidening Decision = getWideningDecision(I, VF); 7628 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7629 // Scalarize a widened load of address. 7630 setWideningDecision( 7631 I, VF, CM_Scalarize, 7632 (VF.getKnownMinValue() * 7633 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7634 else if (auto Group = getInterleavedAccessGroup(I)) { 7635 // Scalarize an interleave group of address loads. 7636 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7637 if (Instruction *Member = Group->getMember(I)) 7638 setWideningDecision( 7639 Member, VF, CM_Scalarize, 7640 (VF.getKnownMinValue() * 7641 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7642 } 7643 } 7644 } else 7645 // Make sure I gets scalarized and a cost estimate without 7646 // scalarization overhead. 
7647 ForcedScalars[VF].insert(I); 7648 } 7649 } 7650 7651 InstructionCost 7652 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7653 Type *&VectorTy) { 7654 Type *RetTy = I->getType(); 7655 if (canTruncateToMinimalBitwidth(I, VF)) 7656 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7657 auto SE = PSE.getSE(); 7658 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7659 7660 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 7661 ElementCount VF) -> bool { 7662 if (VF.isScalar()) 7663 return true; 7664 7665 auto Scalarized = InstsToScalarize.find(VF); 7666 assert(Scalarized != InstsToScalarize.end() && 7667 "VF not yet analyzed for scalarization profitability"); 7668 return !Scalarized->second.count(I) && 7669 llvm::all_of(I->users(), [&](User *U) { 7670 auto *UI = cast<Instruction>(U); 7671 return !Scalarized->second.count(UI); 7672 }); 7673 }; 7674 (void) hasSingleCopyAfterVectorization; 7675 7676 if (isScalarAfterVectorization(I, VF)) { 7677 // With the exception of GEPs and PHIs, after scalarization there should 7678 // only be one copy of the instruction generated in the loop. This is 7679 // because the VF is either 1, or any instructions that need scalarizing 7680 // have already been dealt with by the the time we get here. As a result, 7681 // it means we don't have to multiply the instruction cost by VF. 7682 assert(I->getOpcode() == Instruction::GetElementPtr || 7683 I->getOpcode() == Instruction::PHI || 7684 (I->getOpcode() == Instruction::BitCast && 7685 I->getType()->isPointerTy()) || 7686 hasSingleCopyAfterVectorization(I, VF)); 7687 VectorTy = RetTy; 7688 } else 7689 VectorTy = ToVectorTy(RetTy, VF); 7690 7691 // TODO: We need to estimate the cost of intrinsic calls. 7692 switch (I->getOpcode()) { 7693 case Instruction::GetElementPtr: 7694 // We mark this instruction as zero-cost because the cost of GEPs in 7695 // vectorized code depends on whether the corresponding memory instruction 7696 // is scalarized or not. Therefore, we handle GEPs with the memory 7697 // instruction cost. 7698 return 0; 7699 case Instruction::Br: { 7700 // In cases of scalarized and predicated instructions, there will be VF 7701 // predicated blocks in the vectorized loop. Each branch around these 7702 // blocks requires also an extract of its vector compare i1 element. 7703 bool ScalarPredicatedBB = false; 7704 BranchInst *BI = cast<BranchInst>(I); 7705 if (VF.isVector() && BI->isConditional() && 7706 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7707 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7708 ScalarPredicatedBB = true; 7709 7710 if (ScalarPredicatedBB) { 7711 // Not possible to scalarize scalable vector with predicated instructions. 7712 if (VF.isScalable()) 7713 return InstructionCost::getInvalid(); 7714 // Return cost for branches around scalarized and predicated blocks. 7715 auto *Vec_i1Ty = 7716 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7717 return ( 7718 TTI.getScalarizationOverhead( 7719 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + 7720 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7721 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7722 // The back-edge branch will remain, as will all scalar branches. 7723 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7724 else 7725 // This branch will be eliminated by if-conversion. 
7726 return 0; 7727 // Note: We currently assume zero cost for an unconditional branch inside 7728 // a predicated block since it will become a fall-through, although we 7729 // may decide in the future to call TTI for all branches. 7730 } 7731 case Instruction::PHI: { 7732 auto *Phi = cast<PHINode>(I); 7733 7734 // First-order recurrences are replaced by vector shuffles inside the loop. 7735 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7736 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7737 return TTI.getShuffleCost( 7738 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7739 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7740 7741 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7742 // converted into select instructions. We require N - 1 selects per phi 7743 // node, where N is the number of incoming values. 7744 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7745 return (Phi->getNumIncomingValues() - 1) * 7746 TTI.getCmpSelInstrCost( 7747 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7748 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7749 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7750 7751 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7752 } 7753 case Instruction::UDiv: 7754 case Instruction::SDiv: 7755 case Instruction::URem: 7756 case Instruction::SRem: 7757 // If we have a predicated instruction, it may not be executed for each 7758 // vector lane. Get the scalarization cost and scale this amount by the 7759 // probability of executing the predicated block. If the instruction is not 7760 // predicated, we fall through to the next case. 7761 if (VF.isVector() && isScalarWithPredication(I)) { 7762 InstructionCost Cost = 0; 7763 7764 // These instructions have a non-void type, so account for the phi nodes 7765 // that we will create. This cost is likely to be zero. The phi node 7766 // cost, if any, should be scaled by the block probability because it 7767 // models a copy at the end of each predicated block. 7768 Cost += VF.getKnownMinValue() * 7769 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7770 7771 // The cost of the non-predicated instruction. 7772 Cost += VF.getKnownMinValue() * 7773 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7774 7775 // The cost of insertelement and extractelement instructions needed for 7776 // scalarization. 7777 Cost += getScalarizationOverhead(I, VF); 7778 7779 // Scale the cost by the probability of executing the predicated blocks. 7780 // This assumes the predicated block for each vector lane is equally 7781 // likely. 7782 return Cost / getReciprocalPredBlockProb(); 7783 } 7784 LLVM_FALLTHROUGH; 7785 case Instruction::Add: 7786 case Instruction::FAdd: 7787 case Instruction::Sub: 7788 case Instruction::FSub: 7789 case Instruction::Mul: 7790 case Instruction::FMul: 7791 case Instruction::FDiv: 7792 case Instruction::FRem: 7793 case Instruction::Shl: 7794 case Instruction::LShr: 7795 case Instruction::AShr: 7796 case Instruction::And: 7797 case Instruction::Or: 7798 case Instruction::Xor: { 7799 // Since we will replace the stride by 1 the multiplication should go away. 
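// (For example, a hypothetical "%offset = mul i64 %i, %stride" feeding an
// address computation folds away once loop versioning has specialized
// %stride to the constant 1, so it is costed as free here.)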
7800 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7801 return 0; 7802 7803 // Detect reduction patterns 7804 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7805 return *RedCost; 7806 7807 // Certain instructions can be cheaper to vectorize if they have a constant 7808 // second vector operand. One example of this are shifts on x86. 7809 Value *Op2 = I->getOperand(1); 7810 TargetTransformInfo::OperandValueProperties Op2VP; 7811 TargetTransformInfo::OperandValueKind Op2VK = 7812 TTI.getOperandInfo(Op2, Op2VP); 7813 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7814 Op2VK = TargetTransformInfo::OK_UniformValue; 7815 7816 SmallVector<const Value *, 4> Operands(I->operand_values()); 7817 return TTI.getArithmeticInstrCost( 7818 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7819 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7820 } 7821 case Instruction::FNeg: { 7822 return TTI.getArithmeticInstrCost( 7823 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7824 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7825 TargetTransformInfo::OP_None, I->getOperand(0), I); 7826 } 7827 case Instruction::Select: { 7828 SelectInst *SI = cast<SelectInst>(I); 7829 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7830 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7831 7832 const Value *Op0, *Op1; 7833 using namespace llvm::PatternMatch; 7834 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7835 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7836 // select x, y, false --> x & y 7837 // select x, true, y --> x | y 7838 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7839 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7840 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7841 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7842 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7843 Op1->getType()->getScalarSizeInBits() == 1); 7844 7845 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7846 return TTI.getArithmeticInstrCost( 7847 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7848 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7849 } 7850 7851 Type *CondTy = SI->getCondition()->getType(); 7852 if (!ScalarCond) 7853 CondTy = VectorType::get(CondTy, VF); 7854 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7855 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7856 } 7857 case Instruction::ICmp: 7858 case Instruction::FCmp: { 7859 Type *ValTy = I->getOperand(0)->getType(); 7860 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7861 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7862 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7863 VectorTy = ToVectorTy(ValTy, VF); 7864 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7865 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7866 } 7867 case Instruction::Store: 7868 case Instruction::Load: { 7869 ElementCount Width = VF; 7870 if (Width.isVector()) { 7871 InstWidening Decision = getWideningDecision(I, Width); 7872 assert(Decision != CM_Unknown && 7873 "CM decision should be taken at this point"); 7874 if (Decision == CM_Scalarize) 7875 Width = ElementCount::getFixed(1); 7876 } 7877 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7878 return getMemoryInstructionCost(I, VF); 7879 } 7880 case Instruction::BitCast: 7881 if (I->getType()->isPointerTy()) 7882 return 0; 7883 LLVM_FALLTHROUGH; 7884 case Instruction::ZExt: 7885 case Instruction::SExt: 7886 case Instruction::FPToUI: 7887 case Instruction::FPToSI: 7888 case Instruction::FPExt: 7889 case Instruction::PtrToInt: 7890 case Instruction::IntToPtr: 7891 case Instruction::SIToFP: 7892 case Instruction::UIToFP: 7893 case Instruction::Trunc: 7894 case Instruction::FPTrunc: { 7895 // Computes the CastContextHint from a Load/Store instruction. 7896 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7897 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7898 "Expected a load or a store!"); 7899 7900 if (VF.isScalar() || !TheLoop->contains(I)) 7901 return TTI::CastContextHint::Normal; 7902 7903 switch (getWideningDecision(I, VF)) { 7904 case LoopVectorizationCostModel::CM_GatherScatter: 7905 return TTI::CastContextHint::GatherScatter; 7906 case LoopVectorizationCostModel::CM_Interleave: 7907 return TTI::CastContextHint::Interleave; 7908 case LoopVectorizationCostModel::CM_Scalarize: 7909 case LoopVectorizationCostModel::CM_Widen: 7910 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7911 : TTI::CastContextHint::Normal; 7912 case LoopVectorizationCostModel::CM_Widen_Reverse: 7913 return TTI::CastContextHint::Reversed; 7914 case LoopVectorizationCostModel::CM_Unknown: 7915 llvm_unreachable("Instr did not go through cost modelling?"); 7916 } 7917 7918 llvm_unreachable("Unhandled case!"); 7919 }; 7920 7921 unsigned Opcode = I->getOpcode(); 7922 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7923 // For Trunc, the context is the only user, which must be a StoreInst. 7924 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7925 if (I->hasOneUse()) 7926 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7927 CCH = ComputeCCH(Store); 7928 } 7929 // For Z/Sext, the context is the operand, which must be a LoadInst. 
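// For illustration: a hypothetical "sext i8 %v to i32" whose %v is a load
// that was given the CM_Widen_Reverse decision is costed with
// TTI::CastContextHint::Reversed, while a gather/scattered load yields
// TTI::CastContextHint::GatherScatter.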
7930 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7931 Opcode == Instruction::FPExt) { 7932 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7933 CCH = ComputeCCH(Load); 7934 } 7935 7936 // We optimize the truncation of induction variables having constant 7937 // integer steps. The cost of these truncations is the same as the scalar 7938 // operation. 7939 if (isOptimizableIVTruncate(I, VF)) { 7940 auto *Trunc = cast<TruncInst>(I); 7941 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7942 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7943 } 7944 7945 // Detect reduction patterns 7946 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7947 return *RedCost; 7948 7949 Type *SrcScalarTy = I->getOperand(0)->getType(); 7950 Type *SrcVecTy = 7951 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7952 if (canTruncateToMinimalBitwidth(I, VF)) { 7953 // This cast is going to be shrunk. This may remove the cast or it might 7954 // turn it into slightly different cast. For example, if MinBW == 16, 7955 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7956 // 7957 // Calculate the modified src and dest types. 7958 Type *MinVecTy = VectorTy; 7959 if (Opcode == Instruction::Trunc) { 7960 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7961 VectorTy = 7962 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7963 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7964 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7965 VectorTy = 7966 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7967 } 7968 } 7969 7970 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7971 } 7972 case Instruction::Call: { 7973 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7974 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7975 return *RedCost; 7976 bool NeedToScalarize; 7977 CallInst *CI = cast<CallInst>(I); 7978 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7979 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7980 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7981 return std::min(CallCost, IntrinsicCost); 7982 } 7983 return CallCost; 7984 } 7985 case Instruction::ExtractValue: 7986 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7987 case Instruction::Alloca: 7988 // We cannot easily widen alloca to a scalable alloca, as 7989 // the result would need to be a vector of pointers. 7990 if (VF.isScalable()) 7991 return InstructionCost::getInvalid(); 7992 LLVM_FALLTHROUGH; 7993 default: 7994 // This opcode is unknown. Assume that it is the same as 'mul'. 7995 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7996 } // end of switch. 
7997 } 7998 7999 char LoopVectorize::ID = 0; 8000 8001 static const char lv_name[] = "Loop Vectorization"; 8002 8003 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 8004 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 8005 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 8006 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 8007 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 8008 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 8009 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 8010 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 8011 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 8012 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 8013 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 8014 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 8015 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 8016 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 8017 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 8018 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 8019 8020 namespace llvm { 8021 8022 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 8023 8024 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 8025 bool VectorizeOnlyWhenForced) { 8026 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 8027 } 8028 8029 } // end namespace llvm 8030 8031 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 8032 // Check if the pointer operand of a load or store instruction is 8033 // consecutive. 8034 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 8035 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 8036 return false; 8037 } 8038 8039 void LoopVectorizationCostModel::collectValuesToIgnore() { 8040 // Ignore ephemeral values. 8041 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 8042 8043 // Ignore type-promoting instructions we identified during reduction 8044 // detection. 8045 for (auto &Reduction : Legal->getReductionVars()) { 8046 RecurrenceDescriptor &RedDes = Reduction.second; 8047 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 8048 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 8049 } 8050 // Ignore type-casting instructions we identified during induction 8051 // detection. 8052 for (auto &Induction : Legal->getInductionVars()) { 8053 InductionDescriptor &IndDes = Induction.second; 8054 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 8055 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 8056 } 8057 } 8058 8059 void LoopVectorizationCostModel::collectInLoopReductions() { 8060 for (auto &Reduction : Legal->getReductionVars()) { 8061 PHINode *Phi = Reduction.first; 8062 RecurrenceDescriptor &RdxDesc = Reduction.second; 8063 8064 // We don't collect reductions that are type promoted (yet). 8065 if (RdxDesc.getRecurrenceType() != Phi->getType()) 8066 continue; 8067 8068 // If the target would prefer this reduction to happen "in-loop", then we 8069 // want to record it as such. 8070 unsigned Opcode = RdxDesc.getOpcode(); 8071 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 8072 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 8073 TargetTransformInfo::ReductionFlags())) 8074 continue; 8075 8076 // Check that we can correctly put the reductions into the loop, by 8077 // finding the chain of operations that leads from the phi to the loop 8078 // exit value. 
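// For illustration (hypothetical IR): for an integer add reduction
//   %sum      = phi i32 [ 0, %preheader ], [ %sum.next, %loop ]
//   %sum.next = add i32 %sum, %x
// the chain is simply [ %sum.next ]; if no such chain can be found, the
// reduction is kept out of loop.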
8079 SmallVector<Instruction *, 4> ReductionOperations = 8080 RdxDesc.getReductionOpChain(Phi, TheLoop); 8081 bool InLoop = !ReductionOperations.empty(); 8082 if (InLoop) { 8083 InLoopReductionChains[Phi] = ReductionOperations; 8084 // Add the elements to InLoopReductionImmediateChains for cost modelling. 8085 Instruction *LastChain = Phi; 8086 for (auto *I : ReductionOperations) { 8087 InLoopReductionImmediateChains[I] = LastChain; 8088 LastChain = I; 8089 } 8090 } 8091 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 8092 << " reduction for phi: " << *Phi << "\n"); 8093 } 8094 } 8095 8096 // TODO: we could return a pair of values that specify the max VF and 8097 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 8098 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 8099 // doesn't have a cost model that can choose which plan to execute if 8100 // more than one is generated. 8101 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 8102 LoopVectorizationCostModel &CM) { 8103 unsigned WidestType; 8104 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 8105 return WidestVectorRegBits / WidestType; 8106 } 8107 8108 VectorizationFactor 8109 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 8110 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 8111 ElementCount VF = UserVF; 8112 // Outer loop handling: They may require CFG and instruction level 8113 // transformations before even evaluating whether vectorization is profitable. 8114 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8115 // the vectorization pipeline. 8116 if (!OrigLoop->isInnermost()) { 8117 // If the user doesn't provide a vectorization factor, determine a 8118 // reasonable one. 8119 if (UserVF.isZero()) { 8120 VF = ElementCount::getFixed(determineVPlanVF( 8121 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 8122 .getFixedSize(), 8123 CM)); 8124 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 8125 8126 // Make sure we have a VF > 1 for stress testing. 8127 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 8128 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 8129 << "overriding computed VF.\n"); 8130 VF = ElementCount::getFixed(4); 8131 } 8132 } 8133 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 8134 assert(isPowerOf2_32(VF.getKnownMinValue()) && 8135 "VF needs to be a power of two"); 8136 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 8137 << "VF " << VF << " to build VPlans.\n"); 8138 buildVPlans(VF, VF); 8139 8140 // For VPlan build stress testing, we bail out after VPlan construction. 8141 if (VPlanBuildStressTest) 8142 return VectorizationFactor::Disabled(); 8143 8144 return {VF, 0 /*Cost*/}; 8145 } 8146 8147 LLVM_DEBUG( 8148 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 8149 "VPlan-native path.\n"); 8150 return VectorizationFactor::Disabled(); 8151 } 8152 8153 Optional<VectorizationFactor> 8154 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 8155 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8156 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 8157 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 8158 return None; 8159 8160 // Invalidate interleave groups if all blocks of loop will be predicated. 
8161 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 8162 !useMaskedInterleavedAccesses(*TTI)) { 8163 LLVM_DEBUG( 8164 dbgs() 8165 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 8166 "which requires masked-interleaved support.\n"); 8167 if (CM.InterleaveInfo.invalidateGroups()) 8168 // Invalidating interleave groups also requires invalidating all decisions 8169 // based on them, which includes widening decisions and uniform and scalar 8170 // values. 8171 CM.invalidateCostModelingDecisions(); 8172 } 8173 8174 ElementCount MaxUserVF = 8175 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 8176 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 8177 if (!UserVF.isZero() && UserVFIsLegal) { 8178 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 8179 "VF needs to be a power of two"); 8180 // Collect the instructions (and their associated costs) that will be more 8181 // profitable to scalarize. 8182 if (CM.selectUserVectorizationFactor(UserVF)) { 8183 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 8184 CM.collectInLoopReductions(); 8185 buildVPlansWithVPRecipes(UserVF, UserVF); 8186 LLVM_DEBUG(printPlans(dbgs())); 8187 return {{UserVF, 0}}; 8188 } else 8189 reportVectorizationInfo("UserVF ignored because of invalid costs.", 8190 "InvalidCost", ORE, OrigLoop); 8191 } 8192 8193 // Populate the set of Vectorization Factor Candidates. 8194 ElementCountSet VFCandidates; 8195 for (auto VF = ElementCount::getFixed(1); 8196 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 8197 VFCandidates.insert(VF); 8198 for (auto VF = ElementCount::getScalable(1); 8199 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 8200 VFCandidates.insert(VF); 8201 8202 for (const auto &VF : VFCandidates) { 8203 // Collect Uniform and Scalar instructions after vectorization with VF. 8204 CM.collectUniformsAndScalars(VF); 8205 8206 // Collect the instructions (and their associated costs) that will be more 8207 // profitable to scalarize. 8208 if (VF.isVector()) 8209 CM.collectInstsToScalarize(VF); 8210 } 8211 8212 CM.collectInLoopReductions(); 8213 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 8214 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 8215 8216 LLVM_DEBUG(printPlans(dbgs())); 8217 if (!MaxFactors.hasVector()) 8218 return VectorizationFactor::Disabled(); 8219 8220 // Select the optimal vectorization factor. 8221 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 8222 8223 // Check if it is profitable to vectorize with runtime checks. 
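  // Illustrative example (an assumption about typical input, not code from
  // this file): for
  //   for (int i = 0; i < n; ++i) a[i] = b[i] + c[i];
  // with pointers that may alias, vectorization is only valid under runtime
  // checks that the accessed ranges do not overlap. If too many such checks
  // are required and reordering was not explicitly allowed, the transformation
  // is abandoned below.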
8224 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 8225 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 8226 bool PragmaThresholdReached = 8227 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 8228 bool ThresholdReached = 8229 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 8230 if ((ThresholdReached && !Hints.allowReordering()) || 8231 PragmaThresholdReached) { 8232 ORE->emit([&]() { 8233 return OptimizationRemarkAnalysisAliasing( 8234 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 8235 OrigLoop->getHeader()) 8236 << "loop not vectorized: cannot prove it is safe to reorder " 8237 "memory operations"; 8238 }); 8239 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 8240 Hints.emitRemarkWithHints(); 8241 return VectorizationFactor::Disabled(); 8242 } 8243 } 8244 return SelectedVF; 8245 } 8246 8247 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 8248 assert(count_if(VPlans, 8249 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 8250 1 && 8251 "Best VF has not a single VPlan."); 8252 8253 for (const VPlanPtr &Plan : VPlans) { 8254 if (Plan->hasVF(VF)) 8255 return *Plan.get(); 8256 } 8257 llvm_unreachable("No plan found!"); 8258 } 8259 8260 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 8261 VPlan &BestVPlan, 8262 InnerLoopVectorizer &ILV, 8263 DominatorTree *DT) { 8264 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 8265 << '\n'); 8266 8267 // Perform the actual loop transformation. 8268 8269 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 8270 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 8271 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 8272 State.TripCount = ILV.getOrCreateTripCount(nullptr); 8273 State.CanonicalIV = ILV.Induction; 8274 ILV.collectPoisonGeneratingRecipes(State); 8275 8276 ILV.printDebugTracesAtStart(); 8277 8278 //===------------------------------------------------===// 8279 // 8280 // Notice: any optimization or new instruction that go 8281 // into the code below should also be implemented in 8282 // the cost-model. 8283 // 8284 //===------------------------------------------------===// 8285 8286 // 2. Copy and widen instructions from the old loop into the new loop. 8287 BestVPlan.execute(&State); 8288 8289 // 3. Fix the vectorized code: take care of header phi's, live-outs, 8290 // predication, updating analyses. 
8291 ILV.fixVectorizedLoop(State); 8292 8293 ILV.printDebugTracesAtEnd(); 8294 } 8295 8296 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 8297 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 8298 for (const auto &Plan : VPlans) 8299 if (PrintVPlansInDotFormat) 8300 Plan->printDOT(O); 8301 else 8302 Plan->print(O); 8303 } 8304 #endif 8305 8306 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 8307 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 8308 8309 // We create new control-flow for the vectorized loop, so the original exit 8310 // conditions will be dead after vectorization if it's only used by the 8311 // terminator 8312 SmallVector<BasicBlock*> ExitingBlocks; 8313 OrigLoop->getExitingBlocks(ExitingBlocks); 8314 for (auto *BB : ExitingBlocks) { 8315 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 8316 if (!Cmp || !Cmp->hasOneUse()) 8317 continue; 8318 8319 // TODO: we should introduce a getUniqueExitingBlocks on Loop 8320 if (!DeadInstructions.insert(Cmp).second) 8321 continue; 8322 8323 // The operands of the icmp is often a dead trunc, used by IndUpdate. 8324 // TODO: can recurse through operands in general 8325 for (Value *Op : Cmp->operands()) { 8326 if (isa<TruncInst>(Op) && Op->hasOneUse()) 8327 DeadInstructions.insert(cast<Instruction>(Op)); 8328 } 8329 } 8330 8331 // We create new "steps" for induction variable updates to which the original 8332 // induction variables map. An original update instruction will be dead if 8333 // all its users except the induction variable are dead. 8334 auto *Latch = OrigLoop->getLoopLatch(); 8335 for (auto &Induction : Legal->getInductionVars()) { 8336 PHINode *Ind = Induction.first; 8337 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 8338 8339 // If the tail is to be folded by masking, the primary induction variable, 8340 // if exists, isn't dead: it will be used for masking. Don't kill it. 8341 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 8342 continue; 8343 8344 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 8345 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 8346 })) 8347 DeadInstructions.insert(IndUpdate); 8348 8349 // We record as "Dead" also the type-casting instructions we had identified 8350 // during induction analysis. We don't need any handling for them in the 8351 // vectorized loop because we have proven that, under a proper runtime 8352 // test guarding the vectorized loop, the value of the phi, and the casted 8353 // value of the phi, are the same. The last instruction in this casting chain 8354 // will get its scalar/vector/widened def from the scalar/vector/widened def 8355 // of the respective phi node. Any other casts in the induction def-use chain 8356 // have no other uses outside the phi update chain, and will be ignored. 8357 InductionDescriptor &IndDes = Induction.second; 8358 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 8359 DeadInstructions.insert(Casts.begin(), Casts.end()); 8360 } 8361 } 8362 8363 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 8364 8365 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 8366 8367 Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx, 8368 Value *Step, 8369 Instruction::BinaryOps BinOp) { 8370 // When unrolling and the VF is 1, we only need to add a simple scalar. 
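  // A minimal sketch of the intent, assuming StartIdx holds the unroll part
  // index (an assumption about the callers) and the induction is an integer:
  // the result is simply Val + StartIdx * Step, i.e. each unrolled copy
  // advances the scalar induction by a multiple of the step instead of
  // building a step vector.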
8371 Type *Ty = Val->getType(); 8372 assert(!Ty->isVectorTy() && "Val must be a scalar"); 8373 8374 if (Ty->isFloatingPointTy()) { 8375 // Floating-point operations inherit FMF via the builder's flags. 8376 Value *MulOp = Builder.CreateFMul(StartIdx, Step); 8377 return Builder.CreateBinOp(BinOp, Val, MulOp); 8378 } 8379 return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction"); 8380 } 8381 8382 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 8383 SmallVector<Metadata *, 4> MDs; 8384 // Reserve first location for self reference to the LoopID metadata node. 8385 MDs.push_back(nullptr); 8386 bool IsUnrollMetadata = false; 8387 MDNode *LoopID = L->getLoopID(); 8388 if (LoopID) { 8389 // First find existing loop unrolling disable metadata. 8390 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8391 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8392 if (MD) { 8393 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8394 IsUnrollMetadata = 8395 S && S->getString().startswith("llvm.loop.unroll.disable"); 8396 } 8397 MDs.push_back(LoopID->getOperand(i)); 8398 } 8399 } 8400 8401 if (!IsUnrollMetadata) { 8402 // Add runtime unroll disable metadata. 8403 LLVMContext &Context = L->getHeader()->getContext(); 8404 SmallVector<Metadata *, 1> DisableOperands; 8405 DisableOperands.push_back( 8406 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8407 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8408 MDs.push_back(DisableNode); 8409 MDNode *NewLoopID = MDNode::get(Context, MDs); 8410 // Set operand 0 to refer to the loop id itself. 8411 NewLoopID->replaceOperandWith(0, NewLoopID); 8412 L->setLoopID(NewLoopID); 8413 } 8414 } 8415 8416 //===--------------------------------------------------------------------===// 8417 // EpilogueVectorizerMainLoop 8418 //===--------------------------------------------------------------------===// 8419 8420 /// This function is partially responsible for generating the control flow 8421 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8422 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8423 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8424 Loop *Lp = createVectorLoopSkeleton(""); 8425 8426 // Generate the code to check the minimum iteration count of the vector 8427 // epilogue (see below). 8428 EPI.EpilogueIterationCountCheck = 8429 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8430 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8431 8432 // Generate the code to check any assumptions that we've made for SCEV 8433 // expressions. 8434 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8435 8436 // Generate the code that checks at runtime if arrays overlap. We put the 8437 // checks into a separate block to make the more common case of few elements 8438 // faster. 8439 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8440 8441 // Generate the iteration count check for the main loop, *after* the check 8442 // for the epilogue loop, so that the path-length is shorter for the case 8443 // that goes directly through the vector epilogue. The longer-path length for 8444 // the main loop is compensated for, by the gain from vectorizing the larger 8445 // trip count. Note: the branch will get updated later on when we vectorize 8446 // the epilogue. 
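  // Rough sketch of the guard blocks created here (names as used below):
  //   iter.check                  - too few iterations even for the epilogue
  //                                 VF, branch straight to the scalar loop;
  //   vector.main.loop.iter.check - too few iterations for the main VF * UF;
  //                                 its bypass branch is updated in the second
  //                                 pass so that this case still reaches the
  //                                 vector epilogue;
  //   vector.ph / vector body     - the main vectorized loop.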
8447 EPI.MainLoopIterationCountCheck = 8448 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8449 8450 // Generate the induction variable. 8451 OldInduction = Legal->getPrimaryInduction(); 8452 Type *IdxTy = Legal->getWidestInductionType(); 8453 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8454 8455 IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt()); 8456 Value *Step = getRuntimeVF(B, IdxTy, VF * UF); 8457 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8458 EPI.VectorTripCount = CountRoundDown; 8459 Induction = 8460 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8461 getDebugLocFromInstOrOperands(OldInduction)); 8462 8463 // Skip induction resume value creation here because they will be created in 8464 // the second pass. If we created them here, they wouldn't be used anyway, 8465 // because the vplan in the second pass still contains the inductions from the 8466 // original loop. 8467 8468 return completeLoopSkeleton(Lp, OrigLoopID); 8469 } 8470 8471 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8472 LLVM_DEBUG({ 8473 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8474 << "Main Loop VF:" << EPI.MainLoopVF 8475 << ", Main Loop UF:" << EPI.MainLoopUF 8476 << ", Epilogue Loop VF:" << EPI.EpilogueVF 8477 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8478 }); 8479 } 8480 8481 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8482 DEBUG_WITH_TYPE(VerboseDebug, { 8483 dbgs() << "intermediate fn:\n" 8484 << *OrigLoop->getHeader()->getParent() << "\n"; 8485 }); 8486 } 8487 8488 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8489 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8490 assert(L && "Expected valid Loop."); 8491 assert(Bypass && "Expected valid bypass basic block."); 8492 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 8493 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8494 Value *Count = getOrCreateTripCount(L); 8495 // Reuse existing vector loop preheader for TC checks. 8496 // Note that new preheader block is generated for vector loop. 8497 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8498 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8499 8500 // Generate code to check if the loop's trip count is less than VF * UF of the 8501 // main vector loop. 8502 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8503 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8504 8505 Value *CheckMinIters = Builder.CreateICmp( 8506 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 8507 "min.iters.check"); 8508 8509 if (!ForEpilogue) 8510 TCCheckBlock->setName("vector.main.loop.iter.check"); 8511 8512 // Create new preheader for vector loop. 8513 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8514 DT, LI, nullptr, "vector.ph"); 8515 8516 if (ForEpilogue) { 8517 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8518 DT->getNode(Bypass)->getIDom()) && 8519 "TC check is expected to dominate Bypass"); 8520 8521 // Update dominator for Bypass & LoopExit. 8522 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8523 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8524 // For loops with multiple exits, there's no edge from the middle block 8525 // to exit blocks (as the epilogue must run) and thus no need to update 8526 // the immediate dominator of the exit blocks. 
8527 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8528 8529 LoopBypassBlocks.push_back(TCCheckBlock); 8530 8531 // Save the trip count so we don't have to regenerate it in the 8532 // vec.epilog.iter.check. This is safe to do because the trip count 8533 // generated here dominates the vector epilog iter check. 8534 EPI.TripCount = Count; 8535 } 8536 8537 ReplaceInstWithInst( 8538 TCCheckBlock->getTerminator(), 8539 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8540 8541 return TCCheckBlock; 8542 } 8543 8544 //===--------------------------------------------------------------------===// 8545 // EpilogueVectorizerEpilogueLoop 8546 //===--------------------------------------------------------------------===// 8547 8548 /// This function is partially responsible for generating the control flow 8549 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8550 BasicBlock * 8551 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8552 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8553 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8554 8555 // Now, compare the remaining count and if there aren't enough iterations to 8556 // execute the vectorized epilogue skip to the scalar part. 8557 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8558 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8559 LoopVectorPreHeader = 8560 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8561 LI, nullptr, "vec.epilog.ph"); 8562 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8563 VecEpilogueIterationCountCheck); 8564 8565 // Adjust the control flow taking the state info from the main loop 8566 // vectorization into account. 8567 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8568 "expected this to be saved from the previous pass."); 8569 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8570 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8571 8572 DT->changeImmediateDominator(LoopVectorPreHeader, 8573 EPI.MainLoopIterationCountCheck); 8574 8575 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8576 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8577 8578 if (EPI.SCEVSafetyCheck) 8579 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8580 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8581 if (EPI.MemSafetyCheck) 8582 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8583 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8584 8585 DT->changeImmediateDominator( 8586 VecEpilogueIterationCountCheck, 8587 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8588 8589 DT->changeImmediateDominator(LoopScalarPreHeader, 8590 EPI.EpilogueIterationCountCheck); 8591 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8592 // If there is an epilogue which must run, there's no edge from the 8593 // middle block to exit blocks and thus no need to update the immediate 8594 // dominator of the exit blocks. 8595 DT->changeImmediateDominator(LoopExitBlock, 8596 EPI.EpilogueIterationCountCheck); 8597 8598 // Keep track of bypass blocks, as they feed start values to the induction 8599 // phis in the scalar loop preheader. 
8600 if (EPI.SCEVSafetyCheck) 8601 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8602 if (EPI.MemSafetyCheck) 8603 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8604 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8605 8606 // Generate a resume induction for the vector epilogue and put it in the 8607 // vector epilogue preheader. 8608 Type *IdxTy = Legal->getWidestInductionType(); 8609 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8610 LoopVectorPreHeader->getFirstNonPHI()); 8611 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8612 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8613 EPI.MainLoopIterationCountCheck); 8614 8615 // Generate the induction variable. 8616 OldInduction = Legal->getPrimaryInduction(); 8617 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8618 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8619 Value *StartIdx = EPResumeVal; 8620 Induction = 8621 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8622 getDebugLocFromInstOrOperands(OldInduction)); 8623 8624 // Generate induction resume values. These variables save the new starting 8625 // indexes for the scalar loop. They are used to test if there are any tail 8626 // iterations left once the vector loop has completed. 8627 // Note that when the vectorized epilogue is skipped due to the iteration 8628 // count check, the resume value for the induction variable comes from 8629 // the trip count of the main vector loop, hence passing the AdditionalBypass 8630 // argument. 8631 createInductionResumeValues(Lp, CountRoundDown, 8632 {VecEpilogueIterationCountCheck, 8633 EPI.VectorTripCount} /* AdditionalBypass */); 8634 8635 AddRuntimeUnrollDisableMetaData(Lp); 8636 return completeLoopSkeleton(Lp, OrigLoopID); 8637 } 8638 8639 BasicBlock * 8640 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 8641 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 8642 8643 assert(EPI.TripCount && 8644 "Expected trip count to have been saved in the first pass."); 8645 assert( 8646 (!isa<Instruction>(EPI.TripCount) || 8647 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 8648 "saved trip count does not dominate insertion point."); 8649 Value *TC = EPI.TripCount; 8650 IRBuilder<> Builder(Insert->getTerminator()); 8651 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8652 8653 // Generate code to check if the loop's trip count is less than VF * UF of the 8654 // vector epilogue loop. 8655 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8656 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8657 8658 Value *CheckMinIters = 8659 Builder.CreateICmp(P, Count, 8660 createStepForVF(Builder, Count->getType(), 8661 EPI.EpilogueVF, EPI.EpilogueUF), 8662 "min.epilog.iters.check"); 8663 8664 ReplaceInstWithInst( 8665 Insert->getTerminator(), 8666 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8667 8668 LoopBypassBlocks.push_back(Insert); 8669 return Insert; 8670 } 8671 8672 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8673 LLVM_DEBUG({ 8674 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8675 << "Epilogue Loop VF:" << EPI.EpilogueVF 8676 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8677 }); 8678 } 8679 8680 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8681 DEBUG_WITH_TYPE(VerboseDebug, { 8682 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8683 }); 8684 } 8685 8686 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8687 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8688 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8689 bool PredicateAtRangeStart = Predicate(Range.Start); 8690 8691 for (ElementCount TmpVF = Range.Start * 2; 8692 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8693 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8694 Range.End = TmpVF; 8695 break; 8696 } 8697 8698 return PredicateAtRangeStart; 8699 } 8700 8701 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8702 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8703 /// of VF's starting at a given VF and extending it as much as possible. Each 8704 /// vectorization decision can potentially shorten this sub-range during 8705 /// buildVPlan(). 8706 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8707 ElementCount MaxVF) { 8708 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8709 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8710 VFRange SubRange = {VF, MaxVFPlusOne}; 8711 VPlans.push_back(buildVPlan(SubRange)); 8712 VF = SubRange.End; 8713 } 8714 } 8715 8716 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8717 VPlanPtr &Plan) { 8718 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8719 8720 // Look for cached value. 8721 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8722 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8723 if (ECEntryIt != EdgeMaskCache.end()) 8724 return ECEntryIt->second; 8725 8726 VPValue *SrcMask = createBlockInMask(Src, Plan); 8727 8728 // The terminator has to be a branch inst! 8729 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8730 assert(BI && "Unexpected terminator found"); 8731 8732 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8733 return EdgeMaskCache[Edge] = SrcMask; 8734 8735 // If source is an exiting block, we know the exit edge is dynamically dead 8736 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8737 // adding uses of an otherwise potentially dead instruction. 8738 if (OrigLoop->isLoopExiting(Src)) 8739 return EdgeMaskCache[Edge] = SrcMask; 8740 8741 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8742 assert(EdgeMask && "No Edge Mask found for condition"); 8743 8744 if (BI->getSuccessor(0) != Dst) 8745 EdgeMask = Builder.createNot(EdgeMask); 8746 8747 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 
8748 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8749 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8750 // The select version does not introduce new UB if SrcMask is false and 8751 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8752 VPValue *False = Plan->getOrAddVPValue( 8753 ConstantInt::getFalse(BI->getCondition()->getType())); 8754 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8755 } 8756 8757 return EdgeMaskCache[Edge] = EdgeMask; 8758 } 8759 8760 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8761 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8762 8763 // Look for cached value. 8764 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8765 if (BCEntryIt != BlockMaskCache.end()) 8766 return BCEntryIt->second; 8767 8768 // All-one mask is modelled as no-mask following the convention for masked 8769 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8770 VPValue *BlockMask = nullptr; 8771 8772 if (OrigLoop->getHeader() == BB) { 8773 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8774 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8775 8776 // Create the block in mask as the first non-phi instruction in the block. 8777 VPBuilder::InsertPointGuard Guard(Builder); 8778 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8779 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8780 8781 // Introduce the early-exit compare IV <= BTC to form header block mask. 8782 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8783 // Start by constructing the desired canonical IV. 8784 VPValue *IV = nullptr; 8785 if (Legal->getPrimaryInduction()) 8786 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8787 else { 8788 auto *IVRecipe = new VPWidenCanonicalIVRecipe(); 8789 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8790 IV = IVRecipe; 8791 } 8792 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8793 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8794 8795 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8796 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8797 // as a second argument, we only pass the IV here and extract the 8798 // tripcount from the transform state where codegen of the VP instructions 8799 // happen. 8800 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8801 } else { 8802 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8803 } 8804 return BlockMaskCache[BB] = BlockMask; 8805 } 8806 8807 // This is the block mask. We OR all incoming edges. 8808 for (auto *Predecessor : predecessors(BB)) { 8809 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8810 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8811 return BlockMaskCache[BB] = EdgeMask; 8812 8813 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8814 BlockMask = EdgeMask; 8815 continue; 8816 } 8817 8818 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8819 } 8820 8821 return BlockMaskCache[BB] = BlockMask; 8822 } 8823 8824 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8825 ArrayRef<VPValue *> Operands, 8826 VFRange &Range, 8827 VPlanPtr &Plan) { 8828 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8829 "Must be called with either a load or store"); 8830 8831 auto willWiden = [&](ElementCount VF) -> bool { 8832 if (VF.isScalar()) 8833 return false; 8834 LoopVectorizationCostModel::InstWidening Decision = 8835 CM.getWideningDecision(I, VF); 8836 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8837 "CM decision should be taken at this point."); 8838 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8839 return true; 8840 if (CM.isScalarAfterVectorization(I, VF) || 8841 CM.isProfitableToScalarize(I, VF)) 8842 return false; 8843 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8844 }; 8845 8846 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8847 return nullptr; 8848 8849 VPValue *Mask = nullptr; 8850 if (Legal->isMaskRequired(I)) 8851 Mask = createBlockInMask(I->getParent(), Plan); 8852 8853 // Determine if the pointer operand of the access is either consecutive or 8854 // reverse consecutive. 8855 LoopVectorizationCostModel::InstWidening Decision = 8856 CM.getWideningDecision(I, Range.Start); 8857 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8858 bool Consecutive = 8859 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8860 8861 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8862 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8863 Consecutive, Reverse); 8864 8865 StoreInst *Store = cast<StoreInst>(I); 8866 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8867 Mask, Consecutive, Reverse); 8868 } 8869 8870 VPWidenIntOrFpInductionRecipe * 8871 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8872 ArrayRef<VPValue *> Operands) const { 8873 // Check if this is an integer or fp induction. If so, build the recipe that 8874 // produces its scalar and vector values. 8875 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8876 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8877 II.getKind() == InductionDescriptor::IK_FpInduction) { 8878 assert(II.getStartValue() == 8879 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8880 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8881 return new VPWidenIntOrFpInductionRecipe( 8882 Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); 8883 } 8884 8885 return nullptr; 8886 } 8887 8888 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8889 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8890 VPlan &Plan) const { 8891 // Optimize the special case where the source is a constant integer 8892 // induction variable. Notice that we can only optimize the 'trunc' case 8893 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8894 // (c) other casts depend on pointer size. 8895 8896 // Determine whether \p K is a truncation based on an induction variable that 8897 // can be optimized. 
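  // Illustrative example (an assumption about typical input, not code from
  // this file): given a canonical i64 induction %iv, a use such as
  //   %t = trunc i64 %iv to i32
  // can itself be widened as an i32 induction, avoiding a wide i64 induction
  // followed by a per-element truncate.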
8898 auto isOptimizableIVTruncate = 8899 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8900 return [=](ElementCount VF) -> bool { 8901 return CM.isOptimizableIVTruncate(K, VF); 8902 }; 8903 }; 8904 8905 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8906 isOptimizableIVTruncate(I), Range)) { 8907 8908 InductionDescriptor II = 8909 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8910 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8911 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8912 Start, nullptr, I); 8913 } 8914 return nullptr; 8915 } 8916 8917 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8918 ArrayRef<VPValue *> Operands, 8919 VPlanPtr &Plan) { 8920 // If all incoming values are equal, the incoming VPValue can be used directly 8921 // instead of creating a new VPBlendRecipe. 8922 VPValue *FirstIncoming = Operands[0]; 8923 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8924 return FirstIncoming == Inc; 8925 })) { 8926 return Operands[0]; 8927 } 8928 8929 // We know that all PHIs in non-header blocks are converted into selects, so 8930 // we don't have to worry about the insertion order and we can just use the 8931 // builder. At this point we generate the predication tree. There may be 8932 // duplications since this is a simple recursive scan, but future 8933 // optimizations will clean it up. 8934 SmallVector<VPValue *, 2> OperandsWithMask; 8935 unsigned NumIncoming = Phi->getNumIncomingValues(); 8936 8937 for (unsigned In = 0; In < NumIncoming; In++) { 8938 VPValue *EdgeMask = 8939 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8940 assert((EdgeMask || NumIncoming == 1) && 8941 "Multiple predecessors with one having a full mask"); 8942 OperandsWithMask.push_back(Operands[In]); 8943 if (EdgeMask) 8944 OperandsWithMask.push_back(EdgeMask); 8945 } 8946 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8947 } 8948 8949 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8950 ArrayRef<VPValue *> Operands, 8951 VFRange &Range) const { 8952 8953 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8954 [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, 8955 Range); 8956 8957 if (IsPredicated) 8958 return nullptr; 8959 8960 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8961 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8962 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8963 ID == Intrinsic::pseudoprobe || 8964 ID == Intrinsic::experimental_noalias_scope_decl)) 8965 return nullptr; 8966 8967 auto willWiden = [&](ElementCount VF) -> bool { 8968 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8969 // The following case may be scalarized depending on the VF. 8970 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8971 // version of the instruction. 8972 // Is it beneficial to perform intrinsic call compared to lib call? 8973 bool NeedToScalarize = false; 8974 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8975 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8976 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8977 return UseVectorIntrinsic || !NeedToScalarize; 8978 }; 8979 8980 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8981 return nullptr; 8982 8983 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8984 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8985 } 8986 8987 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8988 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8989 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8990 // Instruction should be widened, unless it is scalar after vectorization, 8991 // scalarization is profitable or it is predicated. 8992 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8993 return CM.isScalarAfterVectorization(I, VF) || 8994 CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); 8995 }; 8996 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8997 Range); 8998 } 8999 9000 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 9001 ArrayRef<VPValue *> Operands) const { 9002 auto IsVectorizableOpcode = [](unsigned Opcode) { 9003 switch (Opcode) { 9004 case Instruction::Add: 9005 case Instruction::And: 9006 case Instruction::AShr: 9007 case Instruction::BitCast: 9008 case Instruction::FAdd: 9009 case Instruction::FCmp: 9010 case Instruction::FDiv: 9011 case Instruction::FMul: 9012 case Instruction::FNeg: 9013 case Instruction::FPExt: 9014 case Instruction::FPToSI: 9015 case Instruction::FPToUI: 9016 case Instruction::FPTrunc: 9017 case Instruction::FRem: 9018 case Instruction::FSub: 9019 case Instruction::ICmp: 9020 case Instruction::IntToPtr: 9021 case Instruction::LShr: 9022 case Instruction::Mul: 9023 case Instruction::Or: 9024 case Instruction::PtrToInt: 9025 case Instruction::SDiv: 9026 case Instruction::Select: 9027 case Instruction::SExt: 9028 case Instruction::Shl: 9029 case Instruction::SIToFP: 9030 case Instruction::SRem: 9031 case Instruction::Sub: 9032 case Instruction::Trunc: 9033 case Instruction::UDiv: 9034 case Instruction::UIToFP: 9035 case Instruction::URem: 9036 case Instruction::Xor: 9037 case Instruction::ZExt: 9038 return true; 9039 } 9040 return false; 9041 }; 9042 9043 if (!IsVectorizableOpcode(I->getOpcode())) 9044 return nullptr; 9045 9046 // Success: widen this instruction. 
9047 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 9048 } 9049 9050 void VPRecipeBuilder::fixHeaderPhis() { 9051 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 9052 for (VPWidenPHIRecipe *R : PhisToFix) { 9053 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 9054 VPRecipeBase *IncR = 9055 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 9056 R->addOperand(IncR->getVPSingleValue()); 9057 } 9058 } 9059 9060 VPBasicBlock *VPRecipeBuilder::handleReplication( 9061 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 9062 VPlanPtr &Plan) { 9063 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 9064 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 9065 Range); 9066 9067 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 9068 [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); }, 9069 Range); 9070 9071 // Even if the instruction is not marked as uniform, there are certain 9072 // intrinsic calls that can be effectively treated as such, so we check for 9073 // them here. Conservatively, we only do this for scalable vectors, since 9074 // for fixed-width VFs we can always fall back on full scalarization. 9075 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 9076 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 9077 case Intrinsic::assume: 9078 case Intrinsic::lifetime_start: 9079 case Intrinsic::lifetime_end: 9080 // For scalable vectors, if one of the operands is variant then we still 9081 // want to mark the instruction as uniform, which will generate one 9082 // instruction for just the first lane of the vector. We can't scalarize 9083 // the call in the same way as for fixed-width vectors because we don't 9084 // know how many lanes there are. 9085 // 9086 // The reasons for doing it this way for scalable vectors are: 9087 // 1. For the assume intrinsic, generating the instruction for the first 9088 // lane is still better than not generating any at all. For 9089 // example, the input may be a splat across all lanes. 9090 // 2. For the lifetime start/end intrinsics the pointer operand only 9091 // does anything useful when the input comes from a stack object, 9092 // which suggests it should always be uniform. For non-stack objects 9093 // the effect is to poison the object, which still allows us to 9094 // remove the call. 9095 IsUniform = true; 9096 break; 9097 default: 9098 break; 9099 } 9100 } 9101 9102 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 9103 IsUniform, IsPredicated); 9104 setRecipe(I, Recipe); 9105 Plan->addVPValue(I, Recipe); 9106 9107 // Find if I uses a predicated instruction. If so, it will use its scalar 9108 // value. Avoid hoisting the insert-element which packs the scalar value into 9109 // a vector value, as that happens iff all users use the vector value. 9110 for (VPValue *Op : Recipe->operands()) { 9111 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 9112 if (!PredR) 9113 continue; 9114 auto *RepR = 9115 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 9116 assert(RepR->isPredicated() && 9117 "expected Replicate recipe to be predicated"); 9118 RepR->setAlsoPack(false); 9119 } 9120 9121 // Finalize the recipe for Instr, first if it is not predicated.
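  // When predication is required, the recipe is placed inside a triangle-
  // shaped replicate region (built by createReplicateRegion below):
  //   pred.<opcode>.entry -> pred.<opcode>.if -> pred.<opcode>.continue
  // with a second edge from .entry directly to .continue for masked-off lanes.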
9122 if (!IsPredicated) { 9123 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 9124 VPBB->appendRecipe(Recipe); 9125 return VPBB; 9126 } 9127 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 9128 assert(VPBB->getSuccessors().empty() && 9129 "VPBB has successors when handling predicated replication."); 9130 // Record predicated instructions for above packing optimizations. 9131 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 9132 VPBlockUtils::insertBlockAfter(Region, VPBB); 9133 auto *RegSucc = new VPBasicBlock(); 9134 VPBlockUtils::insertBlockAfter(RegSucc, Region); 9135 return RegSucc; 9136 } 9137 9138 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 9139 VPRecipeBase *PredRecipe, 9140 VPlanPtr &Plan) { 9141 // Instructions marked for predication are replicated and placed under an 9142 // if-then construct to prevent side-effects. 9143 9144 // Generate recipes to compute the block mask for this region. 9145 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 9146 9147 // Build the triangular if-then region. 9148 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 9149 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 9150 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 9151 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 9152 auto *PHIRecipe = Instr->getType()->isVoidTy() 9153 ? nullptr 9154 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 9155 if (PHIRecipe) { 9156 Plan->removeVPValueFor(Instr); 9157 Plan->addVPValue(Instr, PHIRecipe); 9158 } 9159 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 9160 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 9161 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 9162 9163 // Note: first set Entry as region entry and then connect successors starting 9164 // from it in order, to propagate the "parent" of each VPBasicBlock. 9165 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 9166 VPBlockUtils::connectBlocks(Pred, Exit); 9167 9168 return Region; 9169 } 9170 9171 VPRecipeOrVPValueTy 9172 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 9173 ArrayRef<VPValue *> Operands, 9174 VFRange &Range, VPlanPtr &Plan) { 9175 // First, check for specific widening recipes that deal with calls, memory 9176 // operations, inductions and Phi nodes. 
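  // The order of the checks below matters: calls, memory operations, header
  // phis and induction truncates each get a dedicated recipe; only
  // instructions that none of these claim, and that shouldWiden() accepts for
  // this VF range, fall through to the generic VPWidenRecipe at the end.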
9177 if (auto *CI = dyn_cast<CallInst>(Instr)) 9178 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 9179 9180 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 9181 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 9182 9183 VPRecipeBase *Recipe; 9184 if (auto Phi = dyn_cast<PHINode>(Instr)) { 9185 if (Phi->getParent() != OrigLoop->getHeader()) 9186 return tryToBlend(Phi, Operands, Plan); 9187 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 9188 return toVPRecipeResult(Recipe); 9189 9190 VPWidenPHIRecipe *PhiRecipe = nullptr; 9191 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 9192 VPValue *StartV = Operands[0]; 9193 if (Legal->isReductionVariable(Phi)) { 9194 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 9195 assert(RdxDesc.getRecurrenceStartValue() == 9196 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 9197 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 9198 CM.isInLoopReduction(Phi), 9199 CM.useOrderedReductions(RdxDesc)); 9200 } else { 9201 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 9202 } 9203 9204 // Record the incoming value from the backedge, so we can add the incoming 9205 // value from the backedge after all recipes have been created. 9206 recordRecipeOf(cast<Instruction>( 9207 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 9208 PhisToFix.push_back(PhiRecipe); 9209 } else { 9210 // TODO: record start and backedge value for remaining pointer induction 9211 // phis. 9212 assert(Phi->getType()->isPointerTy() && 9213 "only pointer phis should be handled here"); 9214 PhiRecipe = new VPWidenPHIRecipe(Phi); 9215 } 9216 9217 return toVPRecipeResult(PhiRecipe); 9218 } 9219 9220 if (isa<TruncInst>(Instr) && 9221 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 9222 Range, *Plan))) 9223 return toVPRecipeResult(Recipe); 9224 9225 if (!shouldWiden(Instr, Range)) 9226 return nullptr; 9227 9228 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 9229 return toVPRecipeResult(new VPWidenGEPRecipe( 9230 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 9231 9232 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 9233 bool InvariantCond = 9234 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 9235 return toVPRecipeResult(new VPWidenSelectRecipe( 9236 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 9237 } 9238 9239 return toVPRecipeResult(tryToWiden(Instr, Operands)); 9240 } 9241 9242 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 9243 ElementCount MaxVF) { 9244 assert(OrigLoop->isInnermost() && "Inner loop expected."); 9245 9246 // Collect instructions from the original loop that will become trivially dead 9247 // in the vectorized loop. We don't need to vectorize these instructions. For 9248 // example, original induction update instructions can become dead because we 9249 // separately emit induction "steps" when generating code for the new loop. 9250 // Similarly, we create a new latch condition when setting up the structure 9251 // of the new loop, so the old one can become dead. 9252 SmallPtrSet<Instruction *, 4> DeadInstructions; 9253 collectTriviallyDeadInstructions(DeadInstructions); 9254 9255 // Add assume instructions we need to drop to DeadInstructions, to prevent 9256 // them from being added to the VPlan. 9257 // TODO: We only need to drop assumes in blocks that get flattend. 
If the 9258 // control flow is preserved, we should keep them. 9259 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 9260 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 9261 9262 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 9263 // Dead instructions do not need sinking. Remove them from SinkAfter. 9264 for (Instruction *I : DeadInstructions) 9265 SinkAfter.erase(I); 9266 9267 // Cannot sink instructions after dead instructions (there won't be any 9268 // recipes for them). Instead, find the first non-dead previous instruction. 9269 for (auto &P : Legal->getSinkAfter()) { 9270 Instruction *SinkTarget = P.second; 9271 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 9272 (void)FirstInst; 9273 while (DeadInstructions.contains(SinkTarget)) { 9274 assert( 9275 SinkTarget != FirstInst && 9276 "Must find a live instruction (at least the one feeding the " 9277 "first-order recurrence PHI) before reaching beginning of the block"); 9278 SinkTarget = SinkTarget->getPrevNode(); 9279 assert(SinkTarget != P.first && 9280 "sink source equals target, no sinking required"); 9281 } 9282 P.second = SinkTarget; 9283 } 9284 9285 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 9286 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 9287 VFRange SubRange = {VF, MaxVFPlusOne}; 9288 VPlans.push_back( 9289 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 9290 VF = SubRange.End; 9291 } 9292 } 9293 9294 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 9295 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 9296 const MapVector<Instruction *, Instruction *> &SinkAfter) { 9297 9298 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 9299 9300 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 9301 9302 // --------------------------------------------------------------------------- 9303 // Pre-construction: record ingredients whose recipes we'll need to further 9304 // process after constructing the initial VPlan. 9305 // --------------------------------------------------------------------------- 9306 9307 // Mark instructions we'll need to sink later and their targets as 9308 // ingredients whose recipe we'll need to record. 9309 for (auto &Entry : SinkAfter) { 9310 RecipeBuilder.recordRecipeOf(Entry.first); 9311 RecipeBuilder.recordRecipeOf(Entry.second); 9312 } 9313 for (auto &Reduction : CM.getInLoopReductionChains()) { 9314 PHINode *Phi = Reduction.first; 9315 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); 9316 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9317 9318 RecipeBuilder.recordRecipeOf(Phi); 9319 for (auto &R : ReductionOperations) { 9320 RecipeBuilder.recordRecipeOf(R); 9321 // For min/max reductions, where we have a pair of icmp/select, we also 9322 // need to record the ICmp recipe, so it can be removed later. 9323 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 9324 "Only min/max recurrences allowed for inloop reductions"); 9325 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 9326 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 9327 } 9328 } 9329 9330 // For each interleave group which is relevant for this (possibly trimmed) 9331 // Range, add it to the set of groups to be later applied to the VPlan and add 9332 // placeholders for its members' Recipes which we'll be replacing with a 9333 // single VPInterleaveRecipe.
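  // Illustrative example (an assumption about typical input, not code from
  // this file): in
  //   for (int i = 0; i < n; ++i) { s0 += a[2*i]; s1 += a[2*i+1]; }
  // the two loads form an interleave group with factor 2; the recipes widening
  // the individual members are later replaced by a single VPInterleaveRecipe
  // that performs one wide access and de-interleaves (or re-interleaves) the
  // lanes.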
9334 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 9335 auto applyIG = [IG, this](ElementCount VF) -> bool { 9336 return (VF.isVector() && // Query is illegal for VF == 1 9337 CM.getWideningDecision(IG->getInsertPos(), VF) == 9338 LoopVectorizationCostModel::CM_Interleave); 9339 }; 9340 if (!getDecisionAndClampRange(applyIG, Range)) 9341 continue; 9342 InterleaveGroups.insert(IG); 9343 for (unsigned i = 0; i < IG->getFactor(); i++) 9344 if (Instruction *Member = IG->getMember(i)) 9345 RecipeBuilder.recordRecipeOf(Member); 9346 } 9347 9348 // --------------------------------------------------------------------------- 9349 // Build initial VPlan: Scan the body of the loop in a topological order to 9350 // visit each basic block after having visited its predecessor basic blocks. 9351 // --------------------------------------------------------------------------- 9352 9353 auto Plan = std::make_unique<VPlan>(); 9354 9355 // Scan the body of the loop in a topological order to visit each basic block 9356 // after having visited its predecessor basic blocks. 9357 LoopBlocksDFS DFS(OrigLoop); 9358 DFS.perform(LI); 9359 9360 VPBasicBlock *VPBB = nullptr; 9361 VPBasicBlock *HeaderVPBB = nullptr; 9362 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 9363 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9364 // Relevant instructions from basic block BB will be grouped into VPRecipe 9365 // ingredients and fill a new VPBasicBlock. 9366 unsigned VPBBsForBB = 0; 9367 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 9368 if (VPBB) 9369 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 9370 else { 9371 auto *TopRegion = new VPRegionBlock("vector loop"); 9372 TopRegion->setEntry(FirstVPBBForBB); 9373 Plan->setEntry(TopRegion); 9374 HeaderVPBB = FirstVPBBForBB; 9375 } 9376 VPBB = FirstVPBBForBB; 9377 Builder.setInsertPoint(VPBB); 9378 9379 // Introduce each ingredient into VPlan. 9380 // TODO: Model and preserve debug intrinsics in VPlan. 9381 for (Instruction &I : BB->instructionsWithoutDebug()) { 9382 Instruction *Instr = &I; 9383 9384 // First filter out irrelevant instructions, to ensure no recipes are 9385 // built for them. 9386 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9387 continue; 9388 9389 SmallVector<VPValue *, 4> Operands; 9390 auto *Phi = dyn_cast<PHINode>(Instr); 9391 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9392 Operands.push_back(Plan->getOrAddVPValue( 9393 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9394 } else { 9395 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9396 Operands = {OpRange.begin(), OpRange.end()}; 9397 } 9398 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9399 Instr, Operands, Range, Plan)) { 9400 // If Instr can be simplified to an existing VPValue, use it. 9401 if (RecipeOrValue.is<VPValue *>()) { 9402 auto *VPV = RecipeOrValue.get<VPValue *>(); 9403 Plan->addVPValue(Instr, VPV); 9404 // If the re-used value is a recipe, register the recipe for the 9405 // instruction, in case the recipe for Instr needs to be recorded. 9406 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9407 RecipeBuilder.setRecipe(Instr, R); 9408 continue; 9409 } 9410 // Otherwise, add the new recipe.
9411 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9412 for (auto *Def : Recipe->definedValues()) { 9413 auto *UV = Def->getUnderlyingValue(); 9414 Plan->addVPValue(UV, Def); 9415 } 9416 9417 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 9418 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 9419 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 9420 // of the header block. That can happen for truncates of induction 9421 // variables. Those recipes are moved to the phi section of the header 9422 // block after applying SinkAfter, which relies on the original 9423 // position of the trunc. 9424 assert(isa<TruncInst>(Instr)); 9425 InductionsToMove.push_back( 9426 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 9427 } 9428 RecipeBuilder.setRecipe(Instr, Recipe); 9429 VPBB->appendRecipe(Recipe); 9430 continue; 9431 } 9432 9433 // Otherwise, if all widening options failed, Instruction is to be 9434 // replicated. This may create a successor for VPBB. 9435 VPBasicBlock *NextVPBB = 9436 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9437 if (NextVPBB != VPBB) { 9438 VPBB = NextVPBB; 9439 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9440 : ""); 9441 } 9442 } 9443 } 9444 9445 assert(isa<VPRegionBlock>(Plan->getEntry()) && 9446 !Plan->getEntry()->getEntryBasicBlock()->empty() && 9447 "entry block must be set to a VPRegionBlock having a non-empty entry " 9448 "VPBasicBlock"); 9449 cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB); 9450 RecipeBuilder.fixHeaderPhis(); 9451 9452 // --------------------------------------------------------------------------- 9453 // Transform initial VPlan: Apply previously taken decisions, in order, to 9454 // bring the VPlan to its final state. 9455 // --------------------------------------------------------------------------- 9456 9457 // Apply Sink-After legal constraints. 9458 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9459 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9460 if (Region && Region->isReplicator()) { 9461 assert(Region->getNumSuccessors() == 1 && 9462 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9463 assert(R->getParent()->size() == 1 && 9464 "A recipe in an original replicator region must be the only " 9465 "recipe in its block"); 9466 return Region; 9467 } 9468 return nullptr; 9469 }; 9470 for (auto &Entry : SinkAfter) { 9471 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9472 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9473 9474 auto *TargetRegion = GetReplicateRegion(Target); 9475 auto *SinkRegion = GetReplicateRegion(Sink); 9476 if (!SinkRegion) { 9477 // If the sink source is not a replicate region, sink the recipe directly. 9478 if (TargetRegion) { 9479 // The target is in a replication region, make sure to move Sink to 9480 // the block after it, not into the replication region itself. 9481 VPBasicBlock *NextBlock = 9482 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9483 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9484 } else 9485 Sink->moveAfter(Target); 9486 continue; 9487 } 9488 9489 // The sink source is in a replicate region. Unhook the region from the CFG. 
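    // The steps below: first splice SinkRegion out of the CFG by connecting
    // its single predecessor directly to its single successor; then re-insert
    // it either right after TargetRegion, or, when the target is a plain
    // recipe, after a split of the target's block.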
9490 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9491 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9492 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9493 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9494 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9495 9496 if (TargetRegion) { 9497 // The target recipe is also in a replicate region, move the sink region 9498 // after the target region. 9499 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9500 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9501 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9502 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9503 } else { 9504 // The sink source is in a replicate region, we need to move the whole 9505 // replicate region, which should only contain a single recipe in the 9506 // main block. 9507 auto *SplitBlock = 9508 Target->getParent()->splitAt(std::next(Target->getIterator())); 9509 9510 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9511 9512 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9513 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9514 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9515 if (VPBB == SplitPred) 9516 VPBB = SplitBlock; 9517 } 9518 } 9519 9520 // Now that sink-after is done, move induction recipes for optimized truncates 9521 // to the phi section of the header block. 9522 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 9523 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9524 9525 // Adjust the recipes for any inloop reductions. 9526 adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start); 9527 9528 // Introduce a recipe to combine the incoming and previous values of a 9529 // first-order recurrence. 9530 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9531 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9532 if (!RecurPhi) 9533 continue; 9534 9535 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9536 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 9537 auto *Region = GetReplicateRegion(PrevRecipe); 9538 if (Region) 9539 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9540 if (Region || PrevRecipe->isPhi()) 9541 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 9542 else 9543 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9544 9545 auto *RecurSplice = cast<VPInstruction>( 9546 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9547 {RecurPhi, RecurPhi->getBackedgeValue()})); 9548 9549 RecurPhi->replaceAllUsesWith(RecurSplice); 9550 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9551 // all users. 9552 RecurSplice->setOperand(0, RecurPhi); 9553 } 9554 9555 // Interleave memory: for each Interleave Group we marked earlier as relevant 9556 // for this VPlan, replace the Recipes widening its memory instructions with a 9557 // single VPInterleaveRecipe at its insertion point. 
9558 for (auto IG : InterleaveGroups) { 9559 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9560 RecipeBuilder.getRecipe(IG->getInsertPos())); 9561 SmallVector<VPValue *, 4> StoredValues; 9562 for (unsigned i = 0; i < IG->getFactor(); ++i) 9563 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9564 auto *StoreR = 9565 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9566 StoredValues.push_back(StoreR->getStoredValue()); 9567 } 9568 9569 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9570 Recipe->getMask()); 9571 VPIG->insertBefore(Recipe); 9572 unsigned J = 0; 9573 for (unsigned i = 0; i < IG->getFactor(); ++i) 9574 if (Instruction *Member = IG->getMember(i)) { 9575 if (!Member->getType()->isVoidTy()) { 9576 VPValue *OriginalV = Plan->getVPValue(Member); 9577 Plan->removeVPValueFor(Member); 9578 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9579 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9580 J++; 9581 } 9582 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9583 } 9584 } 9585 9586 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9587 // in ways that accessing values using original IR values is incorrect. 9588 Plan->disableValue2VPValue(); 9589 9590 VPlanTransforms::sinkScalarOperands(*Plan); 9591 VPlanTransforms::mergeReplicateRegions(*Plan); 9592 9593 std::string PlanName; 9594 raw_string_ostream RSO(PlanName); 9595 ElementCount VF = Range.Start; 9596 Plan->addVF(VF); 9597 RSO << "Initial VPlan for VF={" << VF; 9598 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9599 Plan->addVF(VF); 9600 RSO << "," << VF; 9601 } 9602 RSO << "},UF>=1"; 9603 RSO.flush(); 9604 Plan->setName(PlanName); 9605 9606 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 9607 return Plan; 9608 } 9609 9610 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9611 // Outer loop handling: They may require CFG and instruction level 9612 // transformations before even evaluating whether vectorization is profitable. 9613 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9614 // the vectorization pipeline. 9615 assert(!OrigLoop->isInnermost()); 9616 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9617 9618 // Create new empty VPlan 9619 auto Plan = std::make_unique<VPlan>(); 9620 9621 // Build hierarchical CFG 9622 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9623 HCFGBuilder.buildHierarchicalCFG(); 9624 9625 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9626 VF *= 2) 9627 Plan->addVF(VF); 9628 9629 if (EnableVPlanPredication) { 9630 VPlanPredicator VPP(*Plan); 9631 VPP.predicate(); 9632 9633 // Avoid running transformation to recipes until masked code generation in 9634 // VPlan-native path is in place. 9635 return Plan; 9636 } 9637 9638 SmallPtrSet<Instruction *, 1> DeadInstructions; 9639 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, 9640 Legal->getInductionVars(), 9641 DeadInstructions, *PSE.getSE()); 9642 return Plan; 9643 } 9644 9645 // Adjust the recipes for reductions. For in-loop reductions the chain of 9646 // instructions leading from the loop exit instr to the phi need to be converted 9647 // to reductions, with one operand being vector and the other being the scalar 9648 // reduction chain. For other reductions, a select is introduced between the phi 9649 // and live-out recipes when folding the tail. 
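// For example (illustrative only), an in-loop integer add reduction such as
//   int s = 0;
//   for (int i = 0; i < n; ++i)
//     s += A[i];
// has the chain {phi, add}: the widened add is replaced by a VPReductionRecipe
// whose chain operand stays scalar while its other operand is the widened
// vector of A[i] values. For min/max reductions the chain is formed by the
// select instructions instead.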
void LoopVectorizationPlanner::adjustRecipesForReductions(
    VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
    ElementCount MinVF) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
      continue;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
             "Only min/max recurrences allowed for inloop reductions");
      // Recognize a call to the llvm.fmuladd intrinsic.
      bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
      assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
             "Expected instruction to be a call to the llvm.fmuladd intrinsic");
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSelectSC");
        FirstOpId = 1;
      } else {
        assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
                (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
               "Expected to replace a VPWidenSC");
        FirstOpId = 0;
      }
      unsigned VecOpId =
          R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
      VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));

      auto *CondOp = CM.foldTailByMasking()
                         ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
                         : nullptr;

      if (IsFMulAdd) {
        // If the instruction is a call to the llvm.fmuladd intrinsic then we
        // need to create an fmul recipe to use as the vector operand for the
        // fadd reduction.
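        // For example (illustrative only), a source reduction
        //   for (int i = 0; i < n; ++i)
        //     s += A[i] * B[i];
        // may reach this point as a call to llvm.fmuladd(A[i], B[i], s). The
        // multiply is peeled off into a widened FMul of the two multiplicands,
        // and only the final add is performed by the reduction recipe, using
        // the FMul result as its vector operand.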
9700 VPInstruction *FMulRecipe = new VPInstruction( 9701 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9702 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9703 WidenRecipe->getParent()->insert(FMulRecipe, 9704 WidenRecipe->getIterator()); 9705 VecOp = FMulRecipe; 9706 } 9707 VPReductionRecipe *RedRecipe = 9708 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9709 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9710 Plan->removeVPValueFor(R); 9711 Plan->addVPValue(R, RedRecipe); 9712 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9713 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9714 WidenRecipe->eraseFromParent(); 9715 9716 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9717 VPRecipeBase *CompareRecipe = 9718 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9719 assert(isa<VPWidenRecipe>(CompareRecipe) && 9720 "Expected to replace a VPWidenSC"); 9721 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9722 "Expected no remaining users"); 9723 CompareRecipe->eraseFromParent(); 9724 } 9725 Chain = R; 9726 } 9727 } 9728 9729 // If tail is folded by masking, introduce selects between the phi 9730 // and the live-out instruction of each reduction, at the end of the latch. 9731 if (CM.foldTailByMasking()) { 9732 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9733 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9734 if (!PhiR || PhiR->isInLoop()) 9735 continue; 9736 Builder.setInsertPoint(LatchVPBB); 9737 VPValue *Cond = 9738 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9739 VPValue *Red = PhiR->getBackedgeValue(); 9740 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9741 } 9742 } 9743 } 9744 9745 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9746 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9747 VPSlotTracker &SlotTracker) const { 9748 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9749 IG->getInsertPos()->printAsOperand(O, false); 9750 O << ", "; 9751 getAddr()->printAsOperand(O, SlotTracker); 9752 VPValue *Mask = getMask(); 9753 if (Mask) { 9754 O << ", "; 9755 Mask->printAsOperand(O, SlotTracker); 9756 } 9757 9758 unsigned OpIdx = 0; 9759 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9760 if (!IG->getMember(i)) 9761 continue; 9762 if (getNumStoreOperands() > 0) { 9763 O << "\n" << Indent << " store "; 9764 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9765 O << " to index " << i; 9766 } else { 9767 O << "\n" << Indent << " "; 9768 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9769 O << " = load from index " << i; 9770 } 9771 ++OpIdx; 9772 } 9773 } 9774 #endif 9775 9776 void VPWidenCallRecipe::execute(VPTransformState &State) { 9777 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9778 *this, State); 9779 } 9780 9781 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9782 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9783 this, *this, InvariantCond, State); 9784 } 9785 9786 void VPWidenRecipe::execute(VPTransformState &State) { 9787 State.ILV->widenInstruction(*getUnderlyingInstr(), this, State); 9788 } 9789 9790 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9791 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9792 // Construct a vector GEP by widening the operands of the scalar GEP as 9793 // necessary. 
We mark the vector GEP 'inbounds' if appropriate. A GEP 9794 // results in a vector of pointers when at least one operand of the GEP 9795 // is vector-typed. Thus, to keep the representation compact, we only use 9796 // vector-typed operands for loop-varying values. 9797 9798 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9799 // If we are vectorizing, but the GEP has only loop-invariant operands, 9800 // the GEP we build (by only using vector-typed operands for 9801 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9802 // produce a vector of pointers, we need to either arbitrarily pick an 9803 // operand to broadcast, or broadcast a clone of the original GEP. 9804 // Here, we broadcast a clone of the original. 9805 // 9806 // TODO: If at some point we decide to scalarize instructions having 9807 // loop-invariant operands, this special case will no longer be 9808 // required. We would add the scalarization decision to 9809 // collectLoopScalars() and teach getVectorValue() to broadcast 9810 // the lane-zero scalar value. 9811 auto *Clone = State.Builder.Insert(GEP->clone()); 9812 for (unsigned Part = 0; Part < State.UF; ++Part) { 9813 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9814 State.set(this, EntryPart, Part); 9815 State.ILV->addMetadata(EntryPart, GEP); 9816 } 9817 } else { 9818 // If the GEP has at least one loop-varying operand, we are sure to 9819 // produce a vector of pointers. But if we are only unrolling, we want 9820 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9821 // produce with the code below will be scalar (if VF == 1) or vector 9822 // (otherwise). Note that for the unroll-only case, we still maintain 9823 // values in the vector mapping with initVector, as we do for other 9824 // instructions. 9825 for (unsigned Part = 0; Part < State.UF; ++Part) { 9826 // The pointer operand of the new GEP. If it's loop-invariant, we 9827 // won't broadcast it. 9828 auto *Ptr = IsPtrLoopInvariant 9829 ? State.get(getOperand(0), VPIteration(0, 0)) 9830 : State.get(getOperand(0), Part); 9831 9832 // Collect all the indices for the new GEP. If any index is 9833 // loop-invariant, we won't broadcast it. 9834 SmallVector<Value *, 4> Indices; 9835 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 9836 VPValue *Operand = getOperand(I); 9837 if (IsIndexLoopInvariant[I - 1]) 9838 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 9839 else 9840 Indices.push_back(State.get(Operand, Part)); 9841 } 9842 9843 // If the GEP instruction is vectorized and was in a basic block that 9844 // needed predication, we can't propagate the poison-generating 'inbounds' 9845 // flag. The control flow has been linearized and the GEP is no longer 9846 // guarded by the predicate, which could make the 'inbounds' properties to 9847 // no longer hold. 9848 bool IsInBounds = 9849 GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; 9850 9851 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 9852 // but it should be a vector, otherwise. 9853 auto *NewGEP = IsInBounds 9854 ? 
State.Builder.CreateInBoundsGEP( 9855 GEP->getSourceElementType(), Ptr, Indices) 9856 : State.Builder.CreateGEP(GEP->getSourceElementType(), 9857 Ptr, Indices); 9858 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 9859 "NewGEP is not a pointer vector"); 9860 State.set(this, NewGEP, Part); 9861 State.ILV->addMetadata(NewGEP, GEP); 9862 } 9863 } 9864 } 9865 9866 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9867 assert(!State.Instance && "Int or FP induction being replicated."); 9868 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9869 getTruncInst(), getVPValue(0), 9870 getCastValue(), State); 9871 } 9872 9873 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9874 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9875 State); 9876 } 9877 9878 void VPBlendRecipe::execute(VPTransformState &State) { 9879 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9880 // We know that all PHIs in non-header blocks are converted into 9881 // selects, so we don't have to worry about the insertion order and we 9882 // can just use the builder. 9883 // At this point we generate the predication tree. There may be 9884 // duplications since this is a simple recursive scan, but future 9885 // optimizations will clean it up. 9886 9887 unsigned NumIncoming = getNumIncomingValues(); 9888 9889 // Generate a sequence of selects of the form: 9890 // SELECT(Mask3, In3, 9891 // SELECT(Mask2, In2, 9892 // SELECT(Mask1, In1, 9893 // In0))) 9894 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9895 // are essentially undef are taken from In0. 9896 InnerLoopVectorizer::VectorParts Entry(State.UF); 9897 for (unsigned In = 0; In < NumIncoming; ++In) { 9898 for (unsigned Part = 0; Part < State.UF; ++Part) { 9899 // We might have single edge PHIs (blocks) - use an identity 9900 // 'select' for the first PHI operand. 9901 Value *In0 = State.get(getIncomingValue(In), Part); 9902 if (In == 0) 9903 Entry[Part] = In0; // Initialize with the first incoming value. 9904 else { 9905 // Select between the current value and the previous incoming edge 9906 // based on the incoming mask. 9907 Value *Cond = State.get(getMask(In), Part); 9908 Entry[Part] = 9909 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9910 } 9911 } 9912 } 9913 for (unsigned Part = 0; Part < State.UF; ++Part) 9914 State.set(this, Entry[Part], Part); 9915 } 9916 9917 void VPInterleaveRecipe::execute(VPTransformState &State) { 9918 assert(!State.Instance && "Interleave group being replicated."); 9919 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9920 getStoredValues(), getMask()); 9921 } 9922 9923 void VPReductionRecipe::execute(VPTransformState &State) { 9924 assert(!State.Instance && "Reduction being replicated."); 9925 Value *PrevInChain = State.get(getChainOp(), 0); 9926 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9927 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9928 // Propagate the fast-math flags carried by the underlying instruction. 
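  // In outline, per unrolled part below: if the reduction is predicated, the
  // vector operand is first selected against a splat of the reduction's
  // identity value (e.g. 0 for an integer add), so masked-off lanes cannot
  // change the result. Ordered FP reductions then fold the whole vector into
  // the running scalar in lane order; unordered reductions emit one target
  // reduction per part and combine it with the chain value afterwards.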
9929 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9930 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9931 for (unsigned Part = 0; Part < State.UF; ++Part) { 9932 Value *NewVecOp = State.get(getVecOp(), Part); 9933 if (VPValue *Cond = getCondOp()) { 9934 Value *NewCond = State.get(Cond, Part); 9935 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9936 Value *Iden = RdxDesc->getRecurrenceIdentity( 9937 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9938 Value *IdenVec = 9939 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9940 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9941 NewVecOp = Select; 9942 } 9943 Value *NewRed; 9944 Value *NextInChain; 9945 if (IsOrdered) { 9946 if (State.VF.isVector()) 9947 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9948 PrevInChain); 9949 else 9950 NewRed = State.Builder.CreateBinOp( 9951 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9952 NewVecOp); 9953 PrevInChain = NewRed; 9954 } else { 9955 PrevInChain = State.get(getChainOp(), Part); 9956 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9957 } 9958 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9959 NextInChain = 9960 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9961 NewRed, PrevInChain); 9962 } else if (IsOrdered) 9963 NextInChain = NewRed; 9964 else 9965 NextInChain = State.Builder.CreateBinOp( 9966 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9967 PrevInChain); 9968 State.set(this, NextInChain, Part); 9969 } 9970 } 9971 9972 void VPReplicateRecipe::execute(VPTransformState &State) { 9973 if (State.Instance) { // Generate a single instance. 9974 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9975 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance, 9976 IsPredicated, State); 9977 // Insert scalar instance packing it into a vector. 9978 if (AlsoPack && State.VF.isVector()) { 9979 // If we're constructing lane 0, initialize to start from poison. 9980 if (State.Instance->Lane.isFirstLane()) { 9981 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 9982 Value *Poison = PoisonValue::get( 9983 VectorType::get(getUnderlyingValue()->getType(), State.VF)); 9984 State.set(this, Poison, State.Instance->Part); 9985 } 9986 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State); 9987 } 9988 return; 9989 } 9990 9991 // Generate scalar instances for all VF lanes of all UF parts, unless the 9992 // instruction is uniform inwhich case generate only the first lane for each 9993 // of the UF parts. 9994 unsigned EndLane = IsUniform ? 
1 : State.VF.getKnownMinValue(); 9995 assert((!State.VF.isScalable() || IsUniform) && 9996 "Can't scalarize a scalable vector"); 9997 for (unsigned Part = 0; Part < State.UF; ++Part) 9998 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9999 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, 10000 VPIteration(Part, Lane), IsPredicated, 10001 State); 10002 } 10003 10004 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 10005 assert(State.Instance && "Branch on Mask works only on single instance."); 10006 10007 unsigned Part = State.Instance->Part; 10008 unsigned Lane = State.Instance->Lane.getKnownLane(); 10009 10010 Value *ConditionBit = nullptr; 10011 VPValue *BlockInMask = getMask(); 10012 if (BlockInMask) { 10013 ConditionBit = State.get(BlockInMask, Part); 10014 if (ConditionBit->getType()->isVectorTy()) 10015 ConditionBit = State.Builder.CreateExtractElement( 10016 ConditionBit, State.Builder.getInt32(Lane)); 10017 } else // Block in mask is all-one. 10018 ConditionBit = State.Builder.getTrue(); 10019 10020 // Replace the temporary unreachable terminator with a new conditional branch, 10021 // whose two destinations will be set later when they are created. 10022 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 10023 assert(isa<UnreachableInst>(CurrentTerminator) && 10024 "Expected to replace unreachable terminator with conditional branch."); 10025 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 10026 CondBr->setSuccessor(0, nullptr); 10027 ReplaceInstWithInst(CurrentTerminator, CondBr); 10028 } 10029 10030 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 10031 assert(State.Instance && "Predicated instruction PHI works per instance."); 10032 Instruction *ScalarPredInst = 10033 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 10034 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 10035 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 10036 assert(PredicatingBB && "Predicated block has no single predecessor."); 10037 assert(isa<VPReplicateRecipe>(getOperand(0)) && 10038 "operand must be VPReplicateRecipe"); 10039 10040 // By current pack/unpack logic we need to generate only a single phi node: if 10041 // a vector value for the predicated instruction exists at this point it means 10042 // the instruction has vector users only, and a phi for the vector value is 10043 // needed. In this case the recipe of the predicated instruction is marked to 10044 // also do that packing, thereby "hoisting" the insert-element sequence. 10045 // Otherwise, a phi node for the scalar value is needed. 10046 unsigned Part = State.Instance->Part; 10047 if (State.hasVectorValue(getOperand(0), Part)) { 10048 Value *VectorValue = State.get(getOperand(0), Part); 10049 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 10050 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 10051 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 10052 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 10053 if (State.hasVectorValue(this, Part)) 10054 State.reset(this, VPhi, Part); 10055 else 10056 State.set(this, VPhi, Part); 10057 // NOTE: Currently we need to update the value of the operand, so the next 10058 // predicated iteration inserts its generated value in the correct vector. 
10059 State.reset(getOperand(0), VPhi, Part); 10060 } else { 10061 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 10062 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 10063 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 10064 PredicatingBB); 10065 Phi->addIncoming(ScalarPredInst, PredicatedBB); 10066 if (State.hasScalarValue(this, *State.Instance)) 10067 State.reset(this, Phi, *State.Instance); 10068 else 10069 State.set(this, Phi, *State.Instance); 10070 // NOTE: Currently we need to update the value of the operand, so the next 10071 // predicated iteration inserts its generated value in the correct vector. 10072 State.reset(getOperand(0), Phi, *State.Instance); 10073 } 10074 } 10075 10076 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 10077 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 10078 State.ILV->vectorizeMemoryInstruction( 10079 &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), 10080 StoredValue, getMask(), Consecutive, Reverse); 10081 } 10082 10083 // Determine how to lower the scalar epilogue, which depends on 1) optimising 10084 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 10085 // predication, and 4) a TTI hook that analyses whether the loop is suitable 10086 // for predication. 10087 static ScalarEpilogueLowering getScalarEpilogueLowering( 10088 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 10089 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10090 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 10091 LoopVectorizationLegality &LVL) { 10092 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10093 // don't look at hints or options, and don't request a scalar epilogue. 10094 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10095 // LoopAccessInfo (due to code dependency and not being able to reliably get 10096 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10097 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10098 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10099 // back to the old way and vectorize with versioning when forced. See D81345.) 10100 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10101 PGSOQueryType::IRPass) && 10102 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10103 return CM_ScalarEpilogueNotAllowedOptSize; 10104 10105 // 2) If set, obey the directives 10106 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10107 switch (PreferPredicateOverEpilogue) { 10108 case PreferPredicateTy::ScalarEpilogue: 10109 return CM_ScalarEpilogueAllowed; 10110 case PreferPredicateTy::PredicateElseScalarEpilogue: 10111 return CM_ScalarEpilogueNotNeededUsePredicate; 10112 case PreferPredicateTy::PredicateOrDontVectorize: 10113 return CM_ScalarEpilogueNotAllowedUsePredicate; 10114 }; 10115 } 10116 10117 // 3) If set, obey the hints 10118 switch (Hints.getPredicate()) { 10119 case LoopVectorizeHints::FK_Enabled: 10120 return CM_ScalarEpilogueNotNeededUsePredicate; 10121 case LoopVectorizeHints::FK_Disabled: 10122 return CM_ScalarEpilogueAllowed; 10123 }; 10124 10125 // 4) if the TTI hook indicates this is profitable, request predication. 
10126 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10127 LVL.getLAI())) 10128 return CM_ScalarEpilogueNotNeededUsePredicate; 10129 10130 return CM_ScalarEpilogueAllowed; 10131 } 10132 10133 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10134 // If Values have been set for this Def return the one relevant for \p Part. 10135 if (hasVectorValue(Def, Part)) 10136 return Data.PerPartOutput[Def][Part]; 10137 10138 if (!hasScalarValue(Def, {Part, 0})) { 10139 Value *IRV = Def->getLiveInIRValue(); 10140 Value *B = ILV->getBroadcastInstrs(IRV); 10141 set(Def, B, Part); 10142 return B; 10143 } 10144 10145 Value *ScalarValue = get(Def, {Part, 0}); 10146 // If we aren't vectorizing, we can just copy the scalar map values over 10147 // to the vector map. 10148 if (VF.isScalar()) { 10149 set(Def, ScalarValue, Part); 10150 return ScalarValue; 10151 } 10152 10153 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10154 bool IsUniform = RepR && RepR->isUniform(); 10155 10156 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 10157 // Check if there is a scalar value for the selected lane. 10158 if (!hasScalarValue(Def, {Part, LastLane})) { 10159 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 10160 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 10161 "unexpected recipe found to be invariant"); 10162 IsUniform = true; 10163 LastLane = 0; 10164 } 10165 10166 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10167 // Set the insert point after the last scalarized instruction or after the 10168 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10169 // will directly follow the scalar definitions. 10170 auto OldIP = Builder.saveIP(); 10171 auto NewIP = 10172 isa<PHINode>(LastInst) 10173 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10174 : std::next(BasicBlock::iterator(LastInst)); 10175 Builder.SetInsertPoint(&*NewIP); 10176 10177 // However, if we are vectorizing, we need to construct the vector values. 10178 // If the value is known to be uniform after vectorization, we can just 10179 // broadcast the scalar value corresponding to lane zero for each unroll 10180 // iteration. Otherwise, we construct the vector values using 10181 // insertelement instructions. Since the resulting vectors are stored in 10182 // State, we will only generate the insertelements once. 10183 Value *VectorValue = nullptr; 10184 if (IsUniform) { 10185 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10186 set(Def, VectorValue, Part); 10187 } else { 10188 // Initialize packing with insertelements to start from undef. 10189 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10190 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10191 set(Def, Undef, Part); 10192 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10193 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10194 VectorValue = get(Def, Part); 10195 } 10196 Builder.restoreIP(OldIP); 10197 return VectorValue; 10198 } 10199 10200 // Process the loop in the VPlan-native vectorization path. This path builds 10201 // VPlan upfront in the vectorization pipeline, which allows to apply 10202 // VPlan-to-VPlan transformations from the very beginning without modifying the 10203 // input LLVM IR. 
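// For illustration only: this path is typically exercised for an explicitly
// annotated outer loop, e.g.
//   #pragma clang loop vectorize(enable)
//   for (int i = 0; i < n; ++i)     // outer loop handled here
//     for (int j = 0; j < m; ++j)
//       A[i][j] += B[i][j];
// and only when EnableVPlanNativePath is set (see the assert below).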
10204 static bool processLoopInVPlanNativePath( 10205 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10206 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10207 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10208 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10209 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10210 LoopVectorizationRequirements &Requirements) { 10211 10212 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10213 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10214 return false; 10215 } 10216 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10217 Function *F = L->getHeader()->getParent(); 10218 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10219 10220 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10221 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10222 10223 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10224 &Hints, IAI); 10225 // Use the planner for outer loop vectorization. 10226 // TODO: CM is not used at this point inside the planner. Turn CM into an 10227 // optional argument if we don't need it in the future. 10228 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10229 Requirements, ORE); 10230 10231 // Get user vectorization factor. 10232 ElementCount UserVF = Hints.getWidth(); 10233 10234 CM.collectElementTypesForWidening(); 10235 10236 // Plan how to best vectorize, return the best VF and its cost. 10237 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10238 10239 // If we are stress testing VPlan builds, do not attempt to generate vector 10240 // code. Masked vector code generation support will follow soon. 10241 // Also, do not attempt to vectorize if no vector code will be produced. 10242 if (VPlanBuildStressTest || EnableVPlanPredication || 10243 VectorizationFactor::Disabled() == VF) 10244 return false; 10245 10246 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10247 10248 { 10249 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10250 F->getParent()->getDataLayout()); 10251 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10252 &CM, BFI, PSI, Checks); 10253 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10254 << L->getHeader()->getParent()->getName() << "\"\n"); 10255 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); 10256 } 10257 10258 // Mark the loop as already vectorized to avoid vectorizing again. 10259 Hints.setAlreadyVectorized(); 10260 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10261 return true; 10262 } 10263 10264 // Emit a remark if there are stores to floats that required a floating point 10265 // extension. If the vectorized loop was generated with floating point there 10266 // will be a performance penalty from the conversion overhead and the change in 10267 // the vector width. 10268 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10269 SmallVector<Instruction *, 4> Worklist; 10270 for (BasicBlock *BB : L->getBlocks()) { 10271 for (Instruction &Inst : *BB) { 10272 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10273 if (S->getValueOperand()->getType()->isFloatTy()) 10274 Worklist.push_back(S); 10275 } 10276 } 10277 } 10278 10279 // Traverse the floating point stores upwards searching, for floating point 10280 // conversions. 
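  // For illustration only (hypothetical): a store such as
  //   float *Out; float *In; double Scale;
  //   Out[i] = In[i] * Scale;
  // promotes In[i] to double via fpext, so the vectorized multiply operates on
  // double elements (halving the number of lanes per vector) before the result
  // is truncated back to float for the store; the walk below looks for exactly
  // such conversions.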
10281 SmallPtrSet<const Instruction *, 4> Visited; 10282 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10283 while (!Worklist.empty()) { 10284 auto *I = Worklist.pop_back_val(); 10285 if (!L->contains(I)) 10286 continue; 10287 if (!Visited.insert(I).second) 10288 continue; 10289 10290 // Emit a remark if the floating point store required a floating 10291 // point conversion. 10292 // TODO: More work could be done to identify the root cause such as a 10293 // constant or a function return type and point the user to it. 10294 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10295 ORE->emit([&]() { 10296 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10297 I->getDebugLoc(), L->getHeader()) 10298 << "floating point conversion changes vector width. " 10299 << "Mixed floating point precision requires an up/down " 10300 << "cast that will negatively impact performance."; 10301 }); 10302 10303 for (Use &Op : I->operands()) 10304 if (auto *OpI = dyn_cast<Instruction>(Op)) 10305 Worklist.push_back(OpI); 10306 } 10307 } 10308 10309 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10310 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10311 !EnableLoopInterleaving), 10312 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10313 !EnableLoopVectorization) {} 10314 10315 bool LoopVectorizePass::processLoop(Loop *L) { 10316 assert((EnableVPlanNativePath || L->isInnermost()) && 10317 "VPlan-native path is not enabled. Only process inner loops."); 10318 10319 #ifndef NDEBUG 10320 const std::string DebugLocStr = getDebugLocString(L); 10321 #endif /* NDEBUG */ 10322 10323 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 10324 << L->getHeader()->getParent()->getName() << "\" from " 10325 << DebugLocStr << "\n"); 10326 10327 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 10328 10329 LLVM_DEBUG( 10330 dbgs() << "LV: Loop hints:" 10331 << " force=" 10332 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10333 ? "disabled" 10334 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10335 ? "enabled" 10336 : "?")) 10337 << " width=" << Hints.getWidth() 10338 << " interleave=" << Hints.getInterleave() << "\n"); 10339 10340 // Function containing loop 10341 Function *F = L->getHeader()->getParent(); 10342 10343 // Looking at the diagnostic output is the only way to determine if a loop 10344 // was vectorized (other than looking at the IR or machine code), so it 10345 // is important to generate an optimization remark for each loop. Most of 10346 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10347 // generated as OptimizationRemark and OptimizationRemarkMissed are 10348 // less verbose reporting vectorized loops and unvectorized loops that may 10349 // benefit from vectorization, respectively. 10350 10351 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10352 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10353 return false; 10354 } 10355 10356 PredicatedScalarEvolution PSE(*SE, *L); 10357 10358 // Check if it is legal to vectorize the loop. 
10359 LoopVectorizationRequirements Requirements; 10360 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10361 &Requirements, &Hints, DB, AC, BFI, PSI); 10362 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10363 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10364 Hints.emitRemarkWithHints(); 10365 return false; 10366 } 10367 10368 // Check the function attributes and profiles to find out if this function 10369 // should be optimized for size. 10370 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10371 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10372 10373 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10374 // here. They may require CFG and instruction level transformations before 10375 // even evaluating whether vectorization is profitable. Since we cannot modify 10376 // the incoming IR, we need to build VPlan upfront in the vectorization 10377 // pipeline. 10378 if (!L->isInnermost()) 10379 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10380 ORE, BFI, PSI, Hints, Requirements); 10381 10382 assert(L->isInnermost() && "Inner loop expected."); 10383 10384 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10385 // count by optimizing for size, to minimize overheads. 10386 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10387 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10388 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10389 << "This loop is worth vectorizing only if no scalar " 10390 << "iteration overheads are incurred."); 10391 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10392 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10393 else { 10394 LLVM_DEBUG(dbgs() << "\n"); 10395 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10396 } 10397 } 10398 10399 // Check the function attributes to see if implicit floats are allowed. 10400 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10401 // an integer loop and the vector instructions selected are purely integer 10402 // vector instructions? 10403 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10404 reportVectorizationFailure( 10405 "Can't vectorize when the NoImplicitFloat attribute is used", 10406 "loop not vectorized due to NoImplicitFloat attribute", 10407 "NoImplicitFloat", ORE, L); 10408 Hints.emitRemarkWithHints(); 10409 return false; 10410 } 10411 10412 // Check if the target supports potentially unsafe FP vectorization. 10413 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10414 // for the target we're vectorizing for, to make sure none of the 10415 // additional fp-math flags can help. 10416 if (Hints.isPotentiallyUnsafe() && 10417 TTI->isFPVectorizationPotentiallyUnsafe()) { 10418 reportVectorizationFailure( 10419 "Potentially unsafe FP op prevents vectorization", 10420 "loop not vectorized due to unsafe FP support.", 10421 "UnsafeFP", ORE, L); 10422 Hints.emitRemarkWithHints(); 10423 return false; 10424 } 10425 10426 bool AllowOrderedReductions; 10427 // If the flag is set, use that instead and override the TTI behaviour. 
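  // For illustration only: an "ordered" (strict) FP reduction such as
  //   float s = 0.0f;
  //   for (int i = 0; i < n; ++i)
  //     s += A[i];   // exact FP semantics, no reassociation allowed
  // can only be vectorized as an in-order fold of each vector of A[i] values
  // into the running scalar; canVectorizeFPMath below rejects the loop when
  // that is required but ordered reductions are not allowed or not supported.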
10428 if (ForceOrderedReductions.getNumOccurrences() > 0) 10429 AllowOrderedReductions = ForceOrderedReductions; 10430 else 10431 AllowOrderedReductions = TTI->enableOrderedReductions(); 10432 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10433 ORE->emit([&]() { 10434 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10435 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10436 ExactFPMathInst->getDebugLoc(), 10437 ExactFPMathInst->getParent()) 10438 << "loop not vectorized: cannot prove it is safe to reorder " 10439 "floating-point operations"; 10440 }); 10441 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10442 "reorder floating-point operations\n"); 10443 Hints.emitRemarkWithHints(); 10444 return false; 10445 } 10446 10447 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10448 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10449 10450 // If an override option has been passed in for interleaved accesses, use it. 10451 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10452 UseInterleaved = EnableInterleavedMemAccesses; 10453 10454 // Analyze interleaved memory accesses. 10455 if (UseInterleaved) { 10456 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10457 } 10458 10459 // Use the cost model. 10460 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10461 F, &Hints, IAI); 10462 CM.collectValuesToIgnore(); 10463 CM.collectElementTypesForWidening(); 10464 10465 // Use the planner for vectorization. 10466 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 10467 Requirements, ORE); 10468 10469 // Get user vectorization factor and interleave count. 10470 ElementCount UserVF = Hints.getWidth(); 10471 unsigned UserIC = Hints.getInterleave(); 10472 10473 // Plan how to best vectorize, return the best VF and its cost. 10474 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10475 10476 VectorizationFactor VF = VectorizationFactor::Disabled(); 10477 unsigned IC = 1; 10478 10479 if (MaybeVF) { 10480 VF = *MaybeVF; 10481 // Select the interleave count. 10482 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); 10483 } 10484 10485 // Identify the diagnostic messages that should be produced. 10486 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10487 bool VectorizeLoop = true, InterleaveLoop = true; 10488 if (VF.Width.isScalar()) { 10489 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10490 VecDiagMsg = std::make_pair( 10491 "VectorizationNotBeneficial", 10492 "the cost-model indicates that vectorization is not beneficial"); 10493 VectorizeLoop = false; 10494 } 10495 10496 if (!MaybeVF && UserIC > 1) { 10497 // Tell the user interleaving was avoided up-front, despite being explicitly 10498 // requested. 10499 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 10500 "interleaving should be avoided up front\n"); 10501 IntDiagMsg = std::make_pair( 10502 "InterleavingAvoided", 10503 "Ignoring UserIC, because interleaving was avoided up front"); 10504 InterleaveLoop = false; 10505 } else if (IC == 1 && UserIC <= 1) { 10506 // Tell the user interleaving is not beneficial. 
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out not
    // to be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not beneficial to vectorize the loop, then
      // interleave it.
10583 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10584 &CM, BFI, PSI, Checks); 10585 10586 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10587 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); 10588 10589 ORE->emit([&]() { 10590 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10591 L->getHeader()) 10592 << "interleaved loop (interleaved count: " 10593 << NV("InterleaveCount", IC) << ")"; 10594 }); 10595 } else { 10596 // If we decided that it is *legal* to vectorize the loop, then do it. 10597 10598 // Consider vectorizing the epilogue too if it's profitable. 10599 VectorizationFactor EpilogueVF = 10600 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10601 if (EpilogueVF.Width.isVector()) { 10602 10603 // The first pass vectorizes the main loop and creates a scalar epilogue 10604 // to be vectorized by executing the plan (potentially with a different 10605 // factor) again shortly afterwards. 10606 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10607 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10608 EPI, &LVL, &CM, BFI, PSI, Checks); 10609 10610 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10611 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10612 DT); 10613 ++LoopsVectorized; 10614 10615 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10616 formLCSSARecursively(*L, *DT, LI, SE); 10617 10618 // Second pass vectorizes the epilogue and adjusts the control flow 10619 // edges from the first pass. 10620 EPI.MainLoopVF = EPI.EpilogueVF; 10621 EPI.MainLoopUF = EPI.EpilogueUF; 10622 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10623 ORE, EPI, &LVL, &CM, BFI, PSI, 10624 Checks); 10625 10626 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10627 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10628 DT); 10629 ++LoopsEpilogueVectorized; 10630 10631 if (!MainILV.areSafetyChecksAdded()) 10632 DisableRuntimeUnroll = true; 10633 } else { 10634 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10635 &LVL, &CM, BFI, PSI, Checks); 10636 10637 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10638 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10639 ++LoopsVectorized; 10640 10641 // Add metadata to disable runtime unrolling a scalar loop when there 10642 // are no runtime checks about strides and memory. A scalar loop that is 10643 // rarely used is not worth unrolling. 10644 if (!LB.areSafetyChecksAdded()) 10645 DisableRuntimeUnroll = true; 10646 } 10647 // Report the vectorization decision. 10648 ORE->emit([&]() { 10649 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10650 L->getHeader()) 10651 << "vectorized loop (vectorization width: " 10652 << NV("VectorizationFactor", VF.Width) 10653 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10654 }); 10655 } 10656 10657 if (ORE->allowExtraAnalysis(LV_NAME)) 10658 checkMixedPrecision(L, ORE); 10659 } 10660 10661 Optional<MDNode *> RemainderLoopID = 10662 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10663 LLVMLoopVectorizeFollowupEpilogue}); 10664 if (RemainderLoopID.hasValue()) { 10665 L->setLoopID(RemainderLoopID.getValue()); 10666 } else { 10667 if (DisableRuntimeUnroll) 10668 AddRuntimeUnrollDisableMetaData(L); 10669 10670 // Mark the loop as already vectorized to avoid vectorizing again. 
10671 Hints.setAlreadyVectorized(); 10672 } 10673 10674 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10675 return true; 10676 } 10677 10678 LoopVectorizeResult LoopVectorizePass::runImpl( 10679 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10680 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10681 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10682 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10683 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10684 SE = &SE_; 10685 LI = &LI_; 10686 TTI = &TTI_; 10687 DT = &DT_; 10688 BFI = &BFI_; 10689 TLI = TLI_; 10690 AA = &AA_; 10691 AC = &AC_; 10692 GetLAA = &GetLAA_; 10693 DB = &DB_; 10694 ORE = &ORE_; 10695 PSI = PSI_; 10696 10697 // Don't attempt if 10698 // 1. the target claims to have no vector registers, and 10699 // 2. interleaving won't help ILP. 10700 // 10701 // The second condition is necessary because, even if the target has no 10702 // vector registers, loop vectorization may still enable scalar 10703 // interleaving. 10704 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10705 TTI->getMaxInterleaveFactor(1) < 2) 10706 return LoopVectorizeResult(false, false); 10707 10708 bool Changed = false, CFGChanged = false; 10709 10710 // The vectorizer requires loops to be in simplified form. 10711 // Since simplification may add new inner loops, it has to run before the 10712 // legality and profitability checks. This means running the loop vectorizer 10713 // will simplify all loops, regardless of whether anything end up being 10714 // vectorized. 10715 for (auto &L : *LI) 10716 Changed |= CFGChanged |= 10717 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10718 10719 // Build up a worklist of inner-loops to vectorize. This is necessary as 10720 // the act of vectorizing or partially unrolling a loop creates new loops 10721 // and can invalidate iterators across the loops. 10722 SmallVector<Loop *, 8> Worklist; 10723 10724 for (Loop *L : *LI) 10725 collectSupportedLoops(*L, LI, ORE, Worklist); 10726 10727 LoopsAnalyzed += Worklist.size(); 10728 10729 // Now walk the identified inner loops. 10730 while (!Worklist.empty()) { 10731 Loop *L = Worklist.pop_back_val(); 10732 10733 // For the inner loops we actually process, form LCSSA to simplify the 10734 // transform. 10735 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10736 10737 Changed |= CFGChanged |= processLoop(L); 10738 } 10739 10740 // Process each loop nest in the function. 
10741 return LoopVectorizeResult(Changed, CFGChanged); 10742 } 10743 10744 PreservedAnalyses LoopVectorizePass::run(Function &F, 10745 FunctionAnalysisManager &AM) { 10746 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 10747 auto &LI = AM.getResult<LoopAnalysis>(F); 10748 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 10749 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 10750 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 10751 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 10752 auto &AA = AM.getResult<AAManager>(F); 10753 auto &AC = AM.getResult<AssumptionAnalysis>(F); 10754 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 10755 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10756 10757 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 10758 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 10759 [&](Loop &L) -> const LoopAccessInfo & { 10760 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 10761 TLI, TTI, nullptr, nullptr, nullptr}; 10762 return LAM.getResult<LoopAccessAnalysis>(L, AR); 10763 }; 10764 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 10765 ProfileSummaryInfo *PSI = 10766 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 10767 LoopVectorizeResult Result = 10768 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 10769 if (!Result.MadeAnyChange) 10770 return PreservedAnalyses::all(); 10771 PreservedAnalyses PA; 10772 10773 // We currently do not preserve loopinfo/dominator analyses with outer loop 10774 // vectorization. Until this is addressed, mark these analyses as preserved 10775 // only for non-VPlan-native path. 10776 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 10777 if (!EnableVPlanNativePath) { 10778 PA.preserve<LoopAnalysis>(); 10779 PA.preserve<DominatorTreeAnalysis>(); 10780 } 10781 if (!Result.MadeCFGChange) 10782 PA.preserveSet<CFGAnalyses>(); 10783 return PA; 10784 } 10785 10786 void LoopVectorizePass::printPipeline( 10787 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { 10788 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( 10789 OS, MapClassName2PassName); 10790 10791 OS << "<"; 10792 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;"; 10793 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;"; 10794 OS << ">"; 10795 } 10796
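// Usage sketch (assuming the parameter spellings accepted by PassBuilder match
// the ones printed above): the printed options round-trip through a pipeline
// string such as
//   opt -passes='function(loop-vectorize<no-interleave-forced-only;vectorize-forced-only>)'
// where each parameter toggles the corresponding *OnlyWhenForced flag.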