//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
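//
// As a simplified illustration (C-like pseudocode, not actual IR): for a
// vectorization factor of 4, a scalar loop such as
//
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten into
//
//   for (i = 0; i < n - n % 4; i += 4)
//     a[i:i+3] = b[i:i+3] + c[i:i+3];   // one wide SIMD add per iteration
//
// with the remaining n % 4 iterations handled by a scalar epilogue loop.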
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector
// body and predicate the instructions accordingly.
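// For example, a loop with a trip count of 10 and VF = 4 can either execute
// two vector iterations followed by a two-iteration scalar epilogue, or fold
// the tail and execute three predicated vector iterations, with the mask
// disabling the two excess lanes in the final iteration.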
// If tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a "
             "loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. "
             "Mostly useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
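  // This is SCEV's small constant maximum trip count, i.e. a conservative
  // upper bound rather than an exact count, which is why it is only used as a
  // last resort here.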
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops.
  virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop.
  /// Each value from the original loop, when vectorized, is represented by UF
  /// vector values in the new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates the scalar instance for the lane and part given by \p
  /// Instance. Uses the VPValue operands from \p RepRecipe instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Set the debug location in the builder \p CustomBuilder using the debug
  /// location in \p V. If \p CustomBuilder is None then it uses the class
  /// member's Builder.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilderBase *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  // Returns the resume value (bc.merge.rdx) for a reduction as
  // generated by fixReduction.
  PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Introduce a conditional branch (on true, condition to be set later) at the
  /// end of the header (which is also the latch), connecting it to itself
  /// (across the backedge) and to the exit block of \p L.
  void createHeaderBranch(Loop *L);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  /// Also allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
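  /// In other words, UF is the interleave count: the vector loop body is
  /// replicated UF times, so each original value has UF vector copies per
  /// wide iteration.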
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
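/// The first stage vectorizes the main loop and records the generated runtime
/// checks and trip counts; the second stage reuses them when vectorizing the
/// epilogue loop.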
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to set up the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *>
  createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilderBase *> CustomBuilder) {
  IRBuilderBase *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When a FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {
  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
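      // (e.g. nuw/nsw on an add or inbounds on a GEP feeding the address);
      // such flags may no longer be justified once the block is executed
      // unconditionally in the vector loop.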
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
            Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}

void InnerLoopVectorizer::addMetadata(Instruction *To, Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

PHINode *InnerLoopVectorizer::getReductionResumeValue(
    const RecurrenceDescriptor &RdxDesc) {
  auto It = ReductionResumeValues.find(&RdxDesc);
  assert(It != ReductionResumeValues.end() &&
         "Expected to find a resume value for the reduction.");
  return It->second;
}

namespace llvm {

// Loop vectorization cost-model hint for how the scalar epilogue loop should
// be lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
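  // Tail folding is preferred instead; if tail folding turns out to be
  // impossible, a scalar epilogue may still be created.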
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Set up cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned.
  /// VF and LoopCost are the selected vectorization factor and the cost of the
  /// selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. The resulting decision map is used for
  /// building the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
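  /// A uniform value produces the same result for every vector lane, so a
  /// single scalar copy per unrolled part is sufficient rather than a full
  /// vector of values.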
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
1440 if (EnableVPlanNativePath) 1441 return CM_GatherScatter; 1442 1443 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1444 auto Itr = WideningDecisions.find(InstOnVF); 1445 if (Itr == WideningDecisions.end()) 1446 return CM_Unknown; 1447 return Itr->second.first; 1448 } 1449 1450 /// Return the vectorization cost for the given instruction \p I and vector 1451 /// width \p VF. 1452 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1453 assert(VF.isVector() && "Expected VF >=2"); 1454 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1455 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1456 "The cost is not calculated"); 1457 return WideningDecisions[InstOnVF].second; 1458 } 1459 1460 /// Return True if instruction \p I is an optimizable truncate whose operand 1461 /// is an induction variable. Such a truncate will be removed by adding a new 1462 /// induction variable with the destination type. 1463 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1464 // If the instruction is not a truncate, return false. 1465 auto *Trunc = dyn_cast<TruncInst>(I); 1466 if (!Trunc) 1467 return false; 1468 1469 // Get the source and destination types of the truncate. 1470 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1471 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1472 1473 // If the truncate is free for the given types, return false. Replacing a 1474 // free truncate with an induction variable would add an induction variable 1475 // update instruction to each iteration of the loop. We exclude from this 1476 // check the primary induction variable since it will need an update 1477 // instruction regardless. 1478 Value *Op = Trunc->getOperand(0); 1479 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1480 return false; 1481 1482 // If the truncated value is not an induction variable, return false. 1483 return Legal->isInductionPhi(Op); 1484 } 1485 1486 /// Collects the instructions to scalarize for each predicated instruction in 1487 /// the loop. 1488 void collectInstsToScalarize(ElementCount VF); 1489 1490 /// Collect Uniform and Scalar values for the given \p VF. 1491 /// The sets depend on CM decision for Load/Store instructions 1492 /// that may be vectorized as interleave, gather-scatter or scalarized. 1493 void collectUniformsAndScalars(ElementCount VF) { 1494 // Do the analysis once. 1495 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1496 return; 1497 setCostBasedWideningDecision(VF); 1498 collectLoopUniforms(VF); 1499 collectLoopScalars(VF); 1500 } 1501 1502 /// Returns true if the target machine supports masked store operation 1503 /// for the given \p DataType and kind of access to \p Ptr. 1504 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1505 return Legal->isConsecutivePtr(DataType, Ptr) && 1506 TTI.isLegalMaskedStore(DataType, Alignment); 1507 } 1508 1509 /// Returns true if the target machine supports masked load operation 1510 /// for the given \p DataType and kind of access to \p Ptr. 1511 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1512 return Legal->isConsecutivePtr(DataType, Ptr) && 1513 TTI.isLegalMaskedLoad(DataType, Alignment); 1514 } 1515 1516 /// Returns true if the target machine can represent \p V as a masked gather 1517 /// or scatter operation. 
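  ///
  /// For example (illustrative only), an indexed access such as A[B[i]] has
  /// no consecutive pointer, so it can only be widened if TTI reports masked
  /// gathers (for loads) or masked scatters (for stores) as legal for the
  /// widened vector type; otherwise the access has to be scalarized.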
1518 bool isLegalGatherOrScatter(Value *V, 1519 ElementCount VF = ElementCount::getFixed(1)) { 1520 bool LI = isa<LoadInst>(V); 1521 bool SI = isa<StoreInst>(V); 1522 if (!LI && !SI) 1523 return false; 1524 auto *Ty = getLoadStoreType(V); 1525 Align Align = getLoadStoreAlignment(V); 1526 if (VF.isVector()) 1527 Ty = VectorType::get(Ty, VF); 1528 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1529 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1530 } 1531 1532 /// Returns true if the target machine supports all of the reduction 1533 /// variables found for the given VF. 1534 bool canVectorizeReductions(ElementCount VF) const { 1535 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1536 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1537 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1538 })); 1539 } 1540 1541 /// Returns true if \p I is an instruction that will be scalarized with 1542 /// predication when vectorizing \p I with vectorization factor \p VF. Such 1543 /// instructions include conditional stores and instructions that may divide 1544 /// by zero. 1545 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1546 1547 // Returns true if \p I is an instruction that will be predicated either 1548 // through scalar predication or masked load/store or masked gather/scatter. 1549 // \p VF is the vectorization factor that will be used to vectorize \p I. 1550 // Superset of instructions that return true for isScalarWithPredication. 1551 bool isPredicatedInst(Instruction *I, ElementCount VF, 1552 bool IsKnownUniform = false) { 1553 // When we know the load is uniform and the original scalar loop was not 1554 // predicated we don't need to mark it as a predicated instruction. Any 1555 // vectorised blocks created when tail-folding are something artificial we 1556 // have introduced and we know there is always at least one active lane. 1557 // That's why we call Legal->blockNeedsPredication here because it doesn't 1558 // query tail-folding. 1559 if (IsKnownUniform && isa<LoadInst>(I) && 1560 !Legal->blockNeedsPredication(I->getParent())) 1561 return false; 1562 if (!blockNeedsPredicationForAnyReason(I->getParent())) 1563 return false; 1564 // Loads and stores that need some form of masked operation are predicated 1565 // instructions. 1566 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1567 return Legal->isMaskRequired(I); 1568 return isScalarWithPredication(I, VF); 1569 } 1570 1571 /// Returns true if \p I is a memory instruction with consecutive memory 1572 /// access that can be widened. 1573 bool 1574 memoryInstructionCanBeWidened(Instruction *I, 1575 ElementCount VF = ElementCount::getFixed(1)); 1576 1577 /// Returns true if \p I is a memory instruction in an interleaved-group 1578 /// of memory accesses that can be vectorized with wide vector loads/stores 1579 /// and shuffles. 1580 bool 1581 interleavedAccessCanBeWidened(Instruction *I, 1582 ElementCount VF = ElementCount::getFixed(1)); 1583 1584 /// Check if \p Instr belongs to any interleaved access group. 1585 bool isAccessInterleaved(Instruction *Instr) { 1586 return InterleaveInfo.isInterleaved(Instr); 1587 } 1588 1589 /// Get the interleaved access group that \p Instr belongs to. 1590 const InterleaveGroup<Instruction> * 1591 getInterleavedAccessGroup(Instruction *Instr) { 1592 return InterleaveInfo.getInterleaveGroup(Instr); 1593 } 1594 1595 /// Returns true if we're required to use a scalar epilogue for at least 1596 /// the final iteration of the original loop. 
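  ///
  /// This is the case, for example (illustrative), when the loop may exit
  /// from a block other than the latch, or when an interleave group contains
  /// a gap that the widened loads/stores must not touch; the final
  /// iteration(s) are then peeled off into a scalar epilogue.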
1597 bool requiresScalarEpilogue(ElementCount VF) const { 1598 if (!isScalarEpilogueAllowed()) 1599 return false; 1600 // If we might exit from anywhere but the latch, must run the exiting 1601 // iteration in scalar form. 1602 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1603 return true; 1604 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); 1605 } 1606 1607 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1608 /// loop hint annotation. 1609 bool isScalarEpilogueAllowed() const { 1610 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1611 } 1612 1613 /// Returns true if all loop blocks should be masked to fold tail loop. 1614 bool foldTailByMasking() const { return FoldTailByMasking; } 1615 1616 /// Returns true if the instructions in this block requires predication 1617 /// for any reason, e.g. because tail folding now requires a predicate 1618 /// or because the block in the original loop was predicated. 1619 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1620 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1621 } 1622 1623 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1624 /// nodes to the chain of instructions representing the reductions. Uses a 1625 /// MapVector to ensure deterministic iteration order. 1626 using ReductionChainMap = 1627 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1628 1629 /// Return the chain of instructions representing an inloop reduction. 1630 const ReductionChainMap &getInLoopReductionChains() const { 1631 return InLoopReductionChains; 1632 } 1633 1634 /// Returns true if the Phi is part of an inloop reduction. 1635 bool isInLoopReduction(PHINode *Phi) const { 1636 return InLoopReductionChains.count(Phi); 1637 } 1638 1639 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1640 /// with factor VF. Return the cost of the instruction, including 1641 /// scalarization overhead if it's needed. 1642 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1643 1644 /// Estimate cost of a call instruction CI if it were vectorized with factor 1645 /// VF. Return the cost of the instruction, including scalarization overhead 1646 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1647 /// scalarized - 1648 /// i.e. either vector version isn't available, or is too expensive. 1649 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1650 bool &NeedToScalarize) const; 1651 1652 /// Returns true if the per-lane cost of VectorizationFactor A is lower than 1653 /// that of B. 1654 bool isMoreProfitable(const VectorizationFactor &A, 1655 const VectorizationFactor &B) const; 1656 1657 /// Invalidates decisions already taken by the cost model. 1658 void invalidateCostModelingDecisions() { 1659 WideningDecisions.clear(); 1660 Uniforms.clear(); 1661 Scalars.clear(); 1662 } 1663 1664 private: 1665 unsigned NumPredStores = 0; 1666 1667 /// Convenience function that returns the value of vscale_range iff 1668 /// vscale_range.min == vscale_range.max or otherwise returns the value 1669 /// returned by the corresponding TLI method. 1670 Optional<unsigned> getVScaleForTuning() const; 1671 1672 /// \return An upper bound for the vectorization factors for both 1673 /// fixed and scalable vectorization, where the minimum-known number of 1674 /// elements is a power-of-2 larger than zero. 
  /// If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
                                           ElementCount UserVF,
                                           bool FoldTailByMasking);

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
  /// D98509). The issue is currently under investigation and this workaround
  /// will be removed as soon as possible.
  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       const ElementCount &MaxSafeVF,
                                       bool FoldTailByMasking);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width. If \p Invalid is not nullptr, this function
  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
  /// each instruction that has an Invalid cost for the given VF.
  using InstructionVFPair = std::pair<Instruction *, ElementCount>;
  VectorizationCostTy
  expectedCost(ElementCount VF,
               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  Optional<InstructionCost>
  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for a scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for an interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for a Gather/Scatter instruction.
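  /// Illustrative example: a strided load of A[2 * i] is not consecutive, so
  /// when its widening decision is CM_GatherScatter the cost is queried from
  /// TTI as a masked gather over VF per-lane addresses rather than as a
  /// single wide load.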
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element).
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not divisible by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
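  /// For example (illustrative), the address computation feeding a memory
  /// access that the cost model decides to scalarize must itself stay scalar
  /// for each lane. The data is collected per VF.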
1809 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1810 1811 /// PHINodes of the reductions that should be expanded in-loop along with 1812 /// their associated chains of reduction operations, in program order from top 1813 /// (PHI) to bottom 1814 ReductionChainMap InLoopReductionChains; 1815 1816 /// A Map of inloop reduction operations and their immediate chain operand. 1817 /// FIXME: This can be removed once reductions can be costed correctly in 1818 /// vplan. This was added to allow quick lookup to the inloop operations, 1819 /// without having to loop through InLoopReductionChains. 1820 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1821 1822 /// Returns the expected difference in cost from scalarizing the expression 1823 /// feeding a predicated instruction \p PredInst. The instructions to 1824 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1825 /// non-negative return value implies the expression will be scalarized. 1826 /// Currently, only single-use chains are considered for scalarization. 1827 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1828 ElementCount VF); 1829 1830 /// Collect the instructions that are uniform after vectorization. An 1831 /// instruction is uniform if we represent it with a single scalar value in 1832 /// the vectorized loop corresponding to each vector iteration. Examples of 1833 /// uniform instructions include pointer operands of consecutive or 1834 /// interleaved memory accesses. Note that although uniformity implies an 1835 /// instruction will be scalar, the reverse is not true. In general, a 1836 /// scalarized instruction will be represented by VF scalar values in the 1837 /// vectorized loop, each corresponding to an iteration of the original 1838 /// scalar loop. 1839 void collectLoopUniforms(ElementCount VF); 1840 1841 /// Collect the instructions that are scalar after vectorization. An 1842 /// instruction is scalar if it is known to be uniform or will be scalarized 1843 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1844 /// to the list if they are used by a load/store instruction that is marked as 1845 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1846 /// VF values in the vectorized loop, each corresponding to an iteration of 1847 /// the original scalar loop. 1848 void collectLoopScalars(ElementCount VF); 1849 1850 /// Keeps cost model vectorization decision and cost for instructions. 1851 /// Right now it is used for memory instructions only. 1852 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1853 std::pair<InstWidening, InstructionCost>>; 1854 1855 DecisionList WideningDecisions; 1856 1857 /// Returns true if \p V is expected to be vectorized and it needs to be 1858 /// extracted. 1859 bool needsExtract(Value *V, ElementCount VF) const { 1860 Instruction *I = dyn_cast<Instruction>(V); 1861 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1862 TheLoop->isLoopInvariant(I)) 1863 return false; 1864 1865 // Assume we can vectorize V (and hence we need extraction) if the 1866 // scalars are not computed yet. This can happen, because it is called 1867 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1868 // the scalars are collected. That should be a safe assumption in most 1869 // cases, because we check if the operands have vectorizable types 1870 // beforehand in LoopVectorizationLegality. 
1871 return Scalars.find(VF) == Scalars.end() || 1872 !isScalarAfterVectorization(I, VF); 1873 }; 1874 1875 /// Returns a range containing only operands needing to be extracted. 1876 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1877 ElementCount VF) const { 1878 return SmallVector<Value *, 4>(make_filter_range( 1879 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1880 } 1881 1882 /// Determines if we have the infrastructure to vectorize loop \p L and its 1883 /// epilogue, assuming the main loop is vectorized by \p VF. 1884 bool isCandidateForEpilogueVectorization(const Loop &L, 1885 const ElementCount VF) const; 1886 1887 /// Returns true if epilogue vectorization is considered profitable, and 1888 /// false otherwise. 1889 /// \p VF is the vectorization factor chosen for the original loop. 1890 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1891 1892 public: 1893 /// The loop that we evaluate. 1894 Loop *TheLoop; 1895 1896 /// Predicated scalar evolution analysis. 1897 PredicatedScalarEvolution &PSE; 1898 1899 /// Loop Info analysis. 1900 LoopInfo *LI; 1901 1902 /// Vectorization legality. 1903 LoopVectorizationLegality *Legal; 1904 1905 /// Vector target information. 1906 const TargetTransformInfo &TTI; 1907 1908 /// Target Library Info. 1909 const TargetLibraryInfo *TLI; 1910 1911 /// Demanded bits analysis. 1912 DemandedBits *DB; 1913 1914 /// Assumption cache. 1915 AssumptionCache *AC; 1916 1917 /// Interface to emit optimization remarks. 1918 OptimizationRemarkEmitter *ORE; 1919 1920 const Function *TheFunction; 1921 1922 /// Loop Vectorize Hint. 1923 const LoopVectorizeHints *Hints; 1924 1925 /// The interleave access information contains groups of interleaved accesses 1926 /// with the same stride and close to each other. 1927 InterleavedAccessInfo &InterleaveInfo; 1928 1929 /// Values to ignore in the cost model. 1930 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1931 1932 /// Values to ignore in the cost model when VF > 1. 1933 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1934 1935 /// All element types found in the loop. 1936 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1937 1938 /// Profitable vector factors. 1939 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1940 }; 1941 } // end namespace llvm 1942 1943 /// Helper struct to manage generating runtime checks for vectorization. 1944 /// 1945 /// The runtime checks are created up-front in temporary blocks to allow better 1946 /// estimating the cost and un-linked from the existing IR. After deciding to 1947 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1948 /// temporary blocks are completely removed. 1949 class GeneratedRTChecks { 1950 /// Basic block which contains the generated SCEV checks, if any. 1951 BasicBlock *SCEVCheckBlock = nullptr; 1952 1953 /// The value representing the result of the generated SCEV checks. If it is 1954 /// nullptr, either no SCEV checks have been generated or they have been used. 1955 Value *SCEVCheckCond = nullptr; 1956 1957 /// Basic block which contains the generated memory runtime checks, if any. 1958 BasicBlock *MemCheckBlock = nullptr; 1959 1960 /// The value representing the result of the generated memory runtime checks. 1961 /// If it is nullptr, either no memory runtime checks have been generated or 1962 /// they have been used. 
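  /// When this condition evaluates to true at runtime, a (possible) memory
  /// conflict was detected and execution branches to the bypass (scalar)
  /// loop instead of the vector preheader; see emitMemRuntimeChecks below.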
  Value *MemRuntimeCheckCond = nullptr;

  DominatorTree *DT;
  LoopInfo *LI;

  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

public:
  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                    const DataLayout &DL)
      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check") {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation.
  /// If there is no vector code generation, the check blocks are removed
  /// completely.
  void Create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVPredicate &Pred) {

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!Pred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
                                  nullptr, "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          &Pred, SCEVCheckBlock->getTerminator());
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
                                 "vector.memcheck");

      MemRuntimeCheckCond =
          addRuntimeChecks(MemCheckBlock->getTerminator(), L,
                           RtPtrChecking.getChecks(), MemCheckExp);
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary blocks with the checks, update various places
    // accordingly.
    if (SCEVCheckBlock)
      SCEVCheckBlock->replaceAllUsesWith(Preheader);
    if (MemCheckBlock)
      MemCheckBlock->replaceAllUsesWith(Preheader);

    if (SCEVCheckBlock) {
      SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }
    if (MemCheckBlock) {
      MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), MemCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }

    DT->changeImmediateDominator(LoopHeader, Preheader);
    if (MemCheckBlock) {
      DT->eraseNode(MemCheckBlock);
      LI->removeBlock(MemCheckBlock);
    }
    if (SCEVCheckBlock) {
      DT->eraseNode(SCEVCheckBlock);
      LI->removeBlock(SCEVCheckBlock);
    }
  }

  /// Remove the created SCEV & memory runtime check blocks & instructions, if
  /// unused.
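  ///
  /// That is, if SCEVCheckCond or MemRuntimeCheckCond are still set at this
  /// point, the corresponding emitSCEVChecks/emitMemRuntimeChecks was never
  /// called (vectorization was abandoned), so the temporary blocks and the
  /// instructions expanded into them are deleted again.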
2048 ~GeneratedRTChecks() { 2049 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2050 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2051 if (!SCEVCheckCond) 2052 SCEVCleaner.markResultUsed(); 2053 2054 if (!MemRuntimeCheckCond) 2055 MemCheckCleaner.markResultUsed(); 2056 2057 if (MemRuntimeCheckCond) { 2058 auto &SE = *MemCheckExp.getSE(); 2059 // Memory runtime check generation creates compares that use expanded 2060 // values. Remove them before running the SCEVExpanderCleaners. 2061 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2062 if (MemCheckExp.isInsertedInstruction(&I)) 2063 continue; 2064 SE.forgetValue(&I); 2065 I.eraseFromParent(); 2066 } 2067 } 2068 MemCheckCleaner.cleanup(); 2069 SCEVCleaner.cleanup(); 2070 2071 if (SCEVCheckCond) 2072 SCEVCheckBlock->eraseFromParent(); 2073 if (MemRuntimeCheckCond) 2074 MemCheckBlock->eraseFromParent(); 2075 } 2076 2077 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2078 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2079 /// depending on the generated condition. 2080 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2081 BasicBlock *LoopVectorPreHeader, 2082 BasicBlock *LoopExitBlock) { 2083 if (!SCEVCheckCond) 2084 return nullptr; 2085 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2086 if (C->isZero()) 2087 return nullptr; 2088 2089 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2090 2091 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2092 // Create new preheader for vector loop. 2093 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2094 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2095 2096 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2097 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2098 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2099 SCEVCheckBlock); 2100 2101 DT->addNewBlock(SCEVCheckBlock, Pred); 2102 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2103 2104 ReplaceInstWithInst( 2105 SCEVCheckBlock->getTerminator(), 2106 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2107 // Mark the check as used, to prevent it from being removed during cleanup. 2108 SCEVCheckCond = nullptr; 2109 return SCEVCheckBlock; 2110 } 2111 2112 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2113 /// the branches to branch to the vector preheader or \p Bypass, depending on 2114 /// the generated condition. 2115 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2116 BasicBlock *LoopVectorPreHeader) { 2117 // Check if we generated code that checks in runtime if arrays overlap. 2118 if (!MemRuntimeCheckCond) 2119 return nullptr; 2120 2121 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2122 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2123 MemCheckBlock); 2124 2125 DT->addNewBlock(MemCheckBlock, Pred); 2126 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2127 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2128 2129 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2130 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2131 2132 ReplaceInstWithInst( 2133 MemCheckBlock->getTerminator(), 2134 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2135 MemCheckBlock->getTerminator()->setDebugLoc( 2136 Pred->getTerminator()->getDebugLoc()); 2137 2138 // Mark the check as used, to prevent it from being removed during cleanup. 
2139 MemRuntimeCheckCond = nullptr; 2140 return MemCheckBlock; 2141 } 2142 }; 2143 2144 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2145 // vectorization. The loop needs to be annotated with #pragma omp simd 2146 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2147 // vector length information is not provided, vectorization is not considered 2148 // explicit. Interleave hints are not allowed either. These limitations will be 2149 // relaxed in the future. 2150 // Please, note that we are currently forced to abuse the pragma 'clang 2151 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2152 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2153 // provides *explicit vectorization hints* (LV can bypass legal checks and 2154 // assume that vectorization is legal). However, both hints are implemented 2155 // using the same metadata (llvm.loop.vectorize, processed by 2156 // LoopVectorizeHints). This will be fixed in the future when the native IR 2157 // representation for pragma 'omp simd' is introduced. 2158 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2159 OptimizationRemarkEmitter *ORE) { 2160 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2161 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2162 2163 // Only outer loops with an explicit vectorization hint are supported. 2164 // Unannotated outer loops are ignored. 2165 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2166 return false; 2167 2168 Function *Fn = OuterLp->getHeader()->getParent(); 2169 if (!Hints.allowVectorization(Fn, OuterLp, 2170 true /*VectorizeOnlyWhenForced*/)) { 2171 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2172 return false; 2173 } 2174 2175 if (Hints.getInterleave() > 1) { 2176 // TODO: Interleave support is future work. 2177 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2178 "outer loops.\n"); 2179 Hints.emitRemarkWithHints(); 2180 return false; 2181 } 2182 2183 return true; 2184 } 2185 2186 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2187 OptimizationRemarkEmitter *ORE, 2188 SmallVectorImpl<Loop *> &V) { 2189 // Collect inner loops and outer loops without irreducible control flow. For 2190 // now, only collect outer loops that have explicit vectorization hints. If we 2191 // are stress testing the VPlan H-CFG construction, we collect the outermost 2192 // loop of every loop nest. 2193 if (L.isInnermost() || VPlanBuildStressTest || 2194 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2195 LoopBlocksRPO RPOT(&L); 2196 RPOT.perform(LI); 2197 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2198 V.push_back(&L); 2199 // TODO: Collect inner loops inside marked outer loops in case 2200 // vectorization fails for the outer loop. Do not invoke 2201 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2202 // already known to be reducible. We can use an inherited attribute for 2203 // that. 2204 return; 2205 } 2206 } 2207 for (Loop *InnerL : L) 2208 collectSupportedLoops(*InnerL, LI, ORE, V); 2209 } 2210 2211 namespace { 2212 2213 /// The LoopVectorize Pass. 
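///
/// Legacy pass-manager wrapper: runOnFunction collects the analyses the
/// vectorizer needs and forwards them to the new-pass-manager implementation
/// held in Impl (a LoopVectorizePass), returning whether any change was made.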
2214 struct LoopVectorize : public FunctionPass { 2215 /// Pass identification, replacement for typeid 2216 static char ID; 2217 2218 LoopVectorizePass Impl; 2219 2220 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2221 bool VectorizeOnlyWhenForced = false) 2222 : FunctionPass(ID), 2223 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2224 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2225 } 2226 2227 bool runOnFunction(Function &F) override { 2228 if (skipFunction(F)) 2229 return false; 2230 2231 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2232 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2233 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2234 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2235 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2236 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2237 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2238 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2239 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2240 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2241 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2242 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2243 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2244 2245 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2246 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2247 2248 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2249 GetLAA, *ORE, PSI).MadeAnyChange; 2250 } 2251 2252 void getAnalysisUsage(AnalysisUsage &AU) const override { 2253 AU.addRequired<AssumptionCacheTracker>(); 2254 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2255 AU.addRequired<DominatorTreeWrapperPass>(); 2256 AU.addRequired<LoopInfoWrapperPass>(); 2257 AU.addRequired<ScalarEvolutionWrapperPass>(); 2258 AU.addRequired<TargetTransformInfoWrapperPass>(); 2259 AU.addRequired<AAResultsWrapperPass>(); 2260 AU.addRequired<LoopAccessLegacyAnalysis>(); 2261 AU.addRequired<DemandedBitsWrapperPass>(); 2262 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2263 AU.addRequired<InjectTLIMappingsLegacy>(); 2264 2265 // We currently do not preserve loopinfo/dominator analyses with outer loop 2266 // vectorization. Until this is addressed, mark these analyses as preserved 2267 // only for non-VPlan-native path. 2268 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2269 if (!EnableVPlanNativePath) { 2270 AU.addPreserved<LoopInfoWrapperPass>(); 2271 AU.addPreserved<DominatorTreeWrapperPass>(); 2272 } 2273 2274 AU.addPreserved<BasicAAWrapperPass>(); 2275 AU.addPreserved<GlobalsAAWrapperPass>(); 2276 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2277 } 2278 }; 2279 2280 } // end anonymous namespace 2281 2282 //===----------------------------------------------------------------------===// 2283 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2284 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2285 //===----------------------------------------------------------------------===// 2286 2287 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2288 // We need to place the broadcast of invariant variables outside the loop, 2289 // but only if it's proven safe to do so. Else, broadcast will be inside 2290 // vector loop body. 
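  //
  // For a fixed VF of, e.g., <4 x i32>, the splat created below typically
  // expands to IR along these lines (illustrative only):
  //   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %v, i64 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                                    <4 x i32> poison, <4 x i32> zeroinitializer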
2291 Instruction *Instr = dyn_cast<Instruction>(V); 2292 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2293 (!Instr || 2294 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2295 // Place the code for broadcasting invariant variables in the new preheader. 2296 IRBuilder<>::InsertPointGuard Guard(Builder); 2297 if (SafeToHoist) 2298 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2299 2300 // Broadcast the scalar into all locations in the vector. 2301 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2302 2303 return Shuf; 2304 } 2305 2306 /// This function adds 2307 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2308 /// to each vector element of Val. The sequence starts at StartIndex. 2309 /// \p Opcode is relevant for FP induction variable. 2310 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2311 Instruction::BinaryOps BinOp, ElementCount VF, 2312 IRBuilderBase &Builder) { 2313 assert(VF.isVector() && "only vector VFs are supported"); 2314 2315 // Create and check the types. 2316 auto *ValVTy = cast<VectorType>(Val->getType()); 2317 ElementCount VLen = ValVTy->getElementCount(); 2318 2319 Type *STy = Val->getType()->getScalarType(); 2320 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2321 "Induction Step must be an integer or FP"); 2322 assert(Step->getType() == STy && "Step has wrong type"); 2323 2324 SmallVector<Constant *, 8> Indices; 2325 2326 // Create a vector of consecutive numbers from zero to VF. 2327 VectorType *InitVecValVTy = ValVTy; 2328 if (STy->isFloatingPointTy()) { 2329 Type *InitVecValSTy = 2330 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2331 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2332 } 2333 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2334 2335 // Splat the StartIdx 2336 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2337 2338 if (STy->isIntegerTy()) { 2339 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2340 Step = Builder.CreateVectorSplat(VLen, Step); 2341 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2342 // FIXME: The newly created binary instructions should contain nsw/nuw 2343 // flags, which can be found from the original scalar operations. 2344 Step = Builder.CreateMul(InitVec, Step); 2345 return Builder.CreateAdd(Val, Step, "induction"); 2346 } 2347 2348 // Floating point induction. 2349 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2350 "Binary Opcode should be specified for FP induction"); 2351 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2352 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2353 2354 Step = Builder.CreateVectorSplat(VLen, Step); 2355 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2356 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2357 } 2358 2359 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 2360 /// variable on which to base the steps, \p Step is the size of the step. 2361 static void buildScalarSteps(Value *ScalarIV, Value *Step, 2362 const InductionDescriptor &ID, VPValue *Def, 2363 VPTransformState &State) { 2364 IRBuilderBase &Builder = State.Builder; 2365 // We shouldn't have to build scalar steps if we aren't vectorizing. 2366 assert(State.VF.isVector() && "VF should be greater than one"); 2367 // Get the value type and ensure it and the step have the same integer type. 
2368 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2369 assert(ScalarIVTy == Step->getType() && 2370 "Val and Step should have the same type"); 2371 2372 // We build scalar steps for both integer and floating-point induction 2373 // variables. Here, we determine the kind of arithmetic we will perform. 2374 Instruction::BinaryOps AddOp; 2375 Instruction::BinaryOps MulOp; 2376 if (ScalarIVTy->isIntegerTy()) { 2377 AddOp = Instruction::Add; 2378 MulOp = Instruction::Mul; 2379 } else { 2380 AddOp = ID.getInductionOpcode(); 2381 MulOp = Instruction::FMul; 2382 } 2383 2384 // Determine the number of scalars we need to generate for each unroll 2385 // iteration. 2386 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); 2387 unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 2388 // Compute the scalar steps and save the results in State. 2389 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2390 ScalarIVTy->getScalarSizeInBits()); 2391 Type *VecIVTy = nullptr; 2392 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2393 if (!FirstLaneOnly && State.VF.isScalable()) { 2394 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2395 UnitStepVec = 2396 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2397 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2398 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2399 } 2400 2401 for (unsigned Part = 0; Part < State.UF; ++Part) { 2402 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2403 2404 if (!FirstLaneOnly && State.VF.isScalable()) { 2405 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2406 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2407 if (ScalarIVTy->isFloatingPointTy()) 2408 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2409 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2410 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2411 State.set(Def, Add, Part); 2412 // It's useful to record the lane values too for the known minimum number 2413 // of elements so we do those below. This improves the code quality when 2414 // trying to extract the first element, for example. 2415 } 2416 2417 if (ScalarIVTy->isFloatingPointTy()) 2418 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2419 2420 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2421 Value *StartIdx = Builder.CreateBinOp( 2422 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2423 // The step returned by `createStepForVF` is a runtime-evaluated value 2424 // when VF is scalable. Otherwise, it should be folded into a Constant. 2425 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2426 "Expected StartIdx to be folded to a constant when VF is not " 2427 "scalable"); 2428 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2429 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2430 State.set(Def, Add, VPIteration(Part, Lane)); 2431 } 2432 } 2433 } 2434 2435 // Generate code for the induction step. 
Note that induction steps are 2436 // required to be loop-invariant 2437 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, 2438 Instruction *InsertBefore, 2439 Loop *OrigLoop = nullptr) { 2440 const DataLayout &DL = SE.getDataLayout(); 2441 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && 2442 "Induction step should be loop invariant"); 2443 if (auto *E = dyn_cast<SCEVUnknown>(Step)) 2444 return E->getValue(); 2445 2446 SCEVExpander Exp(SE, DL, "induction"); 2447 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); 2448 } 2449 2450 /// Compute the transformed value of Index at offset StartValue using step 2451 /// StepValue. 2452 /// For integer induction, returns StartValue + Index * StepValue. 2453 /// For pointer induction, returns StartValue[Index * StepValue]. 2454 /// FIXME: The newly created binary instructions should contain nsw/nuw 2455 /// flags, which can be found from the original scalar operations. 2456 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, 2457 Value *StartValue, Value *Step, 2458 const InductionDescriptor &ID) { 2459 assert(Index->getType()->getScalarType() == Step->getType() && 2460 "Index scalar type does not match StepValue type"); 2461 2462 // Note: the IR at this point is broken. We cannot use SE to create any new 2463 // SCEV and then expand it, hoping that SCEV's simplification will give us 2464 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2465 // lead to various SCEV crashes. So all we can do is to use builder and rely 2466 // on InstCombine for future simplifications. Here we handle some trivial 2467 // cases only. 2468 auto CreateAdd = [&B](Value *X, Value *Y) { 2469 assert(X->getType() == Y->getType() && "Types don't match!"); 2470 if (auto *CX = dyn_cast<ConstantInt>(X)) 2471 if (CX->isZero()) 2472 return Y; 2473 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2474 if (CY->isZero()) 2475 return X; 2476 return B.CreateAdd(X, Y); 2477 }; 2478 2479 // We allow X to be a vector type, in which case Y will potentially be 2480 // splatted into a vector with the same element count. 
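  // For example (illustrative): with X = <4 x i64> <0, 1, 2, 3> and a scalar
  // step Y = i64 8, Y is first splatted to <8, 8, 8, 8> and the multiply
  // yields <0, 8, 16, 24>.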
2481 auto CreateMul = [&B](Value *X, Value *Y) { 2482 assert(X->getType()->getScalarType() == Y->getType() && 2483 "Types don't match!"); 2484 if (auto *CX = dyn_cast<ConstantInt>(X)) 2485 if (CX->isOne()) 2486 return Y; 2487 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2488 if (CY->isOne()) 2489 return X; 2490 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2491 if (XVTy && !isa<VectorType>(Y->getType())) 2492 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2493 return B.CreateMul(X, Y); 2494 }; 2495 2496 switch (ID.getKind()) { 2497 case InductionDescriptor::IK_IntInduction: { 2498 assert(!isa<VectorType>(Index->getType()) && 2499 "Vector indices not supported for integer inductions yet"); 2500 assert(Index->getType() == StartValue->getType() && 2501 "Index type does not match StartValue type"); 2502 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2503 return B.CreateSub(StartValue, Index); 2504 auto *Offset = CreateMul(Index, Step); 2505 return CreateAdd(StartValue, Offset); 2506 } 2507 case InductionDescriptor::IK_PtrInduction: { 2508 assert(isa<Constant>(Step) && 2509 "Expected constant step for pointer induction"); 2510 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); 2511 } 2512 case InductionDescriptor::IK_FpInduction: { 2513 assert(!isa<VectorType>(Index->getType()) && 2514 "Vector indices not supported for FP inductions yet"); 2515 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2516 auto InductionBinOp = ID.getInductionBinOp(); 2517 assert(InductionBinOp && 2518 (InductionBinOp->getOpcode() == Instruction::FAdd || 2519 InductionBinOp->getOpcode() == Instruction::FSub) && 2520 "Original bin op should be defined for FP induction"); 2521 2522 Value *MulExp = B.CreateFMul(Step, Index); 2523 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2524 "induction"); 2525 } 2526 case InductionDescriptor::IK_NoInduction: 2527 return nullptr; 2528 } 2529 llvm_unreachable("invalid enum"); 2530 } 2531 2532 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2533 const VPIteration &Instance, 2534 VPTransformState &State) { 2535 Value *ScalarInst = State.get(Def, Instance); 2536 Value *VectorValue = State.get(Def, Instance.Part); 2537 VectorValue = Builder.CreateInsertElement( 2538 VectorValue, ScalarInst, 2539 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2540 State.set(Def, VectorValue, Instance.Part); 2541 } 2542 2543 // Return whether we allow using masked interleave-groups (for dealing with 2544 // strided loads/stores that reside in predicated blocks, or for dealing 2545 // with gaps). 2546 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2547 // If an override option has been passed in for interleaved accesses, use it. 2548 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2549 return EnableMaskedInterleavedMemAccesses; 2550 2551 return TTI.enableMaskedInterleavedAccessVectorization(); 2552 } 2553 2554 // Try to vectorize the interleave group that \p Instr belongs to. 2555 // 2556 // E.g. Translate following interleaved load group (factor = 3): 2557 // for (i = 0; i < N; i+=3) { 2558 // R = Pic[i]; // Member of index 0 2559 // G = Pic[i+1]; // Member of index 1 2560 // B = Pic[i+2]; // Member of index 2 2561 // ... 
// do something to R, G, B 2562 // } 2563 // To: 2564 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2565 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2566 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2567 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2568 // 2569 // Or translate following interleaved store group (factor = 3): 2570 // for (i = 0; i < N; i+=3) { 2571 // ... do something to R, G, B 2572 // Pic[i] = R; // Member of index 0 2573 // Pic[i+1] = G; // Member of index 1 2574 // Pic[i+2] = B; // Member of index 2 2575 // } 2576 // To: 2577 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2578 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2579 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2580 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2581 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2582 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2583 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2584 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2585 VPValue *BlockInMask) { 2586 Instruction *Instr = Group->getInsertPos(); 2587 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2588 2589 // Prepare for the vector type of the interleaved load/store. 2590 Type *ScalarTy = getLoadStoreType(Instr); 2591 unsigned InterleaveFactor = Group->getFactor(); 2592 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2593 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2594 2595 // Prepare for the new pointers. 2596 SmallVector<Value *, 2> AddrParts; 2597 unsigned Index = Group->getIndex(Instr); 2598 2599 // TODO: extend the masked interleaved-group support to reversed access. 2600 assert((!BlockInMask || !Group->isReverse()) && 2601 "Reversed masked interleave-group not supported."); 2602 2603 // If the group is reverse, adjust the index to refer to the last vector lane 2604 // instead of the first. We adjust the index from the first vector lane, 2605 // rather than directly getting the pointer for lane VF - 1, because the 2606 // pointer operand of the interleaved access is supposed to be uniform. For 2607 // uniform instructions, we're only required to generate a value for the 2608 // first vector lane in each unroll iteration. 2609 if (Group->isReverse()) 2610 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2611 2612 for (unsigned Part = 0; Part < UF; Part++) { 2613 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2614 setDebugLocFromInst(AddrPart); 2615 2616 // Notice current instruction could be any index. Need to adjust the address 2617 // to the member of index 0. 2618 // 2619 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2620 // b = A[i]; // Member of index 0 2621 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2622 // 2623 // E.g. A[i+1] = a; // Member of index 1 2624 // A[i] = b; // Member of index 0 2625 // A[i+2] = c; // Member of index 2 (Current instruction) 2626 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2627 2628 bool InBounds = false; 2629 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2630 InBounds = gep->isInBounds(); 2631 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2632 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2633 2634 // Cast to the vector pointer type. 
2635 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2636 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2637 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2638 } 2639 2640 setDebugLocFromInst(Instr); 2641 Value *PoisonVec = PoisonValue::get(VecTy); 2642 2643 Value *MaskForGaps = nullptr; 2644 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2645 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2646 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2647 } 2648 2649 // Vectorize the interleaved load group. 2650 if (isa<LoadInst>(Instr)) { 2651 // For each unroll part, create a wide load for the group. 2652 SmallVector<Value *, 2> NewLoads; 2653 for (unsigned Part = 0; Part < UF; Part++) { 2654 Instruction *NewLoad; 2655 if (BlockInMask || MaskForGaps) { 2656 assert(useMaskedInterleavedAccesses(*TTI) && 2657 "masked interleaved groups are not allowed."); 2658 Value *GroupMask = MaskForGaps; 2659 if (BlockInMask) { 2660 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2661 Value *ShuffledMask = Builder.CreateShuffleVector( 2662 BlockInMaskPart, 2663 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2664 "interleaved.mask"); 2665 GroupMask = MaskForGaps 2666 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2667 MaskForGaps) 2668 : ShuffledMask; 2669 } 2670 NewLoad = 2671 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2672 GroupMask, PoisonVec, "wide.masked.vec"); 2673 } 2674 else 2675 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2676 Group->getAlign(), "wide.vec"); 2677 Group->addMetadata(NewLoad); 2678 NewLoads.push_back(NewLoad); 2679 } 2680 2681 // For each member in the group, shuffle out the appropriate data from the 2682 // wide loads. 2683 unsigned J = 0; 2684 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2685 Instruction *Member = Group->getMember(I); 2686 2687 // Skip the gaps in the group. 2688 if (!Member) 2689 continue; 2690 2691 auto StrideMask = 2692 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2693 for (unsigned Part = 0; Part < UF; Part++) { 2694 Value *StridedVec = Builder.CreateShuffleVector( 2695 NewLoads[Part], StrideMask, "strided.vec"); 2696 2697 // If this member has different type, cast the result type. 2698 if (Member->getType() != ScalarTy) { 2699 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2700 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2701 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2702 } 2703 2704 if (Group->isReverse()) 2705 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2706 2707 State.set(VPDefs[J], StridedVec, Part); 2708 } 2709 ++J; 2710 } 2711 return; 2712 } 2713 2714 // The sub vector type for current instruction. 2715 auto *SubVT = VectorType::get(ScalarTy, VF); 2716 2717 // Vectorize the interleaved store group. 2718 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2719 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2720 "masked interleaved groups are not allowed."); 2721 assert((!MaskForGaps || !VF.isScalable()) && 2722 "masking gaps for scalable vectors is not yet supported."); 2723 for (unsigned Part = 0; Part < UF; Part++) { 2724 // Collect the stored vector from each member. 
2725 SmallVector<Value *, 4> StoredVecs; 2726 for (unsigned i = 0; i < InterleaveFactor; i++) { 2727 assert((Group->getMember(i) || MaskForGaps) && 2728 "Fail to get a member from an interleaved store group"); 2729 Instruction *Member = Group->getMember(i); 2730 2731 // Skip the gaps in the group. 2732 if (!Member) { 2733 Value *Undef = PoisonValue::get(SubVT); 2734 StoredVecs.push_back(Undef); 2735 continue; 2736 } 2737 2738 Value *StoredVec = State.get(StoredValues[i], Part); 2739 2740 if (Group->isReverse()) 2741 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2742 2743 // If this member has different type, cast it to a unified type. 2744 2745 if (StoredVec->getType() != SubVT) 2746 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2747 2748 StoredVecs.push_back(StoredVec); 2749 } 2750 2751 // Concatenate all vectors into a wide vector. 2752 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2753 2754 // Interleave the elements in the wide vector. 2755 Value *IVec = Builder.CreateShuffleVector( 2756 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2757 "interleaved.vec"); 2758 2759 Instruction *NewStoreInstr; 2760 if (BlockInMask || MaskForGaps) { 2761 Value *GroupMask = MaskForGaps; 2762 if (BlockInMask) { 2763 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2764 Value *ShuffledMask = Builder.CreateShuffleVector( 2765 BlockInMaskPart, 2766 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2767 "interleaved.mask"); 2768 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2769 ShuffledMask, MaskForGaps) 2770 : ShuffledMask; 2771 } 2772 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2773 Group->getAlign(), GroupMask); 2774 } else 2775 NewStoreInstr = 2776 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2777 2778 Group->addMetadata(NewStoreInstr); 2779 } 2780 } 2781 2782 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2783 VPReplicateRecipe *RepRecipe, 2784 const VPIteration &Instance, 2785 bool IfPredicateInstr, 2786 VPTransformState &State) { 2787 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2788 2789 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2790 // the first lane and part. 2791 if (isa<NoAliasScopeDeclInst>(Instr)) 2792 if (!Instance.isFirstIteration()) 2793 return; 2794 2795 setDebugLocFromInst(Instr); 2796 2797 // Does this instruction return a value ? 2798 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2799 2800 Instruction *Cloned = Instr->clone(); 2801 if (!IsVoidRetTy) 2802 Cloned->setName(Instr->getName() + ".cloned"); 2803 2804 // If the scalarized instruction contributes to the address computation of a 2805 // widen masked load/store which was in a basic block that needed predication 2806 // and is not predicated after vectorization, we can't propagate 2807 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2808 // instruction could feed a poison value to the base address of the widen 2809 // load/store. 2810 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 2811 Cloned->dropPoisonGeneratingFlags(); 2812 2813 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 2814 Builder.GetInsertPoint()); 2815 // Replace the operands of the cloned instructions with their scalar 2816 // equivalents in the new loop. 
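  // (Note: operands defined by uniform replicate recipes only have a value
  //  generated for the first lane, so the loop below reads lane 0 for them
  //  regardless of the lane requested by Instance.)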
2817 for (auto &I : enumerate(RepRecipe->operands())) { 2818 auto InputInstance = Instance; 2819 VPValue *Operand = I.value(); 2820 VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand); 2821 if (OperandR && OperandR->isUniform()) 2822 InputInstance.Lane = VPLane::getFirstLane(); 2823 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2824 } 2825 addNewMetadata(Cloned, Instr); 2826 2827 // Place the cloned scalar in the new loop. 2828 Builder.Insert(Cloned); 2829 2830 State.set(RepRecipe, Cloned, Instance); 2831 2832 // If we just cloned a new assumption, add it the assumption cache. 2833 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2834 AC->registerAssumption(II); 2835 2836 // End if-block. 2837 if (IfPredicateInstr) 2838 PredicatedInstructions.push_back(Cloned); 2839 } 2840 2841 void InnerLoopVectorizer::createHeaderBranch(Loop *L) { 2842 BasicBlock *Header = L->getHeader(); 2843 assert(!L->getLoopLatch() && "loop should not have a latch at this point"); 2844 2845 IRBuilder<> B(Header->getTerminator()); 2846 Instruction *OldInst = 2847 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 2848 setDebugLocFromInst(OldInst, &B); 2849 2850 // Connect the header to the exit and header blocks and replace the old 2851 // terminator. 2852 B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header); 2853 2854 // Now we have two terminators. Remove the old one from the block. 2855 Header->getTerminator()->eraseFromParent(); 2856 } 2857 2858 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2859 if (TripCount) 2860 return TripCount; 2861 2862 assert(L && "Create Trip Count for null loop."); 2863 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2864 // Find the loop boundaries. 2865 ScalarEvolution *SE = PSE.getSE(); 2866 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2867 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 2868 "Invalid loop count"); 2869 2870 Type *IdxTy = Legal->getWidestInductionType(); 2871 assert(IdxTy && "No type for induction"); 2872 2873 // The exit count might have the type of i64 while the phi is i32. This can 2874 // happen if we have an induction variable that is sign extended before the 2875 // compare. The only way that we get a backedge taken count is that the 2876 // induction variable was signed and as such will not overflow. In such a case 2877 // truncation is legal. 2878 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2879 IdxTy->getPrimitiveSizeInBits()) 2880 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2881 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2882 2883 // Get the total trip count from the count by adding 1. 2884 const SCEV *ExitCount = SE->getAddExpr( 2885 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2886 2887 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2888 2889 // Expand the trip count and place the new instructions in the preheader. 2890 // Notice that the pre-header does not change, only the loop body. 2891 SCEVExpander Exp(*SE, DL, "induction"); 2892 2893 // Count holds the overall loop count (N). 
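  // (Worked example, for illustration only: for "for (i = 0; i < n; ++i)" the
  //  backedge-taken count is n - 1, so the expanded trip count is
  //  (n - 1) + 1 = n.)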
2894 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2895 L->getLoopPreheader()->getTerminator()); 2896 2897 if (TripCount->getType()->isPointerTy()) 2898 TripCount = 2899 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2900 L->getLoopPreheader()->getTerminator()); 2901 2902 return TripCount; 2903 } 2904 2905 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2906 if (VectorTripCount) 2907 return VectorTripCount; 2908 2909 Value *TC = getOrCreateTripCount(L); 2910 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2911 2912 Type *Ty = TC->getType(); 2913 // This is where we can make the step a runtime constant. 2914 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2915 2916 // If the tail is to be folded by masking, round the number of iterations N 2917 // up to a multiple of Step instead of rounding down. This is done by first 2918 // adding Step-1 and then rounding down. Note that it's ok if this addition 2919 // overflows: the vector induction variable will eventually wrap to zero given 2920 // that it starts at zero and its Step is a power of two; the loop will then 2921 // exit, with the last early-exit vector comparison also producing all-true. 2922 if (Cost->foldTailByMasking()) { 2923 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2924 "VF*UF must be a power of 2 when folding tail by masking"); 2925 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 2926 TC = Builder.CreateAdd( 2927 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 2928 } 2929 2930 // Now we need to generate the expression for the part of the loop that the 2931 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2932 // iterations are not required for correctness, or N - Step, otherwise. Step 2933 // is equal to the vectorization factor (number of SIMD elements) times the 2934 // unroll factor (number of SIMD instructions). 2935 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2936 2937 // There are cases where we *must* run at least one iteration in the remainder 2938 // loop. See the cost model for when this can happen. If the step evenly 2939 // divides the trip count, we set the remainder to be equal to the step. If 2940 // the step does not evenly divide the trip count, no adjustment is necessary 2941 // since there will already be scalar iterations. Note that the minimum 2942 // iterations check ensures that N >= Step. 2943 if (Cost->requiresScalarEpilogue(VF)) { 2944 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2945 R = Builder.CreateSelect(IsZero, Step, R); 2946 } 2947 2948 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2949 2950 return VectorTripCount; 2951 } 2952 2953 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2954 const DataLayout &DL) { 2955 // Verify that V is a vector type with same number of elements as DstVTy. 2956 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 2957 unsigned VF = DstFVTy->getNumElements(); 2958 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 2959 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2960 Type *SrcElemTy = SrcVecTy->getElementType(); 2961 Type *DstElemTy = DstFVTy->getElementType(); 2962 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2963 "Vector elements must have same size"); 2964 2965 // Do a direct cast if element types are castable. 
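  // (E.g. <4 x i32> <-> <4 x float> can be bitcast directly below, whereas a
  //  pointer/floating-point mismatch takes the two-step Ptr <-> Int <-> Float
  //  route that follows. Examples are illustrative only.)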
2966 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2967 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2968 } 2969 // V cannot be directly casted to desired vector type. 2970 // May happen when V is a floating point vector but DstVTy is a vector of 2971 // pointers or vice-versa. Handle this using a two-step bitcast using an 2972 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2973 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2974 "Only one type should be a pointer type"); 2975 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2976 "Only one type should be a floating point type"); 2977 Type *IntTy = 2978 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2979 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2980 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2981 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2982 } 2983 2984 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2985 BasicBlock *Bypass) { 2986 Value *Count = getOrCreateTripCount(L); 2987 // Reuse existing vector loop preheader for TC checks. 2988 // Note that new preheader block is generated for vector loop. 2989 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2990 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2991 2992 // Generate code to check if the loop's trip count is less than VF * UF, or 2993 // equal to it in case a scalar epilogue is required; this implies that the 2994 // vector trip count is zero. This check also covers the case where adding one 2995 // to the backedge-taken count overflowed leading to an incorrect trip count 2996 // of zero. In this case we will also jump to the scalar loop. 2997 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 2998 : ICmpInst::ICMP_ULT; 2999 3000 // If tail is to be folded, vector loop takes care of all iterations. 3001 Value *CheckMinIters = Builder.getFalse(); 3002 if (!Cost->foldTailByMasking()) { 3003 Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); 3004 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3005 } 3006 // Create new preheader for vector loop. 3007 LoopVectorPreHeader = 3008 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3009 "vector.ph"); 3010 3011 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3012 DT->getNode(Bypass)->getIDom()) && 3013 "TC check is expected to dominate Bypass"); 3014 3015 // Update dominator for Bypass & LoopExit (if needed). 3016 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3017 if (!Cost->requiresScalarEpilogue(VF)) 3018 // If there is an epilogue which must run, there's no edge from the 3019 // middle block to exit blocks and thus no need to update the immediate 3020 // dominator of the exit blocks. 
3021 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3022 3023 ReplaceInstWithInst( 3024 TCCheckBlock->getTerminator(), 3025 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3026 LoopBypassBlocks.push_back(TCCheckBlock); 3027 } 3028 3029 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 3030 3031 BasicBlock *const SCEVCheckBlock = 3032 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); 3033 if (!SCEVCheckBlock) 3034 return nullptr; 3035 3036 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3037 (OptForSizeBasedOnProfile && 3038 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3039 "Cannot SCEV check stride or overflow when optimizing for size"); 3040 3041 3042 // Update dominator only if this is first RT check. 3043 if (LoopBypassBlocks.empty()) { 3044 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3045 if (!Cost->requiresScalarEpilogue(VF)) 3046 // If there is an epilogue which must run, there's no edge from the 3047 // middle block to exit blocks and thus no need to update the immediate 3048 // dominator of the exit blocks. 3049 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3050 } 3051 3052 LoopBypassBlocks.push_back(SCEVCheckBlock); 3053 AddedSafetyChecks = true; 3054 return SCEVCheckBlock; 3055 } 3056 3057 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3058 BasicBlock *Bypass) { 3059 // VPlan-native path does not do any analysis for runtime checks currently. 3060 if (EnableVPlanNativePath) 3061 return nullptr; 3062 3063 BasicBlock *const MemCheckBlock = 3064 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3065 3066 // Check if we generated code that checks in runtime if arrays overlap. We put 3067 // the checks into a separate block to make the more common case of few 3068 // elements faster. 3069 if (!MemCheckBlock) 3070 return nullptr; 3071 3072 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3073 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3074 "Cannot emit memory checks when optimizing for size, unless forced " 3075 "to vectorize."); 3076 ORE->emit([&]() { 3077 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3078 L->getStartLoc(), L->getHeader()) 3079 << "Code-size may be reduced by not forcing " 3080 "vectorization, or by source-code modifications " 3081 "eliminating the need for runtime checks " 3082 "(e.g., adding 'restrict')."; 3083 }); 3084 } 3085 3086 LoopBypassBlocks.push_back(MemCheckBlock); 3087 3088 AddedSafetyChecks = true; 3089 3090 // We currently don't use LoopVersioning for the actual loop cloning but we 3091 // still use it to add the noalias metadata. 
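  // (Descriptive note: LVer is only used here to prepare noalias scope
  //  metadata, which is later attached to the widened memory instructions so
  //  that the runtime-checked pointer groups are known not to alias inside the
  //  vector loop.)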
3092   LVer = std::make_unique<LoopVersioning>(
3093       *Legal->getLAI(),
3094       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3095       DT, PSE.getSE());
3096   LVer->prepareNoAliasMetadata();
3097   return MemCheckBlock;
3098 }
3099 
3100 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3101   LoopScalarBody = OrigLoop->getHeader();
3102   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3103   assert(LoopVectorPreHeader && "Invalid loop structure");
3104   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3105   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3106          "multiple exit loop without required epilogue?");
3107 
3108   LoopMiddleBlock =
3109       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3110                  LI, nullptr, Twine(Prefix) + "middle.block");
3111   LoopScalarPreHeader =
3112       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3113                  nullptr, Twine(Prefix) + "scalar.ph");
3114 
3115   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3116 
3117   // Set up the middle block terminator. Two cases:
3118   // 1) If we know that we must execute the scalar epilogue, emit an
3119   //    unconditional branch.
3120   // 2) Otherwise, we must have a single unique exit block (due to how we
3121   //    implement the multiple exit case). In this case, set up a conditional
3122   //    branch from the middle block to the loop scalar preheader, and the
3123   //    exit block. completeLoopSkeleton will update the condition to use an
3124   //    iteration check, if required to decide whether to execute the remainder.
3125   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3126     BranchInst::Create(LoopScalarPreHeader) :
3127     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3128                        Builder.getTrue());
3129   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3130   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3131 
3132   // We intentionally don't let SplitBlock update LoopInfo since
3133   // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3134   // LoopVectorBody is explicitly added to the correct place a few lines later.
3135   LoopVectorBody =
3136       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3137                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3138 
3139   // Update dominator for loop exit.
3140   if (!Cost->requiresScalarEpilogue(VF))
3141     // If there is an epilogue which must run, there's no edge from the
3142     // middle block to exit blocks and thus no need to update the immediate
3143     // dominator of the exit blocks.
3144     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3145 
3146   // Create and register the new vector loop.
3147   Loop *Lp = LI->AllocateLoop();
3148   Loop *ParentLoop = OrigLoop->getParentLoop();
3149 
3150   // Insert the new loop into the loop nest and register the new basic blocks
3151   // before calling any utilities such as SCEV that require valid LoopInfo.
3152 if (ParentLoop) { 3153 ParentLoop->addChildLoop(Lp); 3154 } else { 3155 LI->addTopLevelLoop(Lp); 3156 } 3157 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3158 return Lp; 3159 } 3160 3161 void InnerLoopVectorizer::createInductionResumeValues( 3162 Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) { 3163 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3164 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3165 "Inconsistent information about additional bypass."); 3166 3167 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3168 assert(VectorTripCount && L && "Expected valid arguments"); 3169 // We are going to resume the execution of the scalar loop. 3170 // Go over all of the induction variables that we found and fix the 3171 // PHIs that are left in the scalar version of the loop. 3172 // The starting values of PHI nodes depend on the counter of the last 3173 // iteration in the vectorized loop. 3174 // If we come from a bypass edge then we need to start from the original 3175 // start value. 3176 Instruction *OldInduction = Legal->getPrimaryInduction(); 3177 for (auto &InductionEntry : Legal->getInductionVars()) { 3178 PHINode *OrigPhi = InductionEntry.first; 3179 InductionDescriptor II = InductionEntry.second; 3180 3181 // Create phi nodes to merge from the backedge-taken check block. 3182 PHINode *BCResumeVal = 3183 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3184 LoopScalarPreHeader->getTerminator()); 3185 // Copy original phi DL over to the new one. 3186 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3187 Value *&EndValue = IVEndValues[OrigPhi]; 3188 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3189 if (OrigPhi == OldInduction) { 3190 // We know what the end value is. 3191 EndValue = VectorTripCount; 3192 } else { 3193 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3194 3195 // Fast-math-flags propagate from the original induction instruction. 3196 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3197 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3198 3199 Type *StepType = II.getStep()->getType(); 3200 Instruction::CastOps CastOp = 3201 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3202 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3203 Value *Step = 3204 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3205 EndValue = emitTransformedIndex(B, CRD, II.getStartValue(), Step, II); 3206 EndValue->setName("ind.end"); 3207 3208 // Compute the end value for the additional bypass (if applicable). 3209 if (AdditionalBypass.first) { 3210 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3211 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3212 StepType, true); 3213 Value *Step = 3214 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3215 CRD = 3216 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3217 EndValueFromAdditionalBypass = 3218 emitTransformedIndex(B, CRD, II.getStartValue(), Step, II); 3219 EndValueFromAdditionalBypass->setName("ind.end"); 3220 } 3221 } 3222 // The new PHI merges the original incoming value, in case of a bypass, 3223 // or the value at the end of the vectorized loop. 3224 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3225 3226 // Fix the scalar body counter (PHI node). 3227 // The old induction's phi node in the scalar body needs the truncated 3228 // value. 
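    // (Illustrative example: for "for (i = 0; i < n; ++i)" vectorized with
    //  VF * UF = 8, the resume value is typically n - n % 8 when arriving from
    //  the middle block and the original start value 0 when arriving from a
    //  bypass block.)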
3229 for (BasicBlock *BB : LoopBypassBlocks) 3230 BCResumeVal->addIncoming(II.getStartValue(), BB); 3231 3232 if (AdditionalBypass.first) 3233 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3234 EndValueFromAdditionalBypass); 3235 3236 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3237 } 3238 } 3239 3240 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3241 MDNode *OrigLoopID) { 3242 assert(L && "Expected valid loop."); 3243 3244 // The trip counts should be cached by now. 3245 Value *Count = getOrCreateTripCount(L); 3246 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3247 3248 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3249 3250 // Add a check in the middle block to see if we have completed 3251 // all of the iterations in the first vector loop. Three cases: 3252 // 1) If we require a scalar epilogue, there is no conditional branch as 3253 // we unconditionally branch to the scalar preheader. Do nothing. 3254 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3255 // Thus if tail is to be folded, we know we don't need to run the 3256 // remainder and we can use the previous value for the condition (true). 3257 // 3) Otherwise, construct a runtime check. 3258 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3259 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3260 Count, VectorTripCount, "cmp.n", 3261 LoopMiddleBlock->getTerminator()); 3262 3263 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3264 // of the corresponding compare because they may have ended up with 3265 // different line numbers and we want to avoid awkward line stepping while 3266 // debugging. Eg. if the compare has got a line number inside the loop. 3267 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3268 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3269 } 3270 3271 // Get ready to start creating new instructions into the vectorized body. 3272 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3273 "Inconsistent vector loop preheader"); 3274 3275 #ifdef EXPENSIVE_CHECKS 3276 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3277 LI->verify(*DT); 3278 #endif 3279 3280 return LoopVectorPreHeader; 3281 } 3282 3283 std::pair<BasicBlock *, Value *> 3284 InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3285 /* 3286 In this function we generate a new loop. The new loop will contain 3287 the vectorized instructions while the old loop will continue to run the 3288 scalar remainder. 3289 3290 [ ] <-- loop iteration number check. 3291 / | 3292 / v 3293 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3294 | / | 3295 | / v 3296 || [ ] <-- vector pre header. 3297 |/ | 3298 | v 3299 | [ ] \ 3300 | [ ]_| <-- vector loop. 3301 | | 3302 | v 3303 \ -[ ] <--- middle-block. 3304 \/ | 3305 /\ v 3306 | ->[ ] <--- new preheader. 3307 | | 3308 (opt) v <-- edge from middle to exit iff epilogue is not required. 3309 | [ ] \ 3310 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3311 \ | 3312 \ v 3313 >[ ] <-- exit block(s). 3314 ... 3315 */ 3316 3317 // Get the metadata of the original loop before it gets modified. 3318 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3319 3320 // Workaround! Compute the trip count of the original loop and cache it 3321 // before we start modifying the CFG. 
This code has a systemic problem
3322 // wherein it tries to run analysis over partially constructed IR; this is
3323 // wrong, and not simply for SCEV. The trip count of the original loop
3324 // simply happens to be prone to hitting this in practice. In theory, we
3325 // can hit the same issue for any SCEV, or ValueTracking query done during
3326 // mutation. See PR49900.
3327   getOrCreateTripCount(OrigLoop);
3328 
3329   // Create an empty vector loop, and prepare basic blocks for the runtime
3330   // checks.
3331   Loop *Lp = createVectorLoopSkeleton("");
3332 
3333   // Now, compare the new count to zero. If it is zero, skip the vector loop
3334   // and jump to the scalar loop. This check also covers the case where the
3335   // backedge-taken count is uint##_max: adding one to it will overflow leading
3336   // to an incorrect trip count of zero. In this (rare) case we will also jump
3337   // to the scalar loop.
3338   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3339 
3340   // Generate the code to check any assumptions that we've made for SCEV
3341   // expressions.
3342   emitSCEVChecks(LoopScalarPreHeader);
3343 
3344   // Generate the code that checks in runtime if arrays overlap. We put the
3345   // checks into a separate block to make the more common case of few elements
3346   // faster.
3347   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3348 
3349   createHeaderBranch(Lp);
3350 
3351   // Emit phis for the new starting index of the scalar loop.
3352   createInductionResumeValues(Lp);
3353 
3354   return {completeLoopSkeleton(Lp, OrigLoopID), nullptr};
3355 }
3356 
3357 // Fix up external users of the induction variable. At this point, we are
3358 // in LCSSA form, with all external PHIs that use the IV having one input value,
3359 // coming from the remainder loop. We need those PHIs to also have a correct
3360 // value for the IV when arriving directly from the middle block.
3361 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3362                                        const InductionDescriptor &II,
3363                                        Value *CountRoundDown, Value *EndValue,
3364                                        BasicBlock *MiddleBlock) {
3365   // There are two kinds of external IV usages - those that use the value
3366   // computed in the last iteration (the PHI) and those that use the penultimate
3367   // value (the value that feeds into the phi from the loop latch).
3368   // We allow both, but they, obviously, have different values.
3369 
3370   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3371 
3372   DenseMap<Value *, Value *> MissingVals;
3373 
3374   // An external user of the last iteration's value should see the value that
3375   // the remainder loop uses to initialize its own IV.
3376   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3377   for (User *U : PostInc->users()) {
3378     Instruction *UI = cast<Instruction>(U);
3379     if (!OrigLoop->contains(UI)) {
3380       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3381       MissingVals[UI] = EndValue;
3382     }
3383   }
3384 
3385   // An external user of the penultimate value needs to see EndValue - Step.
3386   // The simplest way to get this is to recompute it from the constituent SCEVs,
3387   // that is Start + (Step * (CRD - 1)).
3388   for (User *U : OrigPhi->users()) {
3389     auto *UI = cast<Instruction>(U);
3390     if (!OrigLoop->contains(UI)) {
3391       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3392 
3393       IRBuilder<> B(MiddleBlock->getTerminator());
3394 
3395       // Fast-math-flags propagate from the original induction instruction.
3396 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3397 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3398 3399 Value *CountMinusOne = B.CreateSub( 3400 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3401 Value *CMO = 3402 !II.getStep()->getType()->isIntegerTy() 3403 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3404 II.getStep()->getType()) 3405 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3406 CMO->setName("cast.cmo"); 3407 3408 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 3409 LoopVectorBody->getTerminator()); 3410 Value *Escape = 3411 emitTransformedIndex(B, CMO, II.getStartValue(), Step, II); 3412 Escape->setName("ind.escape"); 3413 MissingVals[UI] = Escape; 3414 } 3415 } 3416 3417 for (auto &I : MissingVals) { 3418 PHINode *PHI = cast<PHINode>(I.first); 3419 // One corner case we have to handle is two IVs "chasing" each-other, 3420 // that is %IV2 = phi [...], [ %IV1, %latch ] 3421 // In this case, if IV1 has an external use, we need to avoid adding both 3422 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3423 // don't already have an incoming value for the middle block. 3424 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3425 PHI->addIncoming(I.second, MiddleBlock); 3426 } 3427 } 3428 3429 namespace { 3430 3431 struct CSEDenseMapInfo { 3432 static bool canHandle(const Instruction *I) { 3433 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3434 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3435 } 3436 3437 static inline Instruction *getEmptyKey() { 3438 return DenseMapInfo<Instruction *>::getEmptyKey(); 3439 } 3440 3441 static inline Instruction *getTombstoneKey() { 3442 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3443 } 3444 3445 static unsigned getHashValue(const Instruction *I) { 3446 assert(canHandle(I) && "Unknown instruction!"); 3447 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3448 I->value_op_end())); 3449 } 3450 3451 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3452 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3453 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3454 return LHS == RHS; 3455 return LHS->isIdenticalTo(RHS); 3456 } 3457 }; 3458 3459 } // end anonymous namespace 3460 3461 ///Perform cse of induction variable instructions. 3462 static void cse(BasicBlock *BB) { 3463 // Perform simple cse. 3464 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3465 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3466 if (!CSEDenseMapInfo::canHandle(&In)) 3467 continue; 3468 3469 // Check if we can replace this instruction with any of the 3470 // visited instructions. 3471 if (Instruction *V = CSEMap.lookup(&In)) { 3472 In.replaceAllUsesWith(V); 3473 In.eraseFromParent(); 3474 continue; 3475 } 3476 3477 CSEMap[&In] = &In; 3478 } 3479 } 3480 3481 InstructionCost 3482 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3483 bool &NeedToScalarize) const { 3484 Function *F = CI->getCalledFunction(); 3485 Type *ScalarRetTy = CI->getType(); 3486 SmallVector<Type *, 4> Tys, ScalarTys; 3487 for (auto &ArgOp : CI->args()) 3488 ScalarTys.push_back(ArgOp->getType()); 3489 3490 // Estimate cost of scalarized vector call. 
The source operands are assumed 3491 // to be vectors, so we need to extract individual elements from there, 3492 // execute VF scalar calls, and then gather the result into the vector return 3493 // value. 3494 InstructionCost ScalarCallCost = 3495 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3496 if (VF.isScalar()) 3497 return ScalarCallCost; 3498 3499 // Compute corresponding vector type for return value and arguments. 3500 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3501 for (Type *ScalarTy : ScalarTys) 3502 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3503 3504 // Compute costs of unpacking argument values for the scalar calls and 3505 // packing the return values to a vector. 3506 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3507 3508 InstructionCost Cost = 3509 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3510 3511 // If we can't emit a vector call for this function, then the currently found 3512 // cost is the cost we need to return. 3513 NeedToScalarize = true; 3514 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3515 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3516 3517 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3518 return Cost; 3519 3520 // If the corresponding vector cost is cheaper, return its cost. 3521 InstructionCost VectorCallCost = 3522 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3523 if (VectorCallCost < Cost) { 3524 NeedToScalarize = false; 3525 Cost = VectorCallCost; 3526 } 3527 return Cost; 3528 } 3529 3530 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3531 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3532 return Elt; 3533 return VectorType::get(Elt, VF); 3534 } 3535 3536 InstructionCost 3537 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3538 ElementCount VF) const { 3539 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3540 assert(ID && "Expected intrinsic call!"); 3541 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3542 FastMathFlags FMF; 3543 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3544 FMF = FPMO->getFastMathFlags(); 3545 3546 SmallVector<const Value *> Arguments(CI->args()); 3547 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3548 SmallVector<Type *> ParamTys; 3549 std::transform(FTy->param_begin(), FTy->param_end(), 3550 std::back_inserter(ParamTys), 3551 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3552 3553 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3554 dyn_cast<IntrinsicInst>(CI)); 3555 return TTI.getIntrinsicInstrCost(CostAttrs, 3556 TargetTransformInfo::TCK_RecipThroughput); 3557 } 3558 3559 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3560 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3561 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3562 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3563 } 3564 3565 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3566 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3567 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3568 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3569 } 3570 3571 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3572 // For every instruction `I` in MinBWs, truncate the operands, create a 3573 // truncated version of `I` and reextend its result. 
InstCombine runs 3574 // later and will remove any ext/trunc pairs. 3575 SmallPtrSet<Value *, 4> Erased; 3576 for (const auto &KV : Cost->getMinimalBitwidths()) { 3577 // If the value wasn't vectorized, we must maintain the original scalar 3578 // type. The absence of the value from State indicates that it 3579 // wasn't vectorized. 3580 // FIXME: Should not rely on getVPValue at this point. 3581 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3582 if (!State.hasAnyVectorValue(Def)) 3583 continue; 3584 for (unsigned Part = 0; Part < UF; ++Part) { 3585 Value *I = State.get(Def, Part); 3586 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3587 continue; 3588 Type *OriginalTy = I->getType(); 3589 Type *ScalarTruncatedTy = 3590 IntegerType::get(OriginalTy->getContext(), KV.second); 3591 auto *TruncatedTy = VectorType::get( 3592 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3593 if (TruncatedTy == OriginalTy) 3594 continue; 3595 3596 IRBuilder<> B(cast<Instruction>(I)); 3597 auto ShrinkOperand = [&](Value *V) -> Value * { 3598 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3599 if (ZI->getSrcTy() == TruncatedTy) 3600 return ZI->getOperand(0); 3601 return B.CreateZExtOrTrunc(V, TruncatedTy); 3602 }; 3603 3604 // The actual instruction modification depends on the instruction type, 3605 // unfortunately. 3606 Value *NewI = nullptr; 3607 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3608 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3609 ShrinkOperand(BO->getOperand(1))); 3610 3611 // Any wrapping introduced by shrinking this operation shouldn't be 3612 // considered undefined behavior. So, we can't unconditionally copy 3613 // arithmetic wrapping flags to NewI. 3614 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3615 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3616 NewI = 3617 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3618 ShrinkOperand(CI->getOperand(1))); 3619 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3620 NewI = B.CreateSelect(SI->getCondition(), 3621 ShrinkOperand(SI->getTrueValue()), 3622 ShrinkOperand(SI->getFalseValue())); 3623 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3624 switch (CI->getOpcode()) { 3625 default: 3626 llvm_unreachable("Unhandled cast!"); 3627 case Instruction::Trunc: 3628 NewI = ShrinkOperand(CI->getOperand(0)); 3629 break; 3630 case Instruction::SExt: 3631 NewI = B.CreateSExtOrTrunc( 3632 CI->getOperand(0), 3633 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3634 break; 3635 case Instruction::ZExt: 3636 NewI = B.CreateZExtOrTrunc( 3637 CI->getOperand(0), 3638 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3639 break; 3640 } 3641 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3642 auto Elements0 = 3643 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3644 auto *O0 = B.CreateZExtOrTrunc( 3645 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3646 auto Elements1 = 3647 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3648 auto *O1 = B.CreateZExtOrTrunc( 3649 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3650 3651 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3652 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3653 // Don't do anything with the operands, just extend the result. 
3654 continue; 3655 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3656 auto Elements = 3657 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3658 auto *O0 = B.CreateZExtOrTrunc( 3659 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3660 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3661 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3662 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3663 auto Elements = 3664 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3665 auto *O0 = B.CreateZExtOrTrunc( 3666 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3667 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3668 } else { 3669 // If we don't know what to do, be conservative and don't do anything. 3670 continue; 3671 } 3672 3673 // Lastly, extend the result. 3674 NewI->takeName(cast<Instruction>(I)); 3675 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3676 I->replaceAllUsesWith(Res); 3677 cast<Instruction>(I)->eraseFromParent(); 3678 Erased.insert(I); 3679 State.reset(Def, Res, Part); 3680 } 3681 } 3682 3683 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3684 for (const auto &KV : Cost->getMinimalBitwidths()) { 3685 // If the value wasn't vectorized, we must maintain the original scalar 3686 // type. The absence of the value from State indicates that it 3687 // wasn't vectorized. 3688 // FIXME: Should not rely on getVPValue at this point. 3689 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3690 if (!State.hasAnyVectorValue(Def)) 3691 continue; 3692 for (unsigned Part = 0; Part < UF; ++Part) { 3693 Value *I = State.get(Def, Part); 3694 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3695 if (Inst && Inst->use_empty()) { 3696 Value *NewI = Inst->getOperand(0); 3697 Inst->eraseFromParent(); 3698 State.reset(Def, NewI, Part); 3699 } 3700 } 3701 } 3702 } 3703 3704 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 3705 // Insert truncates and extends for any truncated instructions as hints to 3706 // InstCombine. 3707 if (VF.isVector()) 3708 truncateToMinimalBitwidths(State); 3709 3710 // Fix widened non-induction PHIs by setting up the PHI operands. 3711 if (OrigPHIsToFix.size()) { 3712 assert(EnableVPlanNativePath && 3713 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3714 fixNonInductionPHIs(State); 3715 } 3716 3717 // At this point every instruction in the original loop is widened to a 3718 // vector form. Now we need to fix the recurrences in the loop. These PHI 3719 // nodes are currently empty because we did not want to introduce cycles. 3720 // This is the second stage of vectorizing recurrences. 3721 fixCrossIterationPHIs(State); 3722 3723 // Forget the original basic block. 3724 PSE.getSE()->forgetLoop(OrigLoop); 3725 3726 // If we inserted an edge from the middle block to the unique exit block, 3727 // update uses outside the loop (phis) to account for the newly inserted 3728 // edge. 3729 if (!Cost->requiresScalarEpilogue(VF)) { 3730 // Fix-up external users of the induction variables. 3731 for (auto &Entry : Legal->getInductionVars()) 3732 fixupIVUsers(Entry.first, Entry.second, 3733 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3734 IVEndValues[Entry.first], LoopMiddleBlock); 3735 3736 fixLCSSAPHIs(State); 3737 } 3738 3739 for (Instruction *PI : PredicatedInstructions) 3740 sinkScalarOperands(&*PI); 3741 3742 // Remove redundant induction instructions. 
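  // (For example, unrolling may leave behind identical getelementptr or
  //  insert/extractelement instructions for each part; cse() keeps the first
  //  occurrence and redirects uses of the duplicates to it. Illustrative note.)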
3743   cse(LoopVectorBody);
3744 
3745   // Set/update profile weights for the vector and remainder loops as the
3746   // original loop iterations are now distributed among them. Note that the
3747   // original loop, represented by LoopScalarBody, becomes the remainder loop.
3748   //
3749   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may end
3750   // up getting a slightly less precise result, but that should be OK since the
3751   // profile is not inherently precise anyway. Note also that a possible bypass
3752   // of the vector code caused by legality checks is ignored, optimistically
3753   // assigning all the weight to the vector loop.
3754   //
3755   // For scalable vectorization we can't know at compile time how many
3756   // iterations of the loop are handled in one vector iteration, so instead we
3757   // assume a pessimistic vscale of '1'.
3758   setProfileInfoAfterUnrolling(
3759       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3760       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3761 }
3762 
3763 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3764   // In order to support recurrences we need to be able to vectorize Phi nodes.
3765   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3766   // stage #2: We now need to fix the recurrences by adding incoming edges to
3767   // the currently empty PHI nodes. At this point every instruction in the
3768   // original loop is widened to a vector form so we can use them to construct
3769   // the incoming edges.
3770   VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
3771   for (VPRecipeBase &R : Header->phis()) {
3772     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3773       fixReduction(ReductionPhi, State);
3774     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3775       fixFirstOrderRecurrence(FOR, State);
3776   }
3777 }
3778 
3779 void InnerLoopVectorizer::fixFirstOrderRecurrence(
3780     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3781   // This is the second phase of vectorizing first-order recurrences. An
3782   // overview of the transformation is described below. Suppose we have the
3783   // following loop.
3784   //
3785   //   for (int i = 0; i < n; ++i)
3786   //     b[i] = a[i] - a[i - 1];
3787   //
3788   // There is a first-order recurrence on "a". For this loop, the shorthand
3789   // scalar IR looks like:
3790   //
3791   //   scalar.ph:
3792   //     s_init = a[-1]
3793   //     br scalar.body
3794   //
3795   //   scalar.body:
3796   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3797   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3798   //     s2 = a[i]
3799   //     b[i] = s2 - s1
3800   //     br cond, scalar.body, ...
3801   //
3802   // In this example, s1 is a recurrence because its value depends on the
3803   // previous iteration. In the first phase of vectorization, we created a
3804   // vector phi v1 for s1. We now complete the vectorization and produce the
3805   // shorthand vector IR shown below (for VF = 4, UF = 1).
3806 // 3807 // vector.ph: 3808 // v_init = vector(..., ..., ..., a[-1]) 3809 // br vector.body 3810 // 3811 // vector.body 3812 // i = phi [0, vector.ph], [i+4, vector.body] 3813 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3814 // v2 = a[i, i+1, i+2, i+3]; 3815 // v3 = vector(v1(3), v2(0, 1, 2)) 3816 // b[i, i+1, i+2, i+3] = v2 - v3 3817 // br cond, vector.body, middle.block 3818 // 3819 // middle.block: 3820 // x = v2(3) 3821 // br scalar.ph 3822 // 3823 // scalar.ph: 3824 // s_init = phi [x, middle.block], [a[-1], otherwise] 3825 // br scalar.body 3826 // 3827 // After execution completes the vector loop, we extract the next value of 3828 // the recurrence (x) to use as the initial value in the scalar loop. 3829 3830 // Extract the last vector element in the middle block. This will be the 3831 // initial value for the recurrence when jumping to the scalar loop. 3832 VPValue *PreviousDef = PhiR->getBackedgeValue(); 3833 Value *Incoming = State.get(PreviousDef, UF - 1); 3834 auto *ExtractForScalar = Incoming; 3835 auto *IdxTy = Builder.getInt32Ty(); 3836 if (VF.isVector()) { 3837 auto *One = ConstantInt::get(IdxTy, 1); 3838 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3839 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 3840 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 3841 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 3842 "vector.recur.extract"); 3843 } 3844 // Extract the second last element in the middle block if the 3845 // Phi is used outside the loop. We need to extract the phi itself 3846 // and not the last element (the phi update in the current iteration). This 3847 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3848 // when the scalar loop is not run at all. 3849 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3850 if (VF.isVector()) { 3851 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 3852 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 3853 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3854 Incoming, Idx, "vector.recur.extract.for.phi"); 3855 } else if (UF > 1) 3856 // When loop is unrolled without vectorizing, initialize 3857 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 3858 // of `Incoming`. This is analogous to the vectorized case above: extracting 3859 // the second last element when VF > 1. 3860 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 3861 3862 // Fix the initial value of the original recurrence in the scalar loop. 3863 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3864 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 3865 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3866 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 3867 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3868 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3869 Start->addIncoming(Incoming, BB); 3870 } 3871 3872 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3873 Phi->setName("scalar.recur"); 3874 3875 // Finally, fix users of the recurrence outside the loop. The users will need 3876 // either the last value of the scalar recurrence or the last value of the 3877 // vector recurrence we extracted in the middle block. Since the loop is in 3878 // LCSSA form, we just need to find all the phi nodes for the original scalar 3879 // recurrence in the exit block, and then add an edge for the middle block. 
3880   // Note that LCSSA does not imply single entry when the original scalar loop
3881   // had multiple exiting edges (as we always run the last iteration in the
3882   // scalar epilogue); in that case, there is no edge from middle to exit and
3883   // thus no phis need to be updated.
3884   if (!Cost->requiresScalarEpilogue(VF))
3885     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3886       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
3887         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3888 }
3889 
3890 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3891                                        VPTransformState &State) {
3892   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
3893   // Get its reduction variable descriptor.
3894   assert(Legal->isReductionVariable(OrigPhi) &&
3895          "Unable to find the reduction variable");
3896   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3897 
3898   RecurKind RK = RdxDesc.getRecurrenceKind();
3899   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3900   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3901   setDebugLocFromInst(ReductionStartValue);
3902 
3903   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3904   // This is the vector-clone of the value that leaves the loop.
3905   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3906 
3907   // Wrap flags are in general invalid after vectorization, clear them.
3908   clearReductionWrapFlags(RdxDesc, State);
3909 
3910   // Before each round, move the insertion point right between
3911   // the PHIs and the values we are going to write.
3912   // This allows us to write both PHINodes and the extractelement
3913   // instructions.
3914   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3915 
3916   setDebugLocFromInst(LoopExitInst);
3917 
3918   Type *PhiTy = OrigPhi->getType();
3919   // If tail is folded by masking, the vector value to leave the loop should be
3920   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3921   // instead of the former. For an inloop reduction the reduction will already
3922   // be predicated, and does not need to be handled here.
3923   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3924     for (unsigned Part = 0; Part < UF; ++Part) {
3925       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3926       Value *Sel = nullptr;
3927       for (User *U : VecLoopExitInst->users()) {
3928         if (isa<SelectInst>(U)) {
3929           assert(!Sel && "Reduction exit feeding two selects");
3930           Sel = U;
3931         } else
3932           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3933       }
3934       assert(Sel && "Reduction exit feeds no select");
3935       State.reset(LoopExitInstDef, Sel, Part);
3936 
3937       // If the target can create a predicated operator for the reduction at no
3938       // extra cost in the loop (for example a predicated vadd), it can be
3939       // cheaper for the select to remain in the loop than be sunk out of it,
3940       // and so use the select value for the phi instead of the old
3941       // LoopExitValue.
3942 if (PreferPredicatedReductionSelect || 3943 TTI->preferPredicatedReductionSelect( 3944 RdxDesc.getOpcode(), PhiTy, 3945 TargetTransformInfo::ReductionFlags())) { 3946 auto *VecRdxPhi = 3947 cast<PHINode>(State.get(PhiR, Part)); 3948 VecRdxPhi->setIncomingValueForBlock( 3949 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 3950 } 3951 } 3952 } 3953 3954 // If the vector reduction can be performed in a smaller type, we truncate 3955 // then extend the loop exit value to enable InstCombine to evaluate the 3956 // entire expression in the smaller type. 3957 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 3958 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 3959 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3960 Builder.SetInsertPoint( 3961 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3962 VectorParts RdxParts(UF); 3963 for (unsigned Part = 0; Part < UF; ++Part) { 3964 RdxParts[Part] = State.get(LoopExitInstDef, Part); 3965 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3966 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3967 : Builder.CreateZExt(Trunc, VecTy); 3968 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 3969 if (U != Trunc) { 3970 U->replaceUsesOfWith(RdxParts[Part], Extnd); 3971 RdxParts[Part] = Extnd; 3972 } 3973 } 3974 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3975 for (unsigned Part = 0; Part < UF; ++Part) { 3976 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3977 State.reset(LoopExitInstDef, RdxParts[Part], Part); 3978 } 3979 } 3980 3981 // Reduce all of the unrolled parts into a single vector. 3982 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 3983 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 3984 3985 // The middle block terminator has already been assigned a DebugLoc here (the 3986 // OrigLoop's single latch terminator). We want the whole middle block to 3987 // appear to execute on this line because: (a) it is all compiler generated, 3988 // (b) these instructions are always executed after evaluating the latch 3989 // conditional branch, and (c) other passes may add new predecessors which 3990 // terminate on this line. This is the easiest way to ensure we don't 3991 // accidentally cause an extra step back into the loop while debugging. 3992 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 3993 if (PhiR->isOrdered()) 3994 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 3995 else { 3996 // Floating-point operations should have some FMF to enable the reduction. 3997 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 3998 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 3999 for (unsigned Part = 1; Part < UF; ++Part) { 4000 Value *RdxPart = State.get(LoopExitInstDef, Part); 4001 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4002 ReducedPartRdx = Builder.CreateBinOp( 4003 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4004 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4005 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4006 ReducedPartRdx, RdxPart); 4007 else 4008 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4009 } 4010 } 4011 4012 // Create the reduction after the loop. Note that inloop reductions create the 4013 // target reduction in the loop using a Reduction recipe. 
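  // (Sketch, assuming an integer add reduction with VF = 4: the call below
  //  ultimately emits something like
  //    %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
  //  to collapse the vector into the final scalar; other recurrence kinds are
  //  lowered analogously.)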
4014   if (VF.isVector() && !PhiR->isInLoop()) {
4015     ReducedPartRdx =
4016         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
4017     // If the reduction can be performed in a smaller type, we need to extend
4018     // the reduction to the wider type before we branch to the original loop.
4019     if (PhiTy != RdxDesc.getRecurrenceType())
4020       ReducedPartRdx = RdxDesc.isSigned()
4021                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4022                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4023   }
4024 
4025   PHINode *ResumePhi =
4026       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
4027 
4028   // Create a phi node that merges control-flow from the backedge-taken check
4029   // block and the middle block.
4030   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4031                                         LoopScalarPreHeader->getTerminator());
4032 
4033   // If we are fixing reductions in the epilogue loop then we should already
4034   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
4035   // we carry over the incoming values correctly.
4036   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4037     if (Incoming == LoopMiddleBlock)
4038       BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4039     else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4040       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4041                               Incoming);
4042     else
4043       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4044   }
4045 
4046   // Set the resume value for this reduction.
4047   ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4048 
4049   // Now, we need to fix the users of the reduction variable
4050   // inside and outside of the scalar remainder loop.
4051 
4052   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4053   // in the exit blocks. See comment on analogous loop in
4054   // fixFirstOrderRecurrence for a more complete explanation of the logic.
4055   if (!Cost->requiresScalarEpilogue(VF))
4056     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4057       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
4058         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4059 
4060   // Fix the scalar loop reduction variable with the incoming reduction sum
4061   // from the vector body and from the backedge value.
4062   int IncomingEdgeBlockIdx =
4063       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4064   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4065   // Pick the other block.
4066   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4067   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4068   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4069 }
4070 
4071 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4072                                                   VPTransformState &State) {
4073   RecurKind RK = RdxDesc.getRecurrenceKind();
4074   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4075     return;
4076 
4077   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4078   assert(LoopExitInstr && "null loop exit instruction");
4079   SmallVector<Instruction *, 8> Worklist;
4080   SmallPtrSet<Instruction *, 8> Visited;
4081   Worklist.push_back(LoopExitInstr);
4082   Visited.insert(LoopExitInstr);
4083 
4084   while (!Worklist.empty()) {
4085     Instruction *Cur = Worklist.pop_back_val();
4086     if (isa<OverflowingBinaryOperator>(Cur))
4087       for (unsigned Part = 0; Part < UF; ++Part) {
4088         // FIXME: Should not rely on getVPValue at this point.
4089 Value *V = State.get(State.Plan->getVPValue(Cur, true), Part); 4090 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4091 } 4092 4093 for (User *U : Cur->users()) { 4094 Instruction *UI = cast<Instruction>(U); 4095 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4096 Visited.insert(UI).second) 4097 Worklist.push_back(UI); 4098 } 4099 } 4100 } 4101 4102 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4103 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4104 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4105 // Some phis were already hand updated by the reduction and recurrence 4106 // code above, leave them alone. 4107 continue; 4108 4109 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4110 // Non-instruction incoming values will have only one value. 4111 4112 VPLane Lane = VPLane::getFirstLane(); 4113 if (isa<Instruction>(IncomingValue) && 4114 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4115 VF)) 4116 Lane = VPLane::getLastLaneForVF(VF); 4117 4118 // Can be a loop invariant incoming value or the last scalar value to be 4119 // extracted from the vectorized loop. 4120 // FIXME: Should not rely on getVPValue at this point. 4121 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4122 Value *lastIncomingValue = 4123 OrigLoop->isLoopInvariant(IncomingValue) 4124 ? IncomingValue 4125 : State.get(State.Plan->getVPValue(IncomingValue, true), 4126 VPIteration(UF - 1, Lane)); 4127 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4128 } 4129 } 4130 4131 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4132 // The basic block and loop containing the predicated instruction. 4133 auto *PredBB = PredInst->getParent(); 4134 auto *VectorLoop = LI->getLoopFor(PredBB); 4135 4136 // Initialize a worklist with the operands of the predicated instruction. 4137 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4138 4139 // Holds instructions that we need to analyze again. An instruction may be 4140 // reanalyzed if we don't yet know if we can sink it or not. 4141 SmallVector<Instruction *, 8> InstsToReanalyze; 4142 4143 // Returns true if a given use occurs in the predicated block. Phi nodes use 4144 // their operands in their corresponding predecessor blocks. 4145 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4146 auto *I = cast<Instruction>(U.getUser()); 4147 BasicBlock *BB = I->getParent(); 4148 if (auto *Phi = dyn_cast<PHINode>(I)) 4149 BB = Phi->getIncomingBlock( 4150 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4151 return BB == PredBB; 4152 }; 4153 4154 // Iteratively sink the scalarized operands of the predicated instruction 4155 // into the block we created for it. When an instruction is sunk, it's 4156 // operands are then added to the worklist. The algorithm ends after one pass 4157 // through the worklist doesn't sink a single instruction. 4158 bool Changed; 4159 do { 4160 // Add the instructions that need to be reanalyzed to the worklist, and 4161 // reset the changed indicator. 4162 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4163 InstsToReanalyze.clear(); 4164 Changed = false; 4165 4166 while (!Worklist.empty()) { 4167 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4168 4169 // We can't sink an instruction if it is a phi node, is not in the loop, 4170 // or may have side effects. 
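      // For example (illustrative), a scalarized address computation such as
      // an add or getelementptr feeding only the predicated store can be sunk
      // into the predicated block, whereas a call with side effects or a phi
      // must stay where it is.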
4171 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4172 I->mayHaveSideEffects()) 4173 continue; 4174 4175 // If the instruction is already in PredBB, check if we can sink its 4176 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4177 // sinking the scalar instruction I, hence it appears in PredBB; but it 4178 // may have failed to sink I's operands (recursively), which we try 4179 // (again) here. 4180 if (I->getParent() == PredBB) { 4181 Worklist.insert(I->op_begin(), I->op_end()); 4182 continue; 4183 } 4184 4185 // It's legal to sink the instruction if all its uses occur in the 4186 // predicated block. Otherwise, there's nothing to do yet, and we may 4187 // need to reanalyze the instruction. 4188 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4189 InstsToReanalyze.push_back(I); 4190 continue; 4191 } 4192 4193 // Move the instruction to the beginning of the predicated block, and add 4194 // it's operands to the worklist. 4195 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4196 Worklist.insert(I->op_begin(), I->op_end()); 4197 4198 // The sinking may have enabled other instructions to be sunk, so we will 4199 // need to iterate. 4200 Changed = true; 4201 } 4202 } while (Changed); 4203 } 4204 4205 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4206 for (PHINode *OrigPhi : OrigPHIsToFix) { 4207 VPWidenPHIRecipe *VPPhi = 4208 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4209 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4210 // Make sure the builder has a valid insert point. 4211 Builder.SetInsertPoint(NewPhi); 4212 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4213 VPValue *Inc = VPPhi->getIncomingValue(i); 4214 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4215 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4216 } 4217 } 4218 } 4219 4220 bool InnerLoopVectorizer::useOrderedReductions( 4221 const RecurrenceDescriptor &RdxDesc) { 4222 return Cost->useOrderedReductions(RdxDesc); 4223 } 4224 4225 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4226 VPWidenPHIRecipe *PhiR, 4227 VPTransformState &State) { 4228 PHINode *P = cast<PHINode>(PN); 4229 if (EnableVPlanNativePath) { 4230 // Currently we enter here in the VPlan-native path for non-induction 4231 // PHIs where all control flow is uniform. We simply widen these PHIs. 4232 // Create a vector phi with no operands - the vector phi operands will be 4233 // set at the end of vector code generation. 4234 Type *VecTy = (State.VF.isScalar()) 4235 ? PN->getType() 4236 : VectorType::get(PN->getType(), State.VF); 4237 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4238 State.set(PhiR, VecPhi, 0); 4239 OrigPHIsToFix.push_back(P); 4240 4241 return; 4242 } 4243 4244 assert(PN->getParent() == OrigLoop->getHeader() && 4245 "Non-header phis should have been handled elsewhere"); 4246 4247 // In order to support recurrences we need to be able to vectorize Phi nodes. 4248 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4249 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4250 // this value when we vectorize all of the instructions that use the PHI. 4251 4252 assert(!Legal->isReductionVariable(P) && 4253 "reductions should be handled elsewhere"); 4254 4255 setDebugLocFromInst(P); 4256 4257 // This PHINode must be an induction variable. 4258 // Make sure that we know about it. 
4259 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4260 4261 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4262 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4263 4264 auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV(); 4265 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 4266 4267 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4268 // which can be found from the original scalar operations. 4269 switch (II.getKind()) { 4270 case InductionDescriptor::IK_NoInduction: 4271 llvm_unreachable("Unknown induction"); 4272 case InductionDescriptor::IK_IntInduction: 4273 case InductionDescriptor::IK_FpInduction: 4274 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4275 case InductionDescriptor::IK_PtrInduction: { 4276 // Handle the pointer induction variable case. 4277 assert(P->getType()->isPointerTy() && "Unexpected type."); 4278 4279 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4280 // This is the normalized GEP that starts counting at zero. 4281 Value *PtrInd = 4282 Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType()); 4283 // Determine the number of scalars we need to generate for each unroll 4284 // iteration. If the instruction is uniform, we only need to generate the 4285 // first lane. Otherwise, we generate all VF values. 4286 bool IsUniform = vputils::onlyFirstLaneUsed(PhiR); 4287 assert((IsUniform || !State.VF.isScalable()) && 4288 "Cannot scalarize a scalable VF"); 4289 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 4290 4291 for (unsigned Part = 0; Part < UF; ++Part) { 4292 Value *PartStart = 4293 createStepForVF(Builder, PtrInd->getType(), VF, Part); 4294 4295 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4296 Value *Idx = Builder.CreateAdd( 4297 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4298 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4299 4300 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 4301 State.CFG.PrevBB->getTerminator()); 4302 Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, 4303 II.getStartValue(), Step, II); 4304 SclrGep->setName("next.gep"); 4305 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4306 } 4307 } 4308 return; 4309 } 4310 assert(isa<SCEVConstant>(II.getStep()) && 4311 "Induction step not a SCEV constant!"); 4312 Type *PhiType = II.getStep()->getType(); 4313 4314 // Build a pointer phi 4315 Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue(); 4316 Type *ScStValueType = ScalarStartValue->getType(); 4317 PHINode *NewPointerPhi = 4318 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 4319 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4320 4321 // A pointer induction, performed by using a gep 4322 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4323 Instruction *InductionLoc = LoopLatch->getTerminator(); 4324 const SCEV *ScalarStep = II.getStep(); 4325 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4326 Value *ScalarStepValue = 4327 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4328 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4329 Value *NumUnrolledElems = 4330 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4331 Value *InductionGEP = GetElementPtrInst::Create( 4332 II.getElementType(), NewPointerPhi, 4333 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4334 InductionLoc); 4335 NewPointerPhi->addIncoming(InductionGEP, 
LoopLatch); 4336 4337 // Create UF many actual address geps that use the pointer 4338 // phi as base and a vectorized version of the step value 4339 // (<step*0, ..., step*N>) as offset. 4340 for (unsigned Part = 0; Part < State.UF; ++Part) { 4341 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4342 Value *StartOffsetScalar = 4343 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4344 Value *StartOffset = 4345 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4346 // Create a vector of consecutive numbers from zero to VF. 4347 StartOffset = 4348 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4349 4350 Value *GEP = Builder.CreateGEP( 4351 II.getElementType(), NewPointerPhi, 4352 Builder.CreateMul( 4353 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4354 "vector.gep")); 4355 State.set(PhiR, GEP, Part); 4356 } 4357 } 4358 } 4359 } 4360 4361 /// A helper function for checking whether an integer division-related 4362 /// instruction may divide by zero (in which case it must be predicated if 4363 /// executed conditionally in the scalar code). 4364 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4365 /// Non-zero divisors that are non compile-time constants will not be 4366 /// converted into multiplication, so we will still end up scalarizing 4367 /// the division, but can do so w/o predication. 4368 static bool mayDivideByZero(Instruction &I) { 4369 assert((I.getOpcode() == Instruction::UDiv || 4370 I.getOpcode() == Instruction::SDiv || 4371 I.getOpcode() == Instruction::URem || 4372 I.getOpcode() == Instruction::SRem) && 4373 "Unexpected instruction"); 4374 Value *Divisor = I.getOperand(1); 4375 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4376 return !CInt || CInt->isZero(); 4377 } 4378 4379 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4380 VPUser &ArgOperands, 4381 VPTransformState &State) { 4382 assert(!isa<DbgInfoIntrinsic>(I) && 4383 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4384 setDebugLocFromInst(&I); 4385 4386 Module *M = I.getParent()->getParent()->getParent(); 4387 auto *CI = cast<CallInst>(&I); 4388 4389 SmallVector<Type *, 4> Tys; 4390 for (Value *ArgOperand : CI->args()) 4391 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4392 4393 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4394 4395 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4396 // version of the instruction. 4397 // Is it beneficial to perform intrinsic call compared to lib call? 4398 bool NeedToScalarize = false; 4399 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4400 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4401 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4402 assert((UseVectorIntrinsic || !NeedToScalarize) && 4403 "Instruction should be scalarized elsewhere."); 4404 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4405 "Either the intrinsic cost or vector call cost must be valid"); 4406 4407 for (unsigned Part = 0; Part < UF; ++Part) { 4408 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4409 SmallVector<Value *, 4> Args; 4410 for (auto &I : enumerate(ArgOperands.operands())) { 4411 // Some intrinsics have a scalar argument - don't replace it with a 4412 // vector. 
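    // For example (illustrative), the exponent of llvm.powi must stay a
    // scalar even when the base operand is widened, roughly:
    //   %r = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> %b, i32 %e)
    // Such operands are taken from lane (0, 0) below instead of being
    // replaced by a per-part vector.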
4413 Value *Arg; 4414 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4415 Arg = State.get(I.value(), Part); 4416 else { 4417 Arg = State.get(I.value(), VPIteration(0, 0)); 4418 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 4419 TysForDecl.push_back(Arg->getType()); 4420 } 4421 Args.push_back(Arg); 4422 } 4423 4424 Function *VectorF; 4425 if (UseVectorIntrinsic) { 4426 // Use vector version of the intrinsic. 4427 if (VF.isVector()) 4428 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4429 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4430 assert(VectorF && "Can't retrieve vector intrinsic."); 4431 } else { 4432 // Use vector version of the function call. 4433 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4434 #ifndef NDEBUG 4435 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4436 "Can't create vector function."); 4437 #endif 4438 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4439 } 4440 SmallVector<OperandBundleDef, 1> OpBundles; 4441 CI->getOperandBundlesAsDefs(OpBundles); 4442 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4443 4444 if (isa<FPMathOperator>(V)) 4445 V->copyFastMathFlags(CI); 4446 4447 State.set(Def, V, Part); 4448 addMetadata(V, &I); 4449 } 4450 } 4451 4452 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4453 // We should not collect Scalars more than once per VF. Right now, this 4454 // function is called from collectUniformsAndScalars(), which already does 4455 // this check. Collecting Scalars for VF=1 does not make any sense. 4456 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4457 "This function should not be visited twice for the same VF"); 4458 4459 SmallSetVector<Instruction *, 8> Worklist; 4460 4461 // These sets are used to seed the analysis with pointers used by memory 4462 // accesses that will remain scalar. 4463 SmallSetVector<Instruction *, 8> ScalarPtrs; 4464 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4465 auto *Latch = TheLoop->getLoopLatch(); 4466 4467 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4468 // The pointer operands of loads and stores will be scalar as long as the 4469 // memory access is not a gather or scatter operation. The value operand of a 4470 // store will remain scalar if the store is scalarized. 4471 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4472 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4473 assert(WideningDecision != CM_Unknown && 4474 "Widening decision should be ready at this moment"); 4475 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4476 if (Ptr == Store->getValueOperand()) 4477 return WideningDecision == CM_Scalarize; 4478 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4479 "Ptr is neither a value or pointer operand"); 4480 return WideningDecision != CM_GatherScatter; 4481 }; 4482 4483 // A helper that returns true if the given value is a bitcast or 4484 // getelementptr instruction contained in the loop. 4485 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4486 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4487 isa<GetElementPtrInst>(V)) && 4488 !TheLoop->isLoopInvariant(V); 4489 }; 4490 4491 // A helper that evaluates a memory access's use of a pointer. If the use will 4492 // be a scalar use and the pointer is only used by memory accesses, we place 4493 // the pointer in ScalarPtrs. 
Otherwise, the pointer is placed in 4494 // PossibleNonScalarPtrs. 4495 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4496 // We only care about bitcast and getelementptr instructions contained in 4497 // the loop. 4498 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4499 return; 4500 4501 // If the pointer has already been identified as scalar (e.g., if it was 4502 // also identified as uniform), there's nothing to do. 4503 auto *I = cast<Instruction>(Ptr); 4504 if (Worklist.count(I)) 4505 return; 4506 4507 // If the use of the pointer will be a scalar use, and all users of the 4508 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4509 // place the pointer in PossibleNonScalarPtrs. 4510 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4511 return isa<LoadInst>(U) || isa<StoreInst>(U); 4512 })) 4513 ScalarPtrs.insert(I); 4514 else 4515 PossibleNonScalarPtrs.insert(I); 4516 }; 4517 4518 // We seed the scalars analysis with three classes of instructions: (1) 4519 // instructions marked uniform-after-vectorization and (2) bitcast, 4520 // getelementptr and (pointer) phi instructions used by memory accesses 4521 // requiring a scalar use. 4522 // 4523 // (1) Add to the worklist all instructions that have been identified as 4524 // uniform-after-vectorization. 4525 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4526 4527 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4528 // memory accesses requiring a scalar use. The pointer operands of loads and 4529 // stores will be scalar as long as the memory accesses is not a gather or 4530 // scatter operation. The value operand of a store will remain scalar if the 4531 // store is scalarized. 4532 for (auto *BB : TheLoop->blocks()) 4533 for (auto &I : *BB) { 4534 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4535 evaluatePtrUse(Load, Load->getPointerOperand()); 4536 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4537 evaluatePtrUse(Store, Store->getPointerOperand()); 4538 evaluatePtrUse(Store, Store->getValueOperand()); 4539 } 4540 } 4541 for (auto *I : ScalarPtrs) 4542 if (!PossibleNonScalarPtrs.count(I)) { 4543 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4544 Worklist.insert(I); 4545 } 4546 4547 // Insert the forced scalars. 4548 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4549 // induction variable when the PHI user is scalarized. 4550 auto ForcedScalar = ForcedScalars.find(VF); 4551 if (ForcedScalar != ForcedScalars.end()) 4552 for (auto *I : ForcedScalar->second) 4553 Worklist.insert(I); 4554 4555 // Expand the worklist by looking through any bitcasts and getelementptr 4556 // instructions we've already identified as scalar. This is similar to the 4557 // expansion step in collectLoopUniforms(); however, here we're only 4558 // expanding to include additional bitcasts and getelementptr instructions. 
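  // For example (illustrative): if the address of a scalarized store is
  //   %gep = getelementptr inbounds i32, i32* %p, i64 %iv
  // and %p is itself a loop-varying bitcast or getelementptr whose only users
  // are %gep and other scalar memory uses, then %p is added to the worklist
  // below as well.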
4559 unsigned Idx = 0; 4560 while (Idx != Worklist.size()) { 4561 Instruction *Dst = Worklist[Idx++]; 4562 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4563 continue; 4564 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4565 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4566 auto *J = cast<Instruction>(U); 4567 return !TheLoop->contains(J) || Worklist.count(J) || 4568 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4569 isScalarUse(J, Src)); 4570 })) { 4571 Worklist.insert(Src); 4572 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4573 } 4574 } 4575 4576 // An induction variable will remain scalar if all users of the induction 4577 // variable and induction variable update remain scalar. 4578 for (auto &Induction : Legal->getInductionVars()) { 4579 auto *Ind = Induction.first; 4580 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4581 4582 // If tail-folding is applied, the primary induction variable will be used 4583 // to feed a vector compare. 4584 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4585 continue; 4586 4587 // Returns true if \p Indvar is a pointer induction that is used directly by 4588 // load/store instruction \p I. 4589 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4590 Instruction *I) { 4591 return Induction.second.getKind() == 4592 InductionDescriptor::IK_PtrInduction && 4593 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4594 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4595 }; 4596 4597 // Determine if all users of the induction variable are scalar after 4598 // vectorization. 4599 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4600 auto *I = cast<Instruction>(U); 4601 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4602 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4603 }); 4604 if (!ScalarInd) 4605 continue; 4606 4607 // Determine if all users of the induction variable update instruction are 4608 // scalar after vectorization. 4609 auto ScalarIndUpdate = 4610 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4611 auto *I = cast<Instruction>(U); 4612 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4613 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4614 }); 4615 if (!ScalarIndUpdate) 4616 continue; 4617 4618 // The induction variable and its update instruction will remain scalar. 4619 Worklist.insert(Ind); 4620 Worklist.insert(IndUpdate); 4621 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4622 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4623 << "\n"); 4624 } 4625 4626 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4627 } 4628 4629 bool LoopVectorizationCostModel::isScalarWithPredication( 4630 Instruction *I, ElementCount VF) const { 4631 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4632 return false; 4633 switch(I->getOpcode()) { 4634 default: 4635 break; 4636 case Instruction::Load: 4637 case Instruction::Store: { 4638 if (!Legal->isMaskRequired(I)) 4639 return false; 4640 auto *Ptr = getLoadStorePointerOperand(I); 4641 auto *Ty = getLoadStoreType(I); 4642 Type *VTy = Ty; 4643 if (VF.isVector()) 4644 VTy = VectorType::get(Ty, VF); 4645 const Align Alignment = getLoadStoreAlignment(I); 4646 return isa<LoadInst>(I) ? 
!(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4647 TTI.isLegalMaskedGather(VTy, Alignment)) 4648 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4649 TTI.isLegalMaskedScatter(VTy, Alignment)); 4650 } 4651 case Instruction::UDiv: 4652 case Instruction::SDiv: 4653 case Instruction::SRem: 4654 case Instruction::URem: 4655 return mayDivideByZero(*I); 4656 } 4657 return false; 4658 } 4659 4660 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4661 Instruction *I, ElementCount VF) { 4662 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4663 assert(getWideningDecision(I, VF) == CM_Unknown && 4664 "Decision should not be set yet."); 4665 auto *Group = getInterleavedAccessGroup(I); 4666 assert(Group && "Must have a group."); 4667 4668 // If the instruction's allocated size doesn't equal it's type size, it 4669 // requires padding and will be scalarized. 4670 auto &DL = I->getModule()->getDataLayout(); 4671 auto *ScalarTy = getLoadStoreType(I); 4672 if (hasIrregularType(ScalarTy, DL)) 4673 return false; 4674 4675 // Check if masking is required. 4676 // A Group may need masking for one of two reasons: it resides in a block that 4677 // needs predication, or it was decided to use masking to deal with gaps 4678 // (either a gap at the end of a load-access that may result in a speculative 4679 // load, or any gaps in a store-access). 4680 bool PredicatedAccessRequiresMasking = 4681 blockNeedsPredicationForAnyReason(I->getParent()) && 4682 Legal->isMaskRequired(I); 4683 bool LoadAccessWithGapsRequiresEpilogMasking = 4684 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4685 !isScalarEpilogueAllowed(); 4686 bool StoreAccessWithGapsRequiresMasking = 4687 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4688 if (!PredicatedAccessRequiresMasking && 4689 !LoadAccessWithGapsRequiresEpilogMasking && 4690 !StoreAccessWithGapsRequiresMasking) 4691 return true; 4692 4693 // If masked interleaving is required, we expect that the user/target had 4694 // enabled it, because otherwise it either wouldn't have been created or 4695 // it should have been invalidated by the CostModel. 4696 assert(useMaskedInterleavedAccesses(TTI) && 4697 "Masked interleave-groups for predicated accesses are not enabled."); 4698 4699 if (Group->isReverse()) 4700 return false; 4701 4702 auto *Ty = getLoadStoreType(I); 4703 const Align Alignment = getLoadStoreAlignment(I); 4704 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4705 : TTI.isLegalMaskedStore(Ty, Alignment); 4706 } 4707 4708 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4709 Instruction *I, ElementCount VF) { 4710 // Get and ensure we have a valid memory instruction. 4711 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4712 4713 auto *Ptr = getLoadStorePointerOperand(I); 4714 auto *ScalarTy = getLoadStoreType(I); 4715 4716 // In order to be widened, the pointer should be consecutive, first of all. 4717 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4718 return false; 4719 4720 // If the instruction is a store located in a predicated block, it will be 4721 // scalarized. 4722 if (isScalarWithPredication(I, VF)) 4723 return false; 4724 4725 // If the instruction's allocated size doesn't equal it's type size, it 4726 // requires padding and will be scalarized. 
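  // For example (illustrative): with a typical data layout an i24 has a type
  // size of 24 bits but an alloc size of 32 bits, so four consecutive i24
  // objects in memory do not form a contiguous <4 x i24>; such accesses are
  // left to the scalarization path instead.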
4727 auto &DL = I->getModule()->getDataLayout(); 4728 if (hasIrregularType(ScalarTy, DL)) 4729 return false; 4730 4731 return true; 4732 } 4733 4734 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4735 // We should not collect Uniforms more than once per VF. Right now, 4736 // this function is called from collectUniformsAndScalars(), which 4737 // already does this check. Collecting Uniforms for VF=1 does not make any 4738 // sense. 4739 4740 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4741 "This function should not be visited twice for the same VF"); 4742 4743 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4744 // not analyze again. Uniforms.count(VF) will return 1. 4745 Uniforms[VF].clear(); 4746 4747 // We now know that the loop is vectorizable! 4748 // Collect instructions inside the loop that will remain uniform after 4749 // vectorization. 4750 4751 // Global values, params and instructions outside of current loop are out of 4752 // scope. 4753 auto isOutOfScope = [&](Value *V) -> bool { 4754 Instruction *I = dyn_cast<Instruction>(V); 4755 return (!I || !TheLoop->contains(I)); 4756 }; 4757 4758 // Worklist containing uniform instructions demanding lane 0. 4759 SetVector<Instruction *> Worklist; 4760 BasicBlock *Latch = TheLoop->getLoopLatch(); 4761 4762 // Add uniform instructions demanding lane 0 to the worklist. Instructions 4763 // that are scalar with predication must not be considered uniform after 4764 // vectorization, because that would create an erroneous replicating region 4765 // where only a single instance out of VF should be formed. 4766 // TODO: optimize such seldom cases if found important, see PR40816. 4767 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4768 if (isOutOfScope(I)) { 4769 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 4770 << *I << "\n"); 4771 return; 4772 } 4773 if (isScalarWithPredication(I, VF)) { 4774 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4775 << *I << "\n"); 4776 return; 4777 } 4778 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4779 Worklist.insert(I); 4780 }; 4781 4782 // Start with the conditional branch. If the branch condition is an 4783 // instruction contained in the loop that is only used by the branch, it is 4784 // uniform. 4785 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4786 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4787 addToWorklistIfAllowed(Cmp); 4788 4789 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 4790 InstWidening WideningDecision = getWideningDecision(I, VF); 4791 assert(WideningDecision != CM_Unknown && 4792 "Widening decision should be ready at this moment"); 4793 4794 // A uniform memory op is itself uniform. We exclude uniform stores 4795 // here as they demand the last lane, not the first one. 4796 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 4797 assert(WideningDecision == CM_Scalarize); 4798 return true; 4799 } 4800 4801 return (WideningDecision == CM_Widen || 4802 WideningDecision == CM_Widen_Reverse || 4803 WideningDecision == CM_Interleave); 4804 }; 4805 4806 4807 // Returns true if Ptr is the pointer operand of a memory access instruction 4808 // I, and I is known to not require scalarization. 
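  // For example (illustrative), a consecutive load that received a CM_Widen
  // decision, such as
  //   %v = load i32, i32* %gep, align 4
  // consumes only a single scalar address per unrolled part (lane 0 of %gep),
  // so this use of %gep counts as a uniform use here.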
4809 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4810 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4811 }; 4812 4813 // Holds a list of values which are known to have at least one uniform use. 4814 // Note that there may be other uses which aren't uniform. A "uniform use" 4815 // here is something which only demands lane 0 of the unrolled iterations; 4816 // it does not imply that all lanes produce the same value (e.g. this is not 4817 // the usual meaning of uniform) 4818 SetVector<Value *> HasUniformUse; 4819 4820 // Scan the loop for instructions which are either a) known to have only 4821 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 4822 for (auto *BB : TheLoop->blocks()) 4823 for (auto &I : *BB) { 4824 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 4825 switch (II->getIntrinsicID()) { 4826 case Intrinsic::sideeffect: 4827 case Intrinsic::experimental_noalias_scope_decl: 4828 case Intrinsic::assume: 4829 case Intrinsic::lifetime_start: 4830 case Intrinsic::lifetime_end: 4831 if (TheLoop->hasLoopInvariantOperands(&I)) 4832 addToWorklistIfAllowed(&I); 4833 break; 4834 default: 4835 break; 4836 } 4837 } 4838 4839 // ExtractValue instructions must be uniform, because the operands are 4840 // known to be loop-invariant. 4841 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 4842 assert(isOutOfScope(EVI->getAggregateOperand()) && 4843 "Expected aggregate value to be loop invariant"); 4844 addToWorklistIfAllowed(EVI); 4845 continue; 4846 } 4847 4848 // If there's no pointer operand, there's nothing to do. 4849 auto *Ptr = getLoadStorePointerOperand(&I); 4850 if (!Ptr) 4851 continue; 4852 4853 // A uniform memory op is itself uniform. We exclude uniform stores 4854 // here as they demand the last lane, not the first one. 4855 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 4856 addToWorklistIfAllowed(&I); 4857 4858 if (isUniformDecision(&I, VF)) { 4859 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 4860 HasUniformUse.insert(Ptr); 4861 } 4862 } 4863 4864 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 4865 // demanding) users. Since loops are assumed to be in LCSSA form, this 4866 // disallows uses outside the loop as well. 4867 for (auto *V : HasUniformUse) { 4868 if (isOutOfScope(V)) 4869 continue; 4870 auto *I = cast<Instruction>(V); 4871 auto UsersAreMemAccesses = 4872 llvm::all_of(I->users(), [&](User *U) -> bool { 4873 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 4874 }); 4875 if (UsersAreMemAccesses) 4876 addToWorklistIfAllowed(I); 4877 } 4878 4879 // Expand Worklist in topological order: whenever a new instruction 4880 // is added , its users should be already inside Worklist. It ensures 4881 // a uniform instruction will only be used by uniform instructions. 4882 unsigned idx = 0; 4883 while (idx != Worklist.size()) { 4884 Instruction *I = Worklist[idx++]; 4885 4886 for (auto OV : I->operand_values()) { 4887 // isOutOfScope operands cannot be uniform instructions. 4888 if (isOutOfScope(OV)) 4889 continue; 4890 // First order recurrence Phi's should typically be considered 4891 // non-uniform. 4892 auto *OP = dyn_cast<PHINode>(OV); 4893 if (OP && Legal->isFirstOrderRecurrence(OP)) 4894 continue; 4895 // If all the users of the operand are uniform, then add the 4896 // operand into the uniform worklist. 
4897 auto *OI = cast<Instruction>(OV); 4898 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4899 auto *J = cast<Instruction>(U); 4900 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 4901 })) 4902 addToWorklistIfAllowed(OI); 4903 } 4904 } 4905 4906 // For an instruction to be added into Worklist above, all its users inside 4907 // the loop should also be in Worklist. However, this condition cannot be 4908 // true for phi nodes that form a cyclic dependence. We must process phi 4909 // nodes separately. An induction variable will remain uniform if all users 4910 // of the induction variable and induction variable update remain uniform. 4911 // The code below handles both pointer and non-pointer induction variables. 4912 for (auto &Induction : Legal->getInductionVars()) { 4913 auto *Ind = Induction.first; 4914 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4915 4916 // Determine if all users of the induction variable are uniform after 4917 // vectorization. 4918 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4919 auto *I = cast<Instruction>(U); 4920 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4921 isVectorizedMemAccessUse(I, Ind); 4922 }); 4923 if (!UniformInd) 4924 continue; 4925 4926 // Determine if all users of the induction variable update instruction are 4927 // uniform after vectorization. 4928 auto UniformIndUpdate = 4929 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4930 auto *I = cast<Instruction>(U); 4931 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4932 isVectorizedMemAccessUse(I, IndUpdate); 4933 }); 4934 if (!UniformIndUpdate) 4935 continue; 4936 4937 // The induction variable and its update instruction will remain uniform. 4938 addToWorklistIfAllowed(Ind); 4939 addToWorklistIfAllowed(IndUpdate); 4940 } 4941 4942 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4943 } 4944 4945 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4946 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4947 4948 if (Legal->getRuntimePointerChecking()->Need) { 4949 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4950 "runtime pointer checks needed. Enable vectorization of this " 4951 "loop with '#pragma clang loop vectorize(enable)' when " 4952 "compiling with -Os/-Oz", 4953 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4954 return true; 4955 } 4956 4957 if (!PSE.getPredicate().isAlwaysTrue()) { 4958 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4959 "runtime SCEV checks needed. Enable vectorization of this " 4960 "loop with '#pragma clang loop vectorize(enable)' when " 4961 "compiling with -Os/-Oz", 4962 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4963 return true; 4964 } 4965 4966 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4967 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4968 reportVectorizationFailure("Runtime stride check for small trip count", 4969 "runtime stride == 1 checks needed. 
Enable vectorization of " 4970 "this loop without such check by compiling with -Os/-Oz", 4971 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4972 return true; 4973 } 4974 4975 return false; 4976 } 4977 4978 ElementCount 4979 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 4980 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 4981 return ElementCount::getScalable(0); 4982 4983 if (Hints->isScalableVectorizationDisabled()) { 4984 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 4985 "ScalableVectorizationDisabled", ORE, TheLoop); 4986 return ElementCount::getScalable(0); 4987 } 4988 4989 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 4990 4991 auto MaxScalableVF = ElementCount::getScalable( 4992 std::numeric_limits<ElementCount::ScalarTy>::max()); 4993 4994 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 4995 // FIXME: While for scalable vectors this is currently sufficient, this should 4996 // be replaced by a more detailed mechanism that filters out specific VFs, 4997 // instead of invalidating vectorization for a whole set of VFs based on the 4998 // MaxVF. 4999 5000 // Disable scalable vectorization if the loop contains unsupported reductions. 5001 if (!canVectorizeReductions(MaxScalableVF)) { 5002 reportVectorizationInfo( 5003 "Scalable vectorization not supported for the reduction " 5004 "operations found in this loop.", 5005 "ScalableVFUnfeasible", ORE, TheLoop); 5006 return ElementCount::getScalable(0); 5007 } 5008 5009 // Disable scalable vectorization if the loop contains any instructions 5010 // with element types not supported for scalable vectors. 5011 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5012 return !Ty->isVoidTy() && 5013 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5014 })) { 5015 reportVectorizationInfo("Scalable vectorization is not supported " 5016 "for all element types found in this loop.", 5017 "ScalableVFUnfeasible", ORE, TheLoop); 5018 return ElementCount::getScalable(0); 5019 } 5020 5021 if (Legal->isSafeForAnyVectorWidth()) 5022 return MaxScalableVF; 5023 5024 // Limit MaxScalableVF by the maximum safe dependence distance. 5025 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5026 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 5027 MaxVScale = 5028 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 5029 MaxScalableVF = ElementCount::getScalable( 5030 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5031 if (!MaxScalableVF) 5032 reportVectorizationInfo( 5033 "Max legal vector width too small, scalable vectorization " 5034 "unfeasible.", 5035 "ScalableVFUnfeasible", ORE, TheLoop); 5036 5037 return MaxScalableVF; 5038 } 5039 5040 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 5041 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 5042 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5043 unsigned SmallestType, WidestType; 5044 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5045 5046 // Get the maximum safe dependence distance in bits computed by LAA. 5047 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5048 // the memory accesses that is most restrictive (involved in the smallest 5049 // dependence distance). 
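  // For example (numbers purely illustrative): with a maximum safe vector
  // width of 128 bits and a widest type of 32 bits,
  //   MaxSafeElements = PowerOf2Floor(128 / 32) = 4,
  // so MaxSafeFixedVF becomes 4 and getMaxLegalScalableVF is queried with a
  // budget of 4 elements.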
5050 unsigned MaxSafeElements = 5051 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5052 5053 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5054 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5055 5056 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5057 << ".\n"); 5058 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5059 << ".\n"); 5060 5061 // First analyze the UserVF, fall back if the UserVF should be ignored. 5062 if (UserVF) { 5063 auto MaxSafeUserVF = 5064 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5065 5066 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5067 // If `VF=vscale x N` is safe, then so is `VF=N` 5068 if (UserVF.isScalable()) 5069 return FixedScalableVFPair( 5070 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5071 else 5072 return UserVF; 5073 } 5074 5075 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5076 5077 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5078 // is better to ignore the hint and let the compiler choose a suitable VF. 5079 if (!UserVF.isScalable()) { 5080 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5081 << " is unsafe, clamping to max safe VF=" 5082 << MaxSafeFixedVF << ".\n"); 5083 ORE->emit([&]() { 5084 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5085 TheLoop->getStartLoc(), 5086 TheLoop->getHeader()) 5087 << "User-specified vectorization factor " 5088 << ore::NV("UserVectorizationFactor", UserVF) 5089 << " is unsafe, clamping to maximum safe vectorization factor " 5090 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5091 }); 5092 return MaxSafeFixedVF; 5093 } 5094 5095 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5096 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5097 << " is ignored because scalable vectors are not " 5098 "available.\n"); 5099 ORE->emit([&]() { 5100 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5101 TheLoop->getStartLoc(), 5102 TheLoop->getHeader()) 5103 << "User-specified vectorization factor " 5104 << ore::NV("UserVectorizationFactor", UserVF) 5105 << " is ignored because the target does not support scalable " 5106 "vectors. The compiler will pick a more suitable value."; 5107 }); 5108 } else { 5109 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5110 << " is unsafe. Ignoring scalable UserVF.\n"); 5111 ORE->emit([&]() { 5112 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5113 TheLoop->getStartLoc(), 5114 TheLoop->getHeader()) 5115 << "User-specified vectorization factor " 5116 << ore::NV("UserVectorizationFactor", UserVF) 5117 << " is unsafe. 
Ignoring the hint to let the compiler pick a " 5118 "more suitable value."; 5119 }); 5120 } 5121 } 5122 5123 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5124 << " / " << WidestType << " bits.\n"); 5125 5126 FixedScalableVFPair Result(ElementCount::getFixed(1), 5127 ElementCount::getScalable(0)); 5128 if (auto MaxVF = 5129 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5130 MaxSafeFixedVF, FoldTailByMasking)) 5131 Result.FixedVF = MaxVF; 5132 5133 if (auto MaxVF = 5134 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5135 MaxSafeScalableVF, FoldTailByMasking)) 5136 if (MaxVF.isScalable()) { 5137 Result.ScalableVF = MaxVF; 5138 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5139 << "\n"); 5140 } 5141 5142 return Result; 5143 } 5144 5145 FixedScalableVFPair 5146 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5147 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5148 // TODO: It may by useful to do since it's still likely to be dynamically 5149 // uniform if the target can skip. 5150 reportVectorizationFailure( 5151 "Not inserting runtime ptr check for divergent target", 5152 "runtime pointer checks needed. Not enabled for divergent target", 5153 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5154 return FixedScalableVFPair::getNone(); 5155 } 5156 5157 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5158 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5159 if (TC == 1) { 5160 reportVectorizationFailure("Single iteration (non) loop", 5161 "loop trip count is one, irrelevant for vectorization", 5162 "SingleIterationLoop", ORE, TheLoop); 5163 return FixedScalableVFPair::getNone(); 5164 } 5165 5166 switch (ScalarEpilogueStatus) { 5167 case CM_ScalarEpilogueAllowed: 5168 return computeFeasibleMaxVF(TC, UserVF, false); 5169 case CM_ScalarEpilogueNotAllowedUsePredicate: 5170 LLVM_FALLTHROUGH; 5171 case CM_ScalarEpilogueNotNeededUsePredicate: 5172 LLVM_DEBUG( 5173 dbgs() << "LV: vector predicate hint/switch found.\n" 5174 << "LV: Not allowing scalar epilogue, creating predicated " 5175 << "vector loop.\n"); 5176 break; 5177 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5178 // fallthrough as a special case of OptForSize 5179 case CM_ScalarEpilogueNotAllowedOptSize: 5180 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5181 LLVM_DEBUG( 5182 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5183 else 5184 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5185 << "count.\n"); 5186 5187 // Bail if runtime checks are required, which are not good when optimising 5188 // for size. 5189 if (runtimeChecksRequired()) 5190 return FixedScalableVFPair::getNone(); 5191 5192 break; 5193 } 5194 5195 // The only loops we can vectorize without a scalar epilogue, are loops with 5196 // a bottom-test and a single exiting block. We'd have to handle the fact 5197 // that not every instruction executes on the last iteration. This will 5198 // require a lane mask which varies through the vector loop body. (TODO) 5199 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5200 // If there was a tail-folding hint/switch, but we can't fold the tail by 5201 // masking, fallback to a vectorization with a scalar epilogue. 
5202     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5203       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5204                            "scalar epilogue instead.\n");
5205       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5206       return computeFeasibleMaxVF(TC, UserVF, false);
5207     }
5208     return FixedScalableVFPair::getNone();
5209   }
5210
5211   // Now try the tail folding.
5212
5213   // Invalidate interleave groups that require an epilogue if we can't mask
5214   // the interleave-group.
5215   if (!useMaskedInterleavedAccesses(TTI)) {
5216     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5217            "No decisions should have been taken at this point");
5218     // Note: There is no need to invalidate any cost modeling decisions here,
5219     // as none were taken so far.
5220     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5221   }
5222
5223   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5224   // Avoid tail folding if the trip count is known to be a multiple of any VF
5225   // we chose.
5226   // FIXME: The condition below pessimises the case for fixed-width vectors,
5227   // when scalable VFs are also candidates for vectorization.
5228   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5229     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5230     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5231            "MaxFixedVF must be a power of 2");
5232     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5233                                    : MaxFixedVF.getFixedValue();
5234     ScalarEvolution *SE = PSE.getSE();
5235     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5236     const SCEV *ExitCount = SE->getAddExpr(
5237         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5238     const SCEV *Rem = SE->getURemExpr(
5239         SE->applyLoopGuards(ExitCount, TheLoop),
5240         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5241     if (Rem->isZero()) {
5242       // Accept MaxFixedVF if we do not have a tail.
5243       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5244       return MaxFactors;
5245     }
5246   }
5247
5248   // For scalable vectors, don't use tail folding for low trip counts or when
5249   // optimizing for code size. We only permit this if the user has explicitly
5250   // requested it.
5251   if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate &&
5252       ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate &&
5253       MaxFactors.ScalableVF.isVector())
5254     MaxFactors.ScalableVF = ElementCount::getScalable(0);
5255
5256   // If we don't know the precise trip count, or if the trip count that we
5257   // found modulo the vectorization factor is not zero, try to fold the tail
5258   // by masking.
5259   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5260   if (Legal->prepareToFoldTailByMasking()) {
5261     FoldTailByMasking = true;
5262     return MaxFactors;
5263   }
5264
5265   // If there was a tail-folding hint/switch, but we can't fold the tail by
5266   // masking, fall back to a vectorization with a scalar epilogue.
5267 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5268 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5269 "scalar epilogue instead.\n"); 5270 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5271 return MaxFactors; 5272 } 5273 5274 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5275 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5276 return FixedScalableVFPair::getNone(); 5277 } 5278 5279 if (TC == 0) { 5280 reportVectorizationFailure( 5281 "Unable to calculate the loop count due to complex control flow", 5282 "unable to calculate the loop count due to complex control flow", 5283 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5284 return FixedScalableVFPair::getNone(); 5285 } 5286 5287 reportVectorizationFailure( 5288 "Cannot optimize for size and vectorize at the same time.", 5289 "cannot optimize for size and vectorize at the same time. " 5290 "Enable vectorization of this loop with '#pragma clang loop " 5291 "vectorize(enable)' when compiling with -Os/-Oz", 5292 "NoTailLoopWithOptForSize", ORE, TheLoop); 5293 return FixedScalableVFPair::getNone(); 5294 } 5295 5296 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5297 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5298 const ElementCount &MaxSafeVF, bool FoldTailByMasking) { 5299 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5300 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5301 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5302 : TargetTransformInfo::RGK_FixedWidthVector); 5303 5304 // Convenience function to return the minimum of two ElementCounts. 5305 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5306 assert((LHS.isScalable() == RHS.isScalable()) && 5307 "Scalable flags must match"); 5308 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5309 }; 5310 5311 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5312 // Note that both WidestRegister and WidestType may not be a powers of 2. 5313 auto MaxVectorElementCount = ElementCount::get( 5314 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5315 ComputeScalableMaxVF); 5316 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5317 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5318 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5319 5320 if (!MaxVectorElementCount) { 5321 LLVM_DEBUG(dbgs() << "LV: The target has no " 5322 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5323 << " vector registers.\n"); 5324 return ElementCount::getFixed(1); 5325 } 5326 5327 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5328 if (ConstTripCount && 5329 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5330 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5331 // If loop trip count (TC) is known at compile time there is no point in 5332 // choosing VF greater than TC (as done in the loop below). Select maximum 5333 // power of two which doesn't exceed TC. 5334 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5335 // when the TC is less than or equal to the known number of lanes. 
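    // For example (illustrative): with a constant trip count of 6, no tail
    // folding and MaxVectorElementCount = 8, the clamp below yields
    // PowerOf2Floor(6) = 4, i.e. one VF=4 vector iteration plus a
    // two-iteration scalar epilogue instead of a VF=8 loop that never
    // executes.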
5336 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5337 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5338 "exceeding the constant trip count: " 5339 << ClampedConstTripCount << "\n"); 5340 return ElementCount::getFixed(ClampedConstTripCount); 5341 } 5342 5343 ElementCount MaxVF = MaxVectorElementCount; 5344 if (TTI.shouldMaximizeVectorBandwidth() || 5345 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5346 auto MaxVectorElementCountMaxBW = ElementCount::get( 5347 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5348 ComputeScalableMaxVF); 5349 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5350 5351 // Collect all viable vectorization factors larger than the default MaxVF 5352 // (i.e. MaxVectorElementCount). 5353 SmallVector<ElementCount, 8> VFs; 5354 for (ElementCount VS = MaxVectorElementCount * 2; 5355 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5356 VFs.push_back(VS); 5357 5358 // For each VF calculate its register usage. 5359 auto RUs = calculateRegisterUsage(VFs); 5360 5361 // Select the largest VF which doesn't require more registers than existing 5362 // ones. 5363 for (int i = RUs.size() - 1; i >= 0; --i) { 5364 bool Selected = true; 5365 for (auto &pair : RUs[i].MaxLocalUsers) { 5366 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5367 if (pair.second > TargetNumRegisters) 5368 Selected = false; 5369 } 5370 if (Selected) { 5371 MaxVF = VFs[i]; 5372 break; 5373 } 5374 } 5375 if (ElementCount MinVF = 5376 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5377 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5378 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5379 << ") with target's minimum: " << MinVF << '\n'); 5380 MaxVF = MinVF; 5381 } 5382 } 5383 } 5384 return MaxVF; 5385 } 5386 5387 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { 5388 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5389 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5390 auto Min = Attr.getVScaleRangeMin(); 5391 auto Max = Attr.getVScaleRangeMax(); 5392 if (Max && Min == Max) 5393 return Max; 5394 } 5395 5396 return TTI.getVScaleForTuning(); 5397 } 5398 5399 bool LoopVectorizationCostModel::isMoreProfitable( 5400 const VectorizationFactor &A, const VectorizationFactor &B) const { 5401 InstructionCost CostA = A.Cost; 5402 InstructionCost CostB = B.Cost; 5403 5404 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5405 5406 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5407 MaxTripCount) { 5408 // If we are folding the tail and the trip count is a known (possibly small) 5409 // constant, the trip count will be rounded up to an integer number of 5410 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5411 // which we compare directly. When not folding the tail, the total cost will 5412 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5413 // approximated with the per-lane cost below instead of using the tripcount 5414 // as here. 5415 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5416 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5417 return RTCostA < RTCostB; 5418 } 5419 5420 // Improve estimate for the vector width if it is scalable. 
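  // For example (illustrative): if getVScaleForTuning() returns 2, a
  // candidate of VF = vscale x 4 is treated as 8 lanes per iteration, so
  // against a fixed VF of 8 the comparison below becomes
  // CostA * 8 <= CostB * 8, with the tie going to the scalable candidate.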
5421 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5422 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5423 if (Optional<unsigned> VScale = getVScaleForTuning()) { 5424 if (A.Width.isScalable()) 5425 EstimatedWidthA *= VScale.getValue(); 5426 if (B.Width.isScalable()) 5427 EstimatedWidthB *= VScale.getValue(); 5428 } 5429 5430 // Assume vscale may be larger than 1 (or the value being tuned for), 5431 // so that scalable vectorization is slightly favorable over fixed-width 5432 // vectorization. 5433 if (A.Width.isScalable() && !B.Width.isScalable()) 5434 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5435 5436 // To avoid the need for FP division: 5437 // (CostA / A.Width) < (CostB / B.Width) 5438 // <=> (CostA * B.Width) < (CostB * A.Width) 5439 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5440 } 5441 5442 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5443 const ElementCountSet &VFCandidates) { 5444 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5445 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5446 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5447 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5448 "Expected Scalar VF to be a candidate"); 5449 5450 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 5451 VectorizationFactor ChosenFactor = ScalarCost; 5452 5453 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5454 if (ForceVectorization && VFCandidates.size() > 1) { 5455 // Ignore scalar width, because the user explicitly wants vectorization. 5456 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5457 // evaluation. 5458 ChosenFactor.Cost = InstructionCost::getMax(); 5459 } 5460 5461 SmallVector<InstructionVFPair> InvalidCosts; 5462 for (const auto &i : VFCandidates) { 5463 // The cost for scalar VF=1 is already calculated, so ignore it. 5464 if (i.isScalar()) 5465 continue; 5466 5467 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5468 VectorizationFactor Candidate(i, C.first); 5469 5470 #ifndef NDEBUG 5471 unsigned AssumedMinimumVscale = 1; 5472 if (Optional<unsigned> VScale = getVScaleForTuning()) 5473 AssumedMinimumVscale = VScale.getValue(); 5474 unsigned Width = 5475 Candidate.Width.isScalable() 5476 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5477 : Candidate.Width.getFixedValue(); 5478 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5479 << " costs: " << (Candidate.Cost / Width)); 5480 if (i.isScalable()) 5481 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5482 << AssumedMinimumVscale << ")"); 5483 LLVM_DEBUG(dbgs() << ".\n"); 5484 #endif 5485 5486 if (!C.second && !ForceVectorization) { 5487 LLVM_DEBUG( 5488 dbgs() << "LV: Not considering vector loop of width " << i 5489 << " because it will not generate any vector instructions.\n"); 5490 continue; 5491 } 5492 5493 // If profitable add it to ProfitableVF list. 5494 if (isMoreProfitable(Candidate, ScalarCost)) 5495 ProfitableVFs.push_back(Candidate); 5496 5497 if (isMoreProfitable(Candidate, ChosenFactor)) 5498 ChosenFactor = Candidate; 5499 } 5500 5501 // Emit a report of VFs with invalid costs in the loop. 5502 if (!InvalidCosts.empty()) { 5503 // Group the remarks per instruction, keeping the instruction order from 5504 // InvalidCosts. 
5505 std::map<Instruction *, unsigned> Numbering; 5506 unsigned I = 0; 5507 for (auto &Pair : InvalidCosts) 5508 if (!Numbering.count(Pair.first)) 5509 Numbering[Pair.first] = I++; 5510 5511 // Sort the list, first on instruction(number) then on VF. 5512 llvm::sort(InvalidCosts, 5513 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5514 if (Numbering[A.first] != Numbering[B.first]) 5515 return Numbering[A.first] < Numbering[B.first]; 5516 ElementCountComparator ECC; 5517 return ECC(A.second, B.second); 5518 }); 5519 5520 // For a list of ordered instruction-vf pairs: 5521 // [(load, vf1), (load, vf2), (store, vf1)] 5522 // Group the instructions together to emit separate remarks for: 5523 // load (vf1, vf2) 5524 // store (vf1) 5525 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5526 auto Subset = ArrayRef<InstructionVFPair>(); 5527 do { 5528 if (Subset.empty()) 5529 Subset = Tail.take_front(1); 5530 5531 Instruction *I = Subset.front().first; 5532 5533 // If the next instruction is different, or if there are no other pairs, 5534 // emit a remark for the collated subset. e.g. 5535 // [(load, vf1), (load, vf2))] 5536 // to emit: 5537 // remark: invalid costs for 'load' at VF=(vf, vf2) 5538 if (Subset == Tail || Tail[Subset.size()].first != I) { 5539 std::string OutString; 5540 raw_string_ostream OS(OutString); 5541 assert(!Subset.empty() && "Unexpected empty range"); 5542 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5543 for (auto &Pair : Subset) 5544 OS << (Pair.second == Subset.front().second ? "" : ", ") 5545 << Pair.second; 5546 OS << "):"; 5547 if (auto *CI = dyn_cast<CallInst>(I)) 5548 OS << " call to " << CI->getCalledFunction()->getName(); 5549 else 5550 OS << " " << I->getOpcodeName(); 5551 OS.flush(); 5552 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5553 Tail = Tail.drop_front(Subset.size()); 5554 Subset = {}; 5555 } else 5556 // Grow the subset by one element 5557 Subset = Tail.take_front(Subset.size() + 1); 5558 } while (!Tail.empty()); 5559 } 5560 5561 if (!EnableCondStoresVectorization && NumPredStores) { 5562 reportVectorizationFailure("There are conditional stores.", 5563 "store that is conditionally executed prevents vectorization", 5564 "ConditionalStore", ORE, TheLoop); 5565 ChosenFactor = ScalarCost; 5566 } 5567 5568 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5569 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 5570 << "LV: Vectorization seems to be not beneficial, " 5571 << "but was forced by a user.\n"); 5572 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5573 return ChosenFactor; 5574 } 5575 5576 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5577 const Loop &L, ElementCount VF) const { 5578 // Cross iteration phis such as reductions need special handling and are 5579 // currently unsupported. 5580 if (any_of(L.getHeader()->phis(), 5581 [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); })) 5582 return false; 5583 5584 // Phis with uses outside of the loop require special handling and are 5585 // currently unsupported. 5586 for (auto &Entry : Legal->getInductionVars()) { 5587 // Look for uses of the value of the induction at the last iteration. 5588 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5589 for (User *U : PostInc->users()) 5590 if (!L.contains(cast<Instruction>(U))) 5591 return false; 5592 // Look for uses of penultimate value of the induction. 
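    // (The loop below checks users of the induction phi itself, as opposed to
    // the post-increment value handled above.)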
    for (User *U : Entry.first->users())
      if (!L.contains(cast<Instruction>(U)))
        return false;
  }

  // Induction variables that are widened require special handling that is
  // currently not supported.
  if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
        return !(this->isScalarAfterVectorization(Entry.first, VF) ||
                 this->isProfitableToScalarize(Entry.first, VF));
      }))
    return false;

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
  if (L.getExitingBlock() != L.getLoopLatch())
    return false;

  return true;
}

bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    const ElementCount VF) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.
  // We also consider epilogue vectorization unprofitable for targets that
  // don't consider interleaving beneficial (e.g. MVE).
  if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
    return false;
  // FIXME: We should consider changing the threshold for scalable
  // vectors to take VScaleForTuning into account.
  if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
    return true;
  return false;
}

VectorizationFactor
LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
    return Result;
  }

  if (!isScalarEpilogueAllowed()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
                  "allowed.\n";);
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
                  "not a supported candidate.\n";);
    return Result;
  }

  if (EpilogueVectorizationForceVF > 1) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
    ElementCount ForcedEC =
        ElementCount::getFixed(EpilogueVectorizationForceVF);
    if (LVP.hasPlanWithVF(ForcedEC))
      return {ForcedEC, 0};
    else {
      LLVM_DEBUG(
          dbgs()
          << "LEV: Epilogue vectorization forced factor is not viable.\n";);
      return Result;
    }
  }

  if (TheLoop->getHeader()->getParent()->hasOptSize() ||
      TheLoop->getHeader()->getParent()->hasMinSize()) {
    LLVM_DEBUG(
        dbgs()
        << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
    return Result;
  }

  if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
                         "this loop\n");
    return Result;
  }

  // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
  // the main loop handles 8 lanes per iteration.
We could still benefit from 5686 // vectorizing the epilogue loop with VF=4. 5687 ElementCount EstimatedRuntimeVF = MainLoopVF; 5688 if (MainLoopVF.isScalable()) { 5689 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5690 if (Optional<unsigned> VScale = getVScaleForTuning()) 5691 EstimatedRuntimeVF *= VScale.getValue(); 5692 } 5693 5694 for (auto &NextVF : ProfitableVFs) 5695 if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5696 ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || 5697 ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && 5698 (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && 5699 LVP.hasPlanWithVF(NextVF.Width)) 5700 Result = NextVF; 5701 5702 if (Result != VectorizationFactor::Disabled()) 5703 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5704 << Result.Width << "\n";); 5705 return Result; 5706 } 5707 5708 std::pair<unsigned, unsigned> 5709 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5710 unsigned MinWidth = -1U; 5711 unsigned MaxWidth = 8; 5712 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5713 // For in-loop reductions, no element types are added to ElementTypesInLoop 5714 // if there are no loads/stores in the loop. In this case, check through the 5715 // reduction variables to determine the maximum width. 5716 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5717 // Reset MaxWidth so that we can find the smallest type used by recurrences 5718 // in the loop. 5719 MaxWidth = -1U; 5720 for (auto &PhiDescriptorPair : Legal->getReductionVars()) { 5721 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5722 // When finding the min width used by the recurrence we need to account 5723 // for casts on the input operands of the recurrence. 5724 MaxWidth = std::min<unsigned>( 5725 MaxWidth, std::min<unsigned>( 5726 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5727 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5728 } 5729 } else { 5730 for (Type *T : ElementTypesInLoop) { 5731 MinWidth = std::min<unsigned>( 5732 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5733 MaxWidth = std::max<unsigned>( 5734 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5735 } 5736 } 5737 return {MinWidth, MaxWidth}; 5738 } 5739 5740 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5741 ElementTypesInLoop.clear(); 5742 // For each block. 5743 for (BasicBlock *BB : TheLoop->blocks()) { 5744 // For each instruction in the loop. 5745 for (Instruction &I : BB->instructionsWithoutDebug()) { 5746 Type *T = I.getType(); 5747 5748 // Skip ignored values. 5749 if (ValuesToIgnore.count(&I)) 5750 continue; 5751 5752 // Only examine Loads, Stores and PHINodes. 5753 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5754 continue; 5755 5756 // Examine PHI nodes that are reduction variables. Update the type to 5757 // account for the recurrence type. 5758 if (auto *PN = dyn_cast<PHINode>(&I)) { 5759 if (!Legal->isReductionVariable(PN)) 5760 continue; 5761 const RecurrenceDescriptor &RdxDesc = 5762 Legal->getReductionVars().find(PN)->second; 5763 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5764 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5765 RdxDesc.getRecurrenceType(), 5766 TargetTransformInfo::ReductionFlags())) 5767 continue; 5768 T = RdxDesc.getRecurrenceType(); 5769 } 5770 5771 // Examine the stored values. 
5772 if (auto *ST = dyn_cast<StoreInst>(&I)) 5773 T = ST->getValueOperand()->getType(); 5774 5775 assert(T->isSized() && 5776 "Expected the load/store/recurrence type to be sized"); 5777 5778 ElementTypesInLoop.insert(T); 5779 } 5780 } 5781 } 5782 5783 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5784 unsigned LoopCost) { 5785 // -- The interleave heuristics -- 5786 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5787 // There are many micro-architectural considerations that we can't predict 5788 // at this level. For example, frontend pressure (on decode or fetch) due to 5789 // code size, or the number and capabilities of the execution ports. 5790 // 5791 // We use the following heuristics to select the interleave count: 5792 // 1. If the code has reductions, then we interleave to break the cross 5793 // iteration dependency. 5794 // 2. If the loop is really small, then we interleave to reduce the loop 5795 // overhead. 5796 // 3. We don't interleave if we think that we will spill registers to memory 5797 // due to the increased register pressure. 5798 5799 if (!isScalarEpilogueAllowed()) 5800 return 1; 5801 5802 // We used the distance for the interleave count. 5803 if (Legal->getMaxSafeDepDistBytes() != -1U) 5804 return 1; 5805 5806 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5807 const bool HasReductions = !Legal->getReductionVars().empty(); 5808 // Do not interleave loops with a relatively small known or estimated trip 5809 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5810 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5811 // because with the above conditions interleaving can expose ILP and break 5812 // cross iteration dependences for reductions. 5813 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5814 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5815 return 1; 5816 5817 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5818 // We divide by these constants so assume that we have at least one 5819 // instruction that uses at least one register. 5820 for (auto& pair : R.MaxLocalUsers) { 5821 pair.second = std::max(pair.second, 1U); 5822 } 5823 5824 // We calculate the interleave count using the following formula. 5825 // Subtract the number of loop invariants from the number of available 5826 // registers. These registers are used by all of the interleaved instances. 5827 // Next, divide the remaining registers by the number of registers that is 5828 // required by the loop, in order to estimate how many parallel instances 5829 // fit without causing spills. All of this is rounded down if necessary to be 5830 // a power of two. We want power of two interleave count to simplify any 5831 // addressing operations or alignment considerations. 5832 // We also want power of two interleave counts to ensure that the induction 5833 // variable of the vector loop wraps to zero, when tail is folded by masking; 5834 // this currently happens when OptForSize, in which case IC is set to 1 above. 
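  // Illustrative example (not tied to any particular target): with 32
  // registers in a class, 2 of them holding loop-invariant values and a
  // maximum of 7 registers live at once, (32 - 2) / 7 = 4, which is already a
  // power of two, so at most 4 interleaved copies are assumed to fit without
  // spilling.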
5835 unsigned IC = UINT_MAX; 5836 5837 for (auto& pair : R.MaxLocalUsers) { 5838 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5839 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5840 << " registers of " 5841 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5842 if (VF.isScalar()) { 5843 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5844 TargetNumRegisters = ForceTargetNumScalarRegs; 5845 } else { 5846 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5847 TargetNumRegisters = ForceTargetNumVectorRegs; 5848 } 5849 unsigned MaxLocalUsers = pair.second; 5850 unsigned LoopInvariantRegs = 0; 5851 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5852 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5853 5854 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5855 // Don't count the induction variable as interleaved. 5856 if (EnableIndVarRegisterHeur) { 5857 TmpIC = 5858 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5859 std::max(1U, (MaxLocalUsers - 1))); 5860 } 5861 5862 IC = std::min(IC, TmpIC); 5863 } 5864 5865 // Clamp the interleave ranges to reasonable counts. 5866 unsigned MaxInterleaveCount = 5867 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 5868 5869 // Check if the user has overridden the max. 5870 if (VF.isScalar()) { 5871 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5872 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5873 } else { 5874 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5875 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5876 } 5877 5878 // If trip count is known or estimated compile time constant, limit the 5879 // interleave count to be less than the trip count divided by VF, provided it 5880 // is at least 1. 5881 // 5882 // For scalable vectors we can't know if interleaving is beneficial. It may 5883 // not be beneficial for small loops if none of the lanes in the second vector 5884 // iterations is enabled. However, for larger loops, there is likely to be a 5885 // similar benefit as for fixed-width vectors. For now, we choose to leave 5886 // the InterleaveCount as if vscale is '1', although if some information about 5887 // the vector is known (e.g. min vector size), we can make a better decision. 5888 if (BestKnownTC) { 5889 MaxInterleaveCount = 5890 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5891 // Make sure MaxInterleaveCount is greater than 0. 5892 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 5893 } 5894 5895 assert(MaxInterleaveCount > 0 && 5896 "Maximum interleave count must be greater than 0"); 5897 5898 // Clamp the calculated IC to be between the 1 and the max interleave count 5899 // that the target and trip count allows. 5900 if (IC > MaxInterleaveCount) 5901 IC = MaxInterleaveCount; 5902 else 5903 // Make sure IC is greater than 0. 5904 IC = std::max(1u, IC); 5905 5906 assert(IC > 0 && "Interleave count must be greater than 0."); 5907 5908 // If we did not calculate the cost for VF (because the user selected the VF) 5909 // then we calculate the cost of VF here. 
5910 if (LoopCost == 0) { 5911 InstructionCost C = expectedCost(VF).first; 5912 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 5913 LoopCost = *C.getValue(); 5914 } 5915 5916 assert(LoopCost && "Non-zero loop cost expected"); 5917 5918 // Interleave if we vectorized this loop and there is a reduction that could 5919 // benefit from interleaving. 5920 if (VF.isVector() && HasReductions) { 5921 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5922 return IC; 5923 } 5924 5925 // For any scalar loop that either requires runtime checks or predication we 5926 // are better off leaving this to the unroller. Note that if we've already 5927 // vectorized the loop we will have done the runtime check and so interleaving 5928 // won't require further checks. 5929 bool ScalarInterleavingRequiresPredication = 5930 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5931 return Legal->blockNeedsPredication(BB); 5932 })); 5933 bool ScalarInterleavingRequiresRuntimePointerCheck = 5934 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5935 5936 // We want to interleave small loops in order to reduce the loop overhead and 5937 // potentially expose ILP opportunities. 5938 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5939 << "LV: IC is " << IC << '\n' 5940 << "LV: VF is " << VF << '\n'); 5941 const bool AggressivelyInterleaveReductions = 5942 TTI.enableAggressiveInterleaving(HasReductions); 5943 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5944 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5945 // We assume that the cost overhead is 1 and we use the cost model 5946 // to estimate the cost of the loop and interleave until the cost of the 5947 // loop overhead is about 5% of the cost of the loop. 5948 unsigned SmallIC = 5949 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5950 5951 // Interleave until store/load ports (estimated by max interleave count) are 5952 // saturated. 5953 unsigned NumStores = Legal->getNumStores(); 5954 unsigned NumLoads = Legal->getNumLoads(); 5955 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5956 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5957 5958 // There is little point in interleaving for reductions containing selects 5959 // and compares when VF=1 since it may just create more overhead than it's 5960 // worth for loops with small trip counts. This is because we still have to 5961 // do the final reduction after the loop. 5962 bool HasSelectCmpReductions = 5963 HasReductions && 5964 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5965 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5966 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 5967 RdxDesc.getRecurrenceKind()); 5968 }); 5969 if (HasSelectCmpReductions) { 5970 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 5971 return 1; 5972 } 5973 5974 // If we have a scalar reduction (vector reductions are already dealt with 5975 // by this point), we can increase the critical path length if the loop 5976 // we're interleaving is inside another loop. For tree-wise reductions 5977 // set the limit to 2, and for ordered reductions it's best to disable 5978 // interleaving entirely. 
5979 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5980 bool HasOrderedReductions = 5981 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5982 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5983 return RdxDesc.isOrdered(); 5984 }); 5985 if (HasOrderedReductions) { 5986 LLVM_DEBUG( 5987 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 5988 return 1; 5989 } 5990 5991 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5992 SmallIC = std::min(SmallIC, F); 5993 StoresIC = std::min(StoresIC, F); 5994 LoadsIC = std::min(LoadsIC, F); 5995 } 5996 5997 if (EnableLoadStoreRuntimeInterleave && 5998 std::max(StoresIC, LoadsIC) > SmallIC) { 5999 LLVM_DEBUG( 6000 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6001 return std::max(StoresIC, LoadsIC); 6002 } 6003 6004 // If there are scalar reductions and TTI has enabled aggressive 6005 // interleaving for reductions, we will interleave to expose ILP. 6006 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6007 AggressivelyInterleaveReductions) { 6008 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6009 // Interleave no less than SmallIC but not as aggressive as the normal IC 6010 // to satisfy the rare situation when resources are too limited. 6011 return std::max(IC / 2, SmallIC); 6012 } else { 6013 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6014 return SmallIC; 6015 } 6016 } 6017 6018 // Interleave if this is a large loop (small loops are already dealt with by 6019 // this point) that could benefit from interleaving. 6020 if (AggressivelyInterleaveReductions) { 6021 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6022 return IC; 6023 } 6024 6025 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6026 return 1; 6027 } 6028 6029 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6030 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6031 // This function calculates the register usage by measuring the highest number 6032 // of values that are alive at a single location. Obviously, this is a very 6033 // rough estimation. We scan the loop in a topological order in order and 6034 // assign a number to each instruction. We use RPO to ensure that defs are 6035 // met before their users. We assume that each instruction that has in-loop 6036 // users starts an interval. We record every time that an in-loop value is 6037 // used, so we have a list of the first and last occurrences of each 6038 // instruction. Next, we transpose this data structure into a multi map that 6039 // holds the list of intervals that *end* at a specific location. This multi 6040 // map allows us to perform a linear search. We scan the instructions linearly 6041 // and record each time that a new interval starts, by placing it in a set. 6042 // If we find this value in the multi-map then we remove it from the set. 6043 // The max register usage is the maximum size of the set. 6044 // We also search for instructions that are defined outside the loop, but are 6045 // used inside the loop. We need this number separately from the max-interval 6046 // usage number because when we unroll, loop-invariant values do not take 6047 // more register. 6048 LoopBlocksDFS DFS(TheLoop); 6049 DFS.perform(LI); 6050 6051 RegisterUsage RU; 6052 6053 // Each 'key' in the map opens a new interval. The values 6054 // of the map are the index of the 'last seen' usage of the 6055 // instruction that is the key. 
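  // Illustrative example: for %a = ...; %b = f(%a); %c = g(%a), the interval
  // of %a is recorded as ending at %c (its last use inside the loop), so %a
  // counts as open while %b is scanned and is dropped from the open set once
  // the scan moves past %c.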
6056 using IntervalMap = DenseMap<Instruction *, unsigned>; 6057 6058 // Maps instruction to its index. 6059 SmallVector<Instruction *, 64> IdxToInstr; 6060 // Marks the end of each interval. 6061 IntervalMap EndPoint; 6062 // Saves the list of instruction indices that are used in the loop. 6063 SmallPtrSet<Instruction *, 8> Ends; 6064 // Saves the list of values that are used in the loop but are 6065 // defined outside the loop, such as arguments and constants. 6066 SmallPtrSet<Value *, 8> LoopInvariants; 6067 6068 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6069 for (Instruction &I : BB->instructionsWithoutDebug()) { 6070 IdxToInstr.push_back(&I); 6071 6072 // Save the end location of each USE. 6073 for (Value *U : I.operands()) { 6074 auto *Instr = dyn_cast<Instruction>(U); 6075 6076 // Ignore non-instruction values such as arguments, constants, etc. 6077 if (!Instr) 6078 continue; 6079 6080 // If this instruction is outside the loop then record it and continue. 6081 if (!TheLoop->contains(Instr)) { 6082 LoopInvariants.insert(Instr); 6083 continue; 6084 } 6085 6086 // Overwrite previous end points. 6087 EndPoint[Instr] = IdxToInstr.size(); 6088 Ends.insert(Instr); 6089 } 6090 } 6091 } 6092 6093 // Saves the list of intervals that end with the index in 'key'. 6094 using InstrList = SmallVector<Instruction *, 2>; 6095 DenseMap<unsigned, InstrList> TransposeEnds; 6096 6097 // Transpose the EndPoints to a list of values that end at each index. 6098 for (auto &Interval : EndPoint) 6099 TransposeEnds[Interval.second].push_back(Interval.first); 6100 6101 SmallPtrSet<Instruction *, 8> OpenIntervals; 6102 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6103 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6104 6105 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6106 6107 // A lambda that gets the register usage for the given type and VF. 6108 const auto &TTICapture = TTI; 6109 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6110 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6111 return 0; 6112 InstructionCost::CostType RegUsage = 6113 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6114 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6115 "Nonsensical values for register usage."); 6116 return RegUsage; 6117 }; 6118 6119 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6120 Instruction *I = IdxToInstr[i]; 6121 6122 // Remove all of the instructions that end at this location. 6123 InstrList &List = TransposeEnds[i]; 6124 for (Instruction *ToRemove : List) 6125 OpenIntervals.erase(ToRemove); 6126 6127 // Ignore instructions that are never used within the loop. 6128 if (!Ends.count(I)) 6129 continue; 6130 6131 // Skip ignored values. 6132 if (ValuesToIgnore.count(I)) 6133 continue; 6134 6135 // For each VF find the maximum usage of registers. 6136 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6137 // Count the number of live intervals. 6138 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6139 6140 if (VFs[j].isScalar()) { 6141 for (auto Inst : OpenIntervals) { 6142 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6143 if (RegUsage.find(ClassID) == RegUsage.end()) 6144 RegUsage[ClassID] = 1; 6145 else 6146 RegUsage[ClassID] += 1; 6147 } 6148 } else { 6149 collectUniformsAndScalars(VFs[j]); 6150 for (auto Inst : OpenIntervals) { 6151 // Skip ignored values for VF > 1. 
6152 if (VecValuesToIgnore.count(Inst)) 6153 continue; 6154 if (isScalarAfterVectorization(Inst, VFs[j])) { 6155 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6156 if (RegUsage.find(ClassID) == RegUsage.end()) 6157 RegUsage[ClassID] = 1; 6158 else 6159 RegUsage[ClassID] += 1; 6160 } else { 6161 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6162 if (RegUsage.find(ClassID) == RegUsage.end()) 6163 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6164 else 6165 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6166 } 6167 } 6168 } 6169 6170 for (auto& pair : RegUsage) { 6171 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6172 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6173 else 6174 MaxUsages[j][pair.first] = pair.second; 6175 } 6176 } 6177 6178 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6179 << OpenIntervals.size() << '\n'); 6180 6181 // Add the current instruction to the list of open intervals. 6182 OpenIntervals.insert(I); 6183 } 6184 6185 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6186 SmallMapVector<unsigned, unsigned, 4> Invariant; 6187 6188 for (auto Inst : LoopInvariants) { 6189 unsigned Usage = 6190 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6191 unsigned ClassID = 6192 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6193 if (Invariant.find(ClassID) == Invariant.end()) 6194 Invariant[ClassID] = Usage; 6195 else 6196 Invariant[ClassID] += Usage; 6197 } 6198 6199 LLVM_DEBUG({ 6200 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6201 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6202 << " item\n"; 6203 for (const auto &pair : MaxUsages[i]) { 6204 dbgs() << "LV(REG): RegisterClass: " 6205 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6206 << " registers\n"; 6207 } 6208 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6209 << " item\n"; 6210 for (const auto &pair : Invariant) { 6211 dbgs() << "LV(REG): RegisterClass: " 6212 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6213 << " registers\n"; 6214 } 6215 }); 6216 6217 RU.LoopInvariantRegs = Invariant; 6218 RU.MaxLocalUsers = MaxUsages[i]; 6219 RUs[i] = RU; 6220 } 6221 6222 return RUs; 6223 } 6224 6225 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 6226 ElementCount VF) { 6227 // TODO: Cost model for emulated masked load/store is completely 6228 // broken. This hack guides the cost model to use an artificially 6229 // high enough value to practically disable vectorization with such 6230 // operations, except where previously deployed legality hack allowed 6231 // using very low cost values. This is to avoid regressions coming simply 6232 // from moving "masked load/store" check from legality to cost model. 6233 // Masked Load/Gather emulation was previously never allowed. 6234 // Limited number of Masked Store/Scatter emulation was allowed. 6235 assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction"); 6236 return isa<LoadInst>(I) || 6237 (isa<StoreInst>(I) && 6238 NumPredStores > NumberOfStoresToPredicate); 6239 } 6240 6241 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6242 // If we aren't vectorizing the loop, or if we've already collected the 6243 // instructions to scalarize, there's nothing to do. 
  // Collection may already have occurred if we have a user-selected VF and
  // are now computing the expected cost for interleaving.
  if (VF.isScalar() || VF.isZero() ||
      InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredicationForAnyReason(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I, VF)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount if scalable, because that would lead to
        // invalid scalarization costs.
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
        if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}

int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I, VF))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for.
This behavior can be changed by allowing getScalarValue to clone 6316 // the lane zero values for uniforms rather than asserting. 6317 for (Use &U : I->operands()) 6318 if (auto *J = dyn_cast<Instruction>(U.get())) 6319 if (isUniformAfterVectorization(J, VF)) 6320 return false; 6321 6322 // Otherwise, we can scalarize the instruction. 6323 return true; 6324 }; 6325 6326 // Compute the expected cost discount from scalarizing the entire expression 6327 // feeding the predicated instruction. We currently only consider expressions 6328 // that are single-use instruction chains. 6329 Worklist.push_back(PredInst); 6330 while (!Worklist.empty()) { 6331 Instruction *I = Worklist.pop_back_val(); 6332 6333 // If we've already analyzed the instruction, there's nothing to do. 6334 if (ScalarCosts.find(I) != ScalarCosts.end()) 6335 continue; 6336 6337 // Compute the cost of the vector instruction. Note that this cost already 6338 // includes the scalarization overhead of the predicated instruction. 6339 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6340 6341 // Compute the cost of the scalarized instruction. This cost is the cost of 6342 // the instruction as if it wasn't if-converted and instead remained in the 6343 // predicated block. We will scale this cost by block probability after 6344 // computing the scalarization overhead. 6345 InstructionCost ScalarCost = 6346 VF.getFixedValue() * 6347 getInstructionCost(I, ElementCount::getFixed(1)).first; 6348 6349 // Compute the scalarization overhead of needed insertelement instructions 6350 // and phi nodes. 6351 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6352 ScalarCost += TTI.getScalarizationOverhead( 6353 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6354 APInt::getAllOnes(VF.getFixedValue()), true, false); 6355 ScalarCost += 6356 VF.getFixedValue() * 6357 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6358 } 6359 6360 // Compute the scalarization overhead of needed extractelement 6361 // instructions. For each of the instruction's operands, if the operand can 6362 // be scalarized, add it to the worklist; otherwise, account for the 6363 // overhead. 6364 for (Use &U : I->operands()) 6365 if (auto *J = dyn_cast<Instruction>(U.get())) { 6366 assert(VectorType::isValidElementType(J->getType()) && 6367 "Instruction has non-scalar type"); 6368 if (canBeScalarized(J)) 6369 Worklist.push_back(J); 6370 else if (needsExtract(J, VF)) { 6371 ScalarCost += TTI.getScalarizationOverhead( 6372 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6373 APInt::getAllOnes(VF.getFixedValue()), false, true); 6374 } 6375 } 6376 6377 // Scale the total scalar cost by block probability. 6378 ScalarCost /= getReciprocalPredBlockProb(); 6379 6380 // Compute the discount. A non-negative discount means the vector version 6381 // of the instruction costs more, and scalarizing would be beneficial. 6382 Discount += VectorCost - ScalarCost; 6383 ScalarCosts[I] = ScalarCost; 6384 } 6385 6386 return *Discount.getValue(); 6387 } 6388 6389 LoopVectorizationCostModel::VectorizationCostTy 6390 LoopVectorizationCostModel::expectedCost( 6391 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6392 VectorizationCostTy Cost; 6393 6394 // For each block. 6395 for (BasicBlock *BB : TheLoop->blocks()) { 6396 VectorizationCostTy BlockCost; 6397 6398 // For each instruction in the old loop. 6399 for (Instruction &I : BB->instructionsWithoutDebug()) { 6400 // Skip ignored values. 
6401 if (ValuesToIgnore.count(&I) || 6402 (VF.isVector() && VecValuesToIgnore.count(&I))) 6403 continue; 6404 6405 VectorizationCostTy C = getInstructionCost(&I, VF); 6406 6407 // Check if we should override the cost. 6408 if (C.first.isValid() && 6409 ForceTargetInstructionCost.getNumOccurrences() > 0) 6410 C.first = InstructionCost(ForceTargetInstructionCost); 6411 6412 // Keep a list of instructions with invalid costs. 6413 if (Invalid && !C.first.isValid()) 6414 Invalid->emplace_back(&I, VF); 6415 6416 BlockCost.first += C.first; 6417 BlockCost.second |= C.second; 6418 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6419 << " for VF " << VF << " For instruction: " << I 6420 << '\n'); 6421 } 6422 6423 // If we are vectorizing a predicated block, it will have been 6424 // if-converted. This means that the block's instructions (aside from 6425 // stores and instructions that may divide by zero) will now be 6426 // unconditionally executed. For the scalar case, we may not always execute 6427 // the predicated block, if it is an if-else block. Thus, scale the block's 6428 // cost by the probability of executing it. blockNeedsPredication from 6429 // Legal is used so as to not include all blocks in tail folded loops. 6430 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6431 BlockCost.first /= getReciprocalPredBlockProb(); 6432 6433 Cost.first += BlockCost.first; 6434 Cost.second |= BlockCost.second; 6435 } 6436 6437 return Cost; 6438 } 6439 6440 /// Gets Address Access SCEV after verifying that the access pattern 6441 /// is loop invariant except the induction variable dependence. 6442 /// 6443 /// This SCEV can be sent to the Target in order to estimate the address 6444 /// calculation cost. 6445 static const SCEV *getAddressAccessSCEV( 6446 Value *Ptr, 6447 LoopVectorizationLegality *Legal, 6448 PredicatedScalarEvolution &PSE, 6449 const Loop *TheLoop) { 6450 6451 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6452 if (!Gep) 6453 return nullptr; 6454 6455 // We are looking for a gep with all loop invariant indices except for one 6456 // which should be an induction variable. 6457 auto SE = PSE.getSE(); 6458 unsigned NumOperands = Gep->getNumOperands(); 6459 for (unsigned i = 1; i < NumOperands; ++i) { 6460 Value *Opd = Gep->getOperand(i); 6461 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6462 !Legal->isInductionVariable(Opd)) 6463 return nullptr; 6464 } 6465 6466 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6467 return PSE.getSCEV(Ptr); 6468 } 6469 6470 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6471 return Legal->hasStride(I->getOperand(0)) || 6472 Legal->hasStride(I->getOperand(1)); 6473 } 6474 6475 InstructionCost 6476 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6477 ElementCount VF) { 6478 assert(VF.isVector() && 6479 "Scalarization cost of instruction implies vectorization."); 6480 if (VF.isScalable()) 6481 return InstructionCost::getInvalid(); 6482 6483 Type *ValTy = getLoadStoreType(I); 6484 auto SE = PSE.getSE(); 6485 6486 unsigned AS = getLoadStoreAddressSpace(I); 6487 Value *Ptr = getLoadStorePointerOperand(I); 6488 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6489 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6490 // that it is being called from this specific place. 
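  // In rough terms, the cost assembled below is
  //   VF * (address computation + scalar memory op) + insert/extract overhead,
  // and for predicated accesses it is additionally scaled by the probability
  // of executing the predicated block and extended with the i1 extract and
  // branch costs.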
6491 6492 // Figure out whether the access is strided and get the stride value 6493 // if it's known in compile time 6494 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6495 6496 // Get the cost of the scalar memory instruction and address computation. 6497 InstructionCost Cost = 6498 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6499 6500 // Don't pass *I here, since it is scalar but will actually be part of a 6501 // vectorized loop where the user of it is a vectorized instruction. 6502 const Align Alignment = getLoadStoreAlignment(I); 6503 Cost += VF.getKnownMinValue() * 6504 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6505 AS, TTI::TCK_RecipThroughput); 6506 6507 // Get the overhead of the extractelement and insertelement instructions 6508 // we might create due to scalarization. 6509 Cost += getScalarizationOverhead(I, VF); 6510 6511 // If we have a predicated load/store, it will need extra i1 extracts and 6512 // conditional branches, but may not be executed for each vector lane. Scale 6513 // the cost by the probability of executing the predicated block. 6514 if (isPredicatedInst(I, VF)) { 6515 Cost /= getReciprocalPredBlockProb(); 6516 6517 // Add the cost of an i1 extract and a branch 6518 auto *Vec_i1Ty = 6519 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6520 Cost += TTI.getScalarizationOverhead( 6521 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6522 /*Insert=*/false, /*Extract=*/true); 6523 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6524 6525 if (useEmulatedMaskMemRefHack(I, VF)) 6526 // Artificially setting to a high enough value to practically disable 6527 // vectorization with such operations. 6528 Cost = 3000000; 6529 } 6530 6531 return Cost; 6532 } 6533 6534 InstructionCost 6535 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6536 ElementCount VF) { 6537 Type *ValTy = getLoadStoreType(I); 6538 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6539 Value *Ptr = getLoadStorePointerOperand(I); 6540 unsigned AS = getLoadStoreAddressSpace(I); 6541 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6542 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6543 6544 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6545 "Stride should be 1 or -1 for consecutive memory access"); 6546 const Align Alignment = getLoadStoreAlignment(I); 6547 InstructionCost Cost = 0; 6548 if (Legal->isMaskRequired(I)) 6549 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6550 CostKind); 6551 else 6552 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6553 CostKind, I); 6554 6555 bool Reverse = ConsecutiveStride < 0; 6556 if (Reverse) 6557 Cost += 6558 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6559 return Cost; 6560 } 6561 6562 InstructionCost 6563 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6564 ElementCount VF) { 6565 assert(Legal->isUniformMemOp(*I)); 6566 6567 Type *ValTy = getLoadStoreType(I); 6568 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6569 const Align Alignment = getLoadStoreAlignment(I); 6570 unsigned AS = getLoadStoreAddressSpace(I); 6571 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6572 if (isa<LoadInst>(I)) { 6573 return TTI.getAddressComputationCost(ValTy) + 6574 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6575 CostKind) + 6576 
TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6577 } 6578 StoreInst *SI = cast<StoreInst>(I); 6579 6580 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6581 return TTI.getAddressComputationCost(ValTy) + 6582 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6583 CostKind) + 6584 (isLoopInvariantStoreValue 6585 ? 0 6586 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6587 VF.getKnownMinValue() - 1)); 6588 } 6589 6590 InstructionCost 6591 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6592 ElementCount VF) { 6593 Type *ValTy = getLoadStoreType(I); 6594 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6595 const Align Alignment = getLoadStoreAlignment(I); 6596 const Value *Ptr = getLoadStorePointerOperand(I); 6597 6598 return TTI.getAddressComputationCost(VectorTy) + 6599 TTI.getGatherScatterOpCost( 6600 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6601 TargetTransformInfo::TCK_RecipThroughput, I); 6602 } 6603 6604 InstructionCost 6605 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6606 ElementCount VF) { 6607 // TODO: Once we have support for interleaving with scalable vectors 6608 // we can calculate the cost properly here. 6609 if (VF.isScalable()) 6610 return InstructionCost::getInvalid(); 6611 6612 Type *ValTy = getLoadStoreType(I); 6613 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6614 unsigned AS = getLoadStoreAddressSpace(I); 6615 6616 auto Group = getInterleavedAccessGroup(I); 6617 assert(Group && "Fail to get an interleaved access group."); 6618 6619 unsigned InterleaveFactor = Group->getFactor(); 6620 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6621 6622 // Holds the indices of existing members in the interleaved group. 6623 SmallVector<unsigned, 4> Indices; 6624 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6625 if (Group->getMember(IF)) 6626 Indices.push_back(IF); 6627 6628 // Calculate the cost of the whole interleaved group. 6629 bool UseMaskForGaps = 6630 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6631 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6632 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6633 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6634 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6635 6636 if (Group->isReverse()) { 6637 // TODO: Add support for reversed masked interleaved access. 6638 assert(!Legal->isMaskRequired(I) && 6639 "Reverse masked interleaved access not supported."); 6640 Cost += 6641 Group->getNumMembers() * 6642 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6643 } 6644 return Cost; 6645 } 6646 6647 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( 6648 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6649 using namespace llvm::PatternMatch; 6650 // Early exit for no inloop reductions 6651 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6652 return None; 6653 auto *VectorTy = cast<VectorType>(Ty); 6654 6655 // We are looking for a pattern of, and finding the minimal acceptable cost: 6656 // reduce(mul(ext(A), ext(B))) or 6657 // reduce(mul(A, B)) or 6658 // reduce(ext(A)) or 6659 // reduce(A). 6660 // The basic idea is that we walk down the tree to do that, finding the root 6661 // reduction instruction in InLoopReductionImmediateChains. 
  // From there we find the pattern of mul/ext and test the cost of the entire
  // pattern vs the cost of the components. If the reduction cost is lower, we
  // return it for the reduction instruction and 0 for the other instructions
  // in the pattern. If it is not, we return an invalid cost specifying that
  // the original cost method should be used.
  Instruction *RetI = I;
  if (match(RetI, m_ZExtOrSExt(m_Value()))) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }
  if (match(RetI, m_Mul(m_Value(), m_Value())) &&
      RetI->user_back()->getOpcode() == Instruction::Add) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }

  // Test if the found instruction is a reduction, and if not return an invalid
  // cost specifying the parent to use the original cost modelling.
  if (!InLoopReductionImmediateChains.count(RetI))
    return None;

  // Find the reduction this chain is a part of and calculate the basic cost of
  // the reduction on its own.
  Instruction *LastChain = InLoopReductionImmediateChains[RetI];
  Instruction *ReductionPhi = LastChain;
  while (!isa<PHINode>(ReductionPhi))
    ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];

  const RecurrenceDescriptor &RdxDesc =
      Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;

  InstructionCost BaseCost = TTI.getArithmeticReductionCost(
      RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);

  // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
  // normal fmul instruction to the cost of the fadd reduction.
  if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
    BaseCost +=
        TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);

  // If we're using ordered reductions then we can just return the base cost
  // here, since getArithmeticReductionCost calculates the full ordered
  // reduction cost when FP reassociation is not allowed.
  if (useOrderedReductions(RdxDesc))
    return BaseCost;

  // Get the operand that was not the reduction chain and match it to one of
  // the patterns, returning the better cost if it is found.
  Instruction *RedOp = RetI->getOperand(1) == LastChain
                           ? dyn_cast<Instruction>(RetI->getOperand(0))
                           : dyn_cast<Instruction>(RetI->getOperand(1));

  VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);

  Instruction *Op0, *Op1;
  if (RedOp &&
      match(RedOp,
            m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
      match(Op0, m_ZExtOrSExt(m_Value())) &&
      Op0->getOpcode() == Op1->getOpcode() &&
      Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
      !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {

    // Matched reduce(ext(mul(ext(A), ext(B))))
    // Note that the extend opcodes need to all match, or if A==B they will
    // have been converted to zext(mul(sext(A), sext(A))) as it is known
    // positive, which is equally fine.
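    // Illustrative example: an i8 dot product where both inputs are extended
    // to i16, multiplied, and the product extended again to the i32
    // accumulator; if the target reports a cheap extended-add reduction for
    // this shape, that single cost is used for the whole pattern (the other
    // instructions in the pattern get a cost of 0).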
6732 bool IsUnsigned = isa<ZExtInst>(Op0); 6733 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6734 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6735 6736 InstructionCost ExtCost = 6737 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6738 TTI::CastContextHint::None, CostKind, Op0); 6739 InstructionCost MulCost = 6740 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6741 InstructionCost Ext2Cost = 6742 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6743 TTI::CastContextHint::None, CostKind, RedOp); 6744 6745 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6746 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6747 CostKind); 6748 6749 if (RedCost.isValid() && 6750 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6751 return I == RetI ? RedCost : 0; 6752 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6753 !TheLoop->isLoopInvariant(RedOp)) { 6754 // Matched reduce(ext(A)) 6755 bool IsUnsigned = isa<ZExtInst>(RedOp); 6756 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6757 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6758 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6759 CostKind); 6760 6761 InstructionCost ExtCost = 6762 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6763 TTI::CastContextHint::None, CostKind, RedOp); 6764 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6765 return I == RetI ? RedCost : 0; 6766 } else if (RedOp && 6767 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6768 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6769 Op0->getOpcode() == Op1->getOpcode() && 6770 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6771 bool IsUnsigned = isa<ZExtInst>(Op0); 6772 Type *Op0Ty = Op0->getOperand(0)->getType(); 6773 Type *Op1Ty = Op1->getOperand(0)->getType(); 6774 Type *LargestOpTy = 6775 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6776 : Op0Ty; 6777 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6778 6779 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 6780 // different sizes. We take the largest type as the ext to reduce, and add 6781 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6782 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6783 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6784 TTI::CastContextHint::None, CostKind, Op0); 6785 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6786 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6787 TTI::CastContextHint::None, CostKind, Op1); 6788 InstructionCost MulCost = 6789 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6790 6791 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6792 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6793 CostKind); 6794 InstructionCost ExtraExtCost = 0; 6795 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6796 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6797 ExtraExtCost = TTI.getCastInstrCost( 6798 ExtraExtOp->getOpcode(), ExtType, 6799 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6800 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6801 } 6802 6803 if (RedCost.isValid() && 6804 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6805 return I == RetI ? 
RedCost : 0; 6806 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6807 // Matched reduce(mul()) 6808 InstructionCost MulCost = 6809 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6810 6811 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6812 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 6813 CostKind); 6814 6815 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6816 return I == RetI ? RedCost : 0; 6817 } 6818 } 6819 6820 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 6821 } 6822 6823 InstructionCost 6824 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6825 ElementCount VF) { 6826 // Calculate scalar cost only. Vectorization cost should be ready at this 6827 // moment. 6828 if (VF.isScalar()) { 6829 Type *ValTy = getLoadStoreType(I); 6830 const Align Alignment = getLoadStoreAlignment(I); 6831 unsigned AS = getLoadStoreAddressSpace(I); 6832 6833 return TTI.getAddressComputationCost(ValTy) + 6834 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6835 TTI::TCK_RecipThroughput, I); 6836 } 6837 return getWideningCost(I, VF); 6838 } 6839 6840 LoopVectorizationCostModel::VectorizationCostTy 6841 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6842 ElementCount VF) { 6843 // If we know that this instruction will remain uniform, check the cost of 6844 // the scalar version. 6845 if (isUniformAfterVectorization(I, VF)) 6846 VF = ElementCount::getFixed(1); 6847 6848 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6849 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6850 6851 // Forced scalars do not have any scalarization overhead. 6852 auto ForcedScalar = ForcedScalars.find(VF); 6853 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6854 auto InstSet = ForcedScalar->second; 6855 if (InstSet.count(I)) 6856 return VectorizationCostTy( 6857 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6858 VF.getKnownMinValue()), 6859 false); 6860 } 6861 6862 Type *VectorTy; 6863 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6864 6865 bool TypeNotScalarized = false; 6866 if (VF.isVector() && VectorTy->isVectorTy()) { 6867 unsigned NumParts = TTI.getNumberOfParts(VectorTy); 6868 if (NumParts) 6869 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 6870 else 6871 C = InstructionCost::getInvalid(); 6872 } 6873 return VectorizationCostTy(C, TypeNotScalarized); 6874 } 6875 6876 InstructionCost 6877 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6878 ElementCount VF) const { 6879 6880 // There is no mechanism yet to create a scalable scalarization loop, 6881 // so this is currently Invalid. 6882 if (VF.isScalable()) 6883 return InstructionCost::getInvalid(); 6884 6885 if (VF.isScalar()) 6886 return 0; 6887 6888 InstructionCost Cost = 0; 6889 Type *RetTy = ToVectorTy(I->getType(), VF); 6890 if (!RetTy->isVoidTy() && 6891 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6892 Cost += TTI.getScalarizationOverhead( 6893 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 6894 false); 6895 6896 // Some targets keep addresses scalar. 6897 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6898 return Cost; 6899 6900 // Some targets support efficient element stores. 6901 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6902 return Cost; 6903 6904 // Collect operands to consider. 6905 CallInst *CI = dyn_cast<CallInst>(I); 6906 Instruction::op_range Ops = CI ? 
CI->args() : I->operands(); 6907 6908 // Skip operands that do not require extraction/scalarization and do not incur 6909 // any overhead. 6910 SmallVector<Type *> Tys; 6911 for (auto *V : filterExtractingOperands(Ops, VF)) 6912 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 6913 return Cost + TTI.getOperandsScalarizationOverhead( 6914 filterExtractingOperands(Ops, VF), Tys); 6915 } 6916 6917 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6918 if (VF.isScalar()) 6919 return; 6920 NumPredStores = 0; 6921 for (BasicBlock *BB : TheLoop->blocks()) { 6922 // For each instruction in the old loop. 6923 for (Instruction &I : *BB) { 6924 Value *Ptr = getLoadStorePointerOperand(&I); 6925 if (!Ptr) 6926 continue; 6927 6928 // TODO: We should generate better code and update the cost model for 6929 // predicated uniform stores. Today they are treated as any other 6930 // predicated store (see added test cases in 6931 // invariant-store-vectorization.ll). 6932 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 6933 NumPredStores++; 6934 6935 if (Legal->isUniformMemOp(I)) { 6936 // TODO: Avoid replicating loads and stores instead of 6937 // relying on instcombine to remove them. 6938 // Load: Scalar load + broadcast 6939 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6940 InstructionCost Cost; 6941 if (isa<StoreInst>(&I) && VF.isScalable() && 6942 isLegalGatherOrScatter(&I, VF)) { 6943 Cost = getGatherScatterCost(&I, VF); 6944 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 6945 } else { 6946 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 6947 "Cannot yet scalarize uniform stores"); 6948 Cost = getUniformMemOpCost(&I, VF); 6949 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6950 } 6951 continue; 6952 } 6953 6954 // We assume that widening is the best solution when possible. 6955 if (memoryInstructionCanBeWidened(&I, VF)) { 6956 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6957 int ConsecutiveStride = Legal->isConsecutivePtr( 6958 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 6959 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6960 "Expected consecutive stride."); 6961 InstWidening Decision = 6962 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6963 setWideningDecision(&I, VF, Decision, Cost); 6964 continue; 6965 } 6966 6967 // Choose between Interleaving, Gather/Scatter or Scalarization. 6968 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 6969 unsigned NumAccesses = 1; 6970 if (isAccessInterleaved(&I)) { 6971 auto Group = getInterleavedAccessGroup(&I); 6972 assert(Group && "Fail to get an interleaved access group."); 6973 6974 // Make one decision for the whole group. 6975 if (getWideningDecision(&I, VF) != CM_Unknown) 6976 continue; 6977 6978 NumAccesses = Group->getNumMembers(); 6979 if (interleavedAccessCanBeWidened(&I, VF)) 6980 InterleaveCost = getInterleaveGroupCost(&I, VF); 6981 } 6982 6983 InstructionCost GatherScatterCost = 6984 isLegalGatherOrScatter(&I, VF) 6985 ? getGatherScatterCost(&I, VF) * NumAccesses 6986 : InstructionCost::getInvalid(); 6987 6988 InstructionCost ScalarizationCost = 6989 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6990 6991 // Choose better solution for the current VF, 6992 // write down this decision and use it during vectorization. 
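      //
      // Illustrative sketch only (the numbers are hypothetical): with
      // InterleaveCost = 6, GatherScatterCost = 8 and ScalarizationCost = 12,
      // the comparison below picks CM_Interleave because it is no worse than
      // gather/scatter and strictly cheaper than scalarization; if the costs
      // tie, or all of them are invalid, we conservatively fall back to
      // CM_Scalarize.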
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of an address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  append_range(Worklist, AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of an address.
        setWideningDecision(
            I, VF, CM_Scalarize,
            (VF.getKnownMinValue() *
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(
                Member, VF, CM_Scalarize,
                (VF.getKnownMinValue() *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
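      // For example (hypothetical IR), with no preference for vectorized
      // addressing, in
      //   %off  = add nuw nsw i64 %i, 4
      //   %gep  = getelementptr inbounds i32, i32* %base, i64 %off
      //   %val  = load i32, i32* %gep
      // the address computation %off/%gep is forced scalar here, so forming
      // the address never requires extracting lanes from a vector and LSR can
      // still optimize the scalar address arithmetic.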
      ForcedScalars[VF].insert(I);
  }
}

InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
                                               Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  auto SE = PSE.getSE();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
                                                ElementCount VF) -> bool {
    if (VF.isScalar())
      return true;

    auto Scalarized = InstsToScalarize.find(VF);
    assert(Scalarized != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return !Scalarized->second.count(I) &&
           llvm::all_of(I->users(), [&](User *U) {
             auto *UI = cast<Instruction>(U);
             return !Scalarized->second.count(UI);
           });
  };
  (void) hasSingleCopyAfterVectorization;

  if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           hasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = ToVectorTy(RetTy, VF);

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF.isVector() && BI->isConditional() &&
        (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
         PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // It is not possible to scalarize a scalable vector with predicated
      // instructions.
      if (VF.isScalable())
        return InstructionCost::getInvalid();
      // Return cost for branches around scalarized and predicated blocks.
      auto *Vec_i1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (
          TTI.getScalarizationOverhead(
              Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
          (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br, CostKind);
    else
      // This branch will be eliminated by if-conversion.
7152 return 0; 7153 // Note: We currently assume zero cost for an unconditional branch inside 7154 // a predicated block since it will become a fall-through, although we 7155 // may decide in the future to call TTI for all branches. 7156 } 7157 case Instruction::PHI: { 7158 auto *Phi = cast<PHINode>(I); 7159 7160 // First-order recurrences are replaced by vector shuffles inside the loop. 7161 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7162 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7163 return TTI.getShuffleCost( 7164 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7165 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7166 7167 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7168 // converted into select instructions. We require N - 1 selects per phi 7169 // node, where N is the number of incoming values. 7170 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7171 return (Phi->getNumIncomingValues() - 1) * 7172 TTI.getCmpSelInstrCost( 7173 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7174 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7175 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7176 7177 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7178 } 7179 case Instruction::UDiv: 7180 case Instruction::SDiv: 7181 case Instruction::URem: 7182 case Instruction::SRem: 7183 // If we have a predicated instruction, it may not be executed for each 7184 // vector lane. Get the scalarization cost and scale this amount by the 7185 // probability of executing the predicated block. If the instruction is not 7186 // predicated, we fall through to the next case. 7187 if (VF.isVector() && isScalarWithPredication(I, VF)) { 7188 InstructionCost Cost = 0; 7189 7190 // These instructions have a non-void type, so account for the phi nodes 7191 // that we will create. This cost is likely to be zero. The phi node 7192 // cost, if any, should be scaled by the block probability because it 7193 // models a copy at the end of each predicated block. 7194 Cost += VF.getKnownMinValue() * 7195 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7196 7197 // The cost of the non-predicated instruction. 7198 Cost += VF.getKnownMinValue() * 7199 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7200 7201 // The cost of insertelement and extractelement instructions needed for 7202 // scalarization. 7203 Cost += getScalarizationOverhead(I, VF); 7204 7205 // Scale the cost by the probability of executing the predicated blocks. 7206 // This assumes the predicated block for each vector lane is equally 7207 // likely. 7208 return Cost / getReciprocalPredBlockProb(); 7209 } 7210 LLVM_FALLTHROUGH; 7211 case Instruction::Add: 7212 case Instruction::FAdd: 7213 case Instruction::Sub: 7214 case Instruction::FSub: 7215 case Instruction::Mul: 7216 case Instruction::FMul: 7217 case Instruction::FDiv: 7218 case Instruction::FRem: 7219 case Instruction::Shl: 7220 case Instruction::LShr: 7221 case Instruction::AShr: 7222 case Instruction::And: 7223 case Instruction::Or: 7224 case Instruction::Xor: { 7225 // Since we will replace the stride by 1 the multiplication should go away. 
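    // For example (an illustrative sketch of the common case): for an access
    // such as A[i * Stride] where Stride is a symbolic stride that loop
    // versioning specializes to 1, the multiply
    //   %mul = mul nsw i64 %i, %stride
    // folds away in the versioned loop, so it is treated as free here.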
7226 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7227 return 0; 7228 7229 // Detect reduction patterns 7230 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7231 return *RedCost; 7232 7233 // Certain instructions can be cheaper to vectorize if they have a constant 7234 // second vector operand. One example of this are shifts on x86. 7235 Value *Op2 = I->getOperand(1); 7236 TargetTransformInfo::OperandValueProperties Op2VP; 7237 TargetTransformInfo::OperandValueKind Op2VK = 7238 TTI.getOperandInfo(Op2, Op2VP); 7239 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7240 Op2VK = TargetTransformInfo::OK_UniformValue; 7241 7242 SmallVector<const Value *, 4> Operands(I->operand_values()); 7243 return TTI.getArithmeticInstrCost( 7244 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7245 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7246 } 7247 case Instruction::FNeg: { 7248 return TTI.getArithmeticInstrCost( 7249 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7250 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7251 TargetTransformInfo::OP_None, I->getOperand(0), I); 7252 } 7253 case Instruction::Select: { 7254 SelectInst *SI = cast<SelectInst>(I); 7255 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7256 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7257 7258 const Value *Op0, *Op1; 7259 using namespace llvm::PatternMatch; 7260 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7261 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7262 // select x, y, false --> x & y 7263 // select x, true, y --> x | y 7264 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7265 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7266 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7267 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7268 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7269 Op1->getType()->getScalarSizeInBits() == 1); 7270 7271 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7272 return TTI.getArithmeticInstrCost( 7273 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7274 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7275 } 7276 7277 Type *CondTy = SI->getCondition()->getType(); 7278 if (!ScalarCond) 7279 CondTy = VectorType::get(CondTy, VF); 7280 7281 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7282 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7283 Pred = Cmp->getPredicate(); 7284 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7285 CostKind, I); 7286 } 7287 case Instruction::ICmp: 7288 case Instruction::FCmp: { 7289 Type *ValTy = I->getOperand(0)->getType(); 7290 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7291 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7292 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7293 VectorTy = ToVectorTy(ValTy, VF); 7294 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7295 cast<CmpInst>(I)->getPredicate(), CostKind, 7296 I); 7297 } 7298 case Instruction::Store: 7299 case Instruction::Load: { 7300 ElementCount Width = VF; 7301 if (Width.isVector()) { 7302 InstWidening Decision = getWideningDecision(I, Width); 7303 assert(Decision != CM_Unknown && 7304 "CM decision should be taken at this point"); 7305 if (Decision == CM_Scalarize) 7306 Width = ElementCount::getFixed(1); 7307 } 7308 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7309 return getMemoryInstructionCost(I, VF); 7310 } 7311 case Instruction::BitCast: 7312 if (I->getType()->isPointerTy()) 7313 return 0; 7314 LLVM_FALLTHROUGH; 7315 case Instruction::ZExt: 7316 case Instruction::SExt: 7317 case Instruction::FPToUI: 7318 case Instruction::FPToSI: 7319 case Instruction::FPExt: 7320 case Instruction::PtrToInt: 7321 case Instruction::IntToPtr: 7322 case Instruction::SIToFP: 7323 case Instruction::UIToFP: 7324 case Instruction::Trunc: 7325 case Instruction::FPTrunc: { 7326 // Computes the CastContextHint from a Load/Store instruction. 7327 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7328 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7329 "Expected a load or a store!"); 7330 7331 if (VF.isScalar() || !TheLoop->contains(I)) 7332 return TTI::CastContextHint::Normal; 7333 7334 switch (getWideningDecision(I, VF)) { 7335 case LoopVectorizationCostModel::CM_GatherScatter: 7336 return TTI::CastContextHint::GatherScatter; 7337 case LoopVectorizationCostModel::CM_Interleave: 7338 return TTI::CastContextHint::Interleave; 7339 case LoopVectorizationCostModel::CM_Scalarize: 7340 case LoopVectorizationCostModel::CM_Widen: 7341 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7342 : TTI::CastContextHint::Normal; 7343 case LoopVectorizationCostModel::CM_Widen_Reverse: 7344 return TTI::CastContextHint::Reversed; 7345 case LoopVectorizationCostModel::CM_Unknown: 7346 llvm_unreachable("Instr did not go through cost modelling?"); 7347 } 7348 7349 llvm_unreachable("Unhandled case!"); 7350 }; 7351 7352 unsigned Opcode = I->getOpcode(); 7353 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7354 // For Trunc, the context is the only user, which must be a StoreInst. 7355 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7356 if (I->hasOneUse()) 7357 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7358 CCH = ComputeCCH(Store); 7359 } 7360 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7361 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7362 Opcode == Instruction::FPExt) { 7363 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7364 CCH = ComputeCCH(Load); 7365 } 7366 7367 // We optimize the truncation of induction variables having constant 7368 // integer steps. The cost of these truncations is the same as the scalar 7369 // operation. 7370 if (isOptimizableIVTruncate(I, VF)) { 7371 auto *Trunc = cast<TruncInst>(I); 7372 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7373 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7374 } 7375 7376 // Detect reduction patterns 7377 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7378 return *RedCost; 7379 7380 Type *SrcScalarTy = I->getOperand(0)->getType(); 7381 Type *SrcVecTy = 7382 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7383 if (canTruncateToMinimalBitwidth(I, VF)) { 7384 // This cast is going to be shrunk. This may remove the cast or it might 7385 // turn it into slightly different cast. For example, if MinBW == 16, 7386 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7387 // 7388 // Calculate the modified src and dest types. 7389 Type *MinVecTy = VectorTy; 7390 if (Opcode == Instruction::Trunc) { 7391 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7392 VectorTy = 7393 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7394 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7395 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7396 VectorTy = 7397 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7398 } 7399 } 7400 7401 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7402 } 7403 case Instruction::Call: { 7404 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7405 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7406 return *RedCost; 7407 bool NeedToScalarize; 7408 CallInst *CI = cast<CallInst>(I); 7409 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7410 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7411 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7412 return std::min(CallCost, IntrinsicCost); 7413 } 7414 return CallCost; 7415 } 7416 case Instruction::ExtractValue: 7417 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7418 case Instruction::Alloca: 7419 // We cannot easily widen alloca to a scalable alloca, as 7420 // the result would need to be a vector of pointers. 7421 if (VF.isScalable()) 7422 return InstructionCost::getInvalid(); 7423 LLVM_FALLTHROUGH; 7424 default: 7425 // This opcode is unknown. Assume that it is the same as 'mul'. 7426 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7427 } // end of switch. 
7428 } 7429 7430 char LoopVectorize::ID = 0; 7431 7432 static const char lv_name[] = "Loop Vectorization"; 7433 7434 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7435 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7436 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7437 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7438 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7439 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7440 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7441 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7442 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7443 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7444 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7445 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7446 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7447 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7448 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7449 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7450 7451 namespace llvm { 7452 7453 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7454 7455 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7456 bool VectorizeOnlyWhenForced) { 7457 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7458 } 7459 7460 } // end namespace llvm 7461 7462 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7463 // Check if the pointer operand of a load or store instruction is 7464 // consecutive. 7465 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7466 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7467 return false; 7468 } 7469 7470 void LoopVectorizationCostModel::collectValuesToIgnore() { 7471 // Ignore ephemeral values. 7472 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7473 7474 // Ignore type-promoting instructions we identified during reduction 7475 // detection. 7476 for (auto &Reduction : Legal->getReductionVars()) { 7477 const RecurrenceDescriptor &RedDes = Reduction.second; 7478 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7479 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7480 } 7481 // Ignore type-casting instructions we identified during induction 7482 // detection. 7483 for (auto &Induction : Legal->getInductionVars()) { 7484 const InductionDescriptor &IndDes = Induction.second; 7485 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7486 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7487 } 7488 } 7489 7490 void LoopVectorizationCostModel::collectInLoopReductions() { 7491 for (auto &Reduction : Legal->getReductionVars()) { 7492 PHINode *Phi = Reduction.first; 7493 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7494 7495 // We don't collect reductions that are type promoted (yet). 7496 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7497 continue; 7498 7499 // If the target would prefer this reduction to happen "in-loop", then we 7500 // want to record it as such. 7501 unsigned Opcode = RdxDesc.getOpcode(); 7502 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7503 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7504 TargetTransformInfo::ReductionFlags())) 7505 continue; 7506 7507 // Check that we can correctly put the reductions into the loop, by 7508 // finding the chain of operations that leads from the phi to the loop 7509 // exit value. 
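    // For instance (illustrative IR), for an in-loop integer add reduction
    //   %red      = phi i32 [ 0, %preheader ], [ %red.next, %loop ]
    //   %val      = load i32, i32* %gep
    //   %red.next = add nsw i32 %red, %val
    // the chain from the phi to the loop-exit value is just { %red.next };
    // when such a chain is found the add can be performed in-loop, e.g. by
    // reducing each vector of loaded values into a scalar accumulator per
    // iteration, instead of carrying a wide vector accumulator that is only
    // reduced after the loop.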
7510 SmallVector<Instruction *, 4> ReductionOperations = 7511 RdxDesc.getReductionOpChain(Phi, TheLoop); 7512 bool InLoop = !ReductionOperations.empty(); 7513 if (InLoop) { 7514 InLoopReductionChains[Phi] = ReductionOperations; 7515 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7516 Instruction *LastChain = Phi; 7517 for (auto *I : ReductionOperations) { 7518 InLoopReductionImmediateChains[I] = LastChain; 7519 LastChain = I; 7520 } 7521 } 7522 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7523 << " reduction for phi: " << *Phi << "\n"); 7524 } 7525 } 7526 7527 // TODO: we could return a pair of values that specify the max VF and 7528 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7529 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7530 // doesn't have a cost model that can choose which plan to execute if 7531 // more than one is generated. 7532 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7533 LoopVectorizationCostModel &CM) { 7534 unsigned WidestType; 7535 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7536 return WidestVectorRegBits / WidestType; 7537 } 7538 7539 VectorizationFactor 7540 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7541 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7542 ElementCount VF = UserVF; 7543 // Outer loop handling: They may require CFG and instruction level 7544 // transformations before even evaluating whether vectorization is profitable. 7545 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7546 // the vectorization pipeline. 7547 if (!OrigLoop->isInnermost()) { 7548 // If the user doesn't provide a vectorization factor, determine a 7549 // reasonable one. 7550 if (UserVF.isZero()) { 7551 VF = ElementCount::getFixed(determineVPlanVF( 7552 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7553 .getFixedSize(), 7554 CM)); 7555 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7556 7557 // Make sure we have a VF > 1 for stress testing. 7558 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7559 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7560 << "overriding computed VF.\n"); 7561 VF = ElementCount::getFixed(4); 7562 } 7563 } 7564 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7565 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7566 "VF needs to be a power of two"); 7567 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7568 << "VF " << VF << " to build VPlans.\n"); 7569 buildVPlans(VF, VF); 7570 7571 // For VPlan build stress testing, we bail out after VPlan construction. 7572 if (VPlanBuildStressTest) 7573 return VectorizationFactor::Disabled(); 7574 7575 return {VF, 0 /*Cost*/}; 7576 } 7577 7578 LLVM_DEBUG( 7579 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7580 "VPlan-native path.\n"); 7581 return VectorizationFactor::Disabled(); 7582 } 7583 7584 Optional<VectorizationFactor> 7585 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7586 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7587 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7588 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7589 return None; 7590 7591 // Invalidate interleave groups if all blocks of loop will be predicated. 
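  // When the tail is folded by masking, every block of the loop is predicated
  // and each memory access becomes a masked access. An interleave group such
  // as the (illustrative) pair
  //   %a = load i32, i32* %gep0      ; A[2*i]
  //   %b = load i32, i32* %gep1      ; A[2*i+1]
  // would then need a masked wide interleaved load, so if the target does not
  // support masked interleaved accesses the groups, and every widening
  // decision built on top of them, must be thrown away, as done below.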
7592 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7593 !useMaskedInterleavedAccesses(*TTI)) { 7594 LLVM_DEBUG( 7595 dbgs() 7596 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7597 "which requires masked-interleaved support.\n"); 7598 if (CM.InterleaveInfo.invalidateGroups()) 7599 // Invalidating interleave groups also requires invalidating all decisions 7600 // based on them, which includes widening decisions and uniform and scalar 7601 // values. 7602 CM.invalidateCostModelingDecisions(); 7603 } 7604 7605 ElementCount MaxUserVF = 7606 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7607 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7608 if (!UserVF.isZero() && UserVFIsLegal) { 7609 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7610 "VF needs to be a power of two"); 7611 // Collect the instructions (and their associated costs) that will be more 7612 // profitable to scalarize. 7613 if (CM.selectUserVectorizationFactor(UserVF)) { 7614 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7615 CM.collectInLoopReductions(); 7616 buildVPlansWithVPRecipes(UserVF, UserVF); 7617 LLVM_DEBUG(printPlans(dbgs())); 7618 return {{UserVF, 0}}; 7619 } else 7620 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7621 "InvalidCost", ORE, OrigLoop); 7622 } 7623 7624 // Populate the set of Vectorization Factor Candidates. 7625 ElementCountSet VFCandidates; 7626 for (auto VF = ElementCount::getFixed(1); 7627 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7628 VFCandidates.insert(VF); 7629 for (auto VF = ElementCount::getScalable(1); 7630 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7631 VFCandidates.insert(VF); 7632 7633 for (const auto &VF : VFCandidates) { 7634 // Collect Uniform and Scalar instructions after vectorization with VF. 7635 CM.collectUniformsAndScalars(VF); 7636 7637 // Collect the instructions (and their associated costs) that will be more 7638 // profitable to scalarize. 7639 if (VF.isVector()) 7640 CM.collectInstsToScalarize(VF); 7641 } 7642 7643 CM.collectInLoopReductions(); 7644 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7645 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7646 7647 LLVM_DEBUG(printPlans(dbgs())); 7648 if (!MaxFactors.hasVector()) 7649 return VectorizationFactor::Disabled(); 7650 7651 // Select the optimal vectorization factor. 7652 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 7653 7654 // Check if it is profitable to vectorize with runtime checks. 
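  // Sketch of the policy below (the thresholds come from command-line
  // options; the numbers here are only illustrative): if the loop needs,
  // say, 40 runtime pointer checks and the default threshold is 8, the loop
  // is only vectorized when the loop hints allow reordering; if a pragma
  // threshold of, say, 128 were exceeded as well, vectorization would be
  // abandoned unconditionally.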
7655 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7656 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 7657 bool PragmaThresholdReached = 7658 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7659 bool ThresholdReached = 7660 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 7661 if ((ThresholdReached && !Hints.allowReordering()) || 7662 PragmaThresholdReached) { 7663 ORE->emit([&]() { 7664 return OptimizationRemarkAnalysisAliasing( 7665 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 7666 OrigLoop->getHeader()) 7667 << "loop not vectorized: cannot prove it is safe to reorder " 7668 "memory operations"; 7669 }); 7670 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 7671 Hints.emitRemarkWithHints(); 7672 return VectorizationFactor::Disabled(); 7673 } 7674 } 7675 return SelectedVF; 7676 } 7677 7678 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7679 assert(count_if(VPlans, 7680 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7681 1 && 7682 "Best VF has not a single VPlan."); 7683 7684 for (const VPlanPtr &Plan : VPlans) { 7685 if (Plan->hasVF(VF)) 7686 return *Plan.get(); 7687 } 7688 llvm_unreachable("No plan found!"); 7689 } 7690 7691 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7692 SmallVector<Metadata *, 4> MDs; 7693 // Reserve first location for self reference to the LoopID metadata node. 7694 MDs.push_back(nullptr); 7695 bool IsUnrollMetadata = false; 7696 MDNode *LoopID = L->getLoopID(); 7697 if (LoopID) { 7698 // First find existing loop unrolling disable metadata. 7699 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7700 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7701 if (MD) { 7702 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7703 IsUnrollMetadata = 7704 S && S->getString().startswith("llvm.loop.unroll.disable"); 7705 } 7706 MDs.push_back(LoopID->getOperand(i)); 7707 } 7708 } 7709 7710 if (!IsUnrollMetadata) { 7711 // Add runtime unroll disable metadata. 7712 LLVMContext &Context = L->getHeader()->getContext(); 7713 SmallVector<Metadata *, 1> DisableOperands; 7714 DisableOperands.push_back( 7715 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7716 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7717 MDs.push_back(DisableNode); 7718 MDNode *NewLoopID = MDNode::get(Context, MDs); 7719 // Set operand 0 to refer to the loop id itself. 7720 NewLoopID->replaceOperandWith(0, NewLoopID); 7721 L->setLoopID(NewLoopID); 7722 } 7723 } 7724 7725 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7726 VPlan &BestVPlan, 7727 InnerLoopVectorizer &ILV, 7728 DominatorTree *DT) { 7729 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7730 << '\n'); 7731 7732 // Perform the actual loop transformation. 7733 7734 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7735 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7736 Value *CanonicalIVStartValue; 7737 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7738 ILV.createVectorizedLoopSkeleton(); 7739 ILV.collectPoisonGeneratingRecipes(State); 7740 7741 ILV.printDebugTracesAtStart(); 7742 7743 //===------------------------------------------------===// 7744 // 7745 // Notice: any optimization or new instruction that go 7746 // into the code below should also be implemented in 7747 // the cost-model. 
7748 // 7749 //===------------------------------------------------===// 7750 7751 // 2. Copy and widen instructions from the old loop into the new loop. 7752 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), 7753 ILV.getOrCreateVectorTripCount(nullptr), 7754 CanonicalIVStartValue, State); 7755 BestVPlan.execute(&State); 7756 7757 // Keep all loop hints from the original loop on the vector loop (we'll 7758 // replace the vectorizer-specific hints below). 7759 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7760 7761 Optional<MDNode *> VectorizedLoopID = 7762 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7763 LLVMLoopVectorizeFollowupVectorized}); 7764 7765 Loop *L = LI->getLoopFor(State.CFG.PrevBB); 7766 if (VectorizedLoopID.hasValue()) 7767 L->setLoopID(VectorizedLoopID.getValue()); 7768 else { 7769 // Keep all loop hints from the original loop on the vector loop (we'll 7770 // replace the vectorizer-specific hints below). 7771 if (MDNode *LID = OrigLoop->getLoopID()) 7772 L->setLoopID(LID); 7773 7774 LoopVectorizeHints Hints(L, true, *ORE); 7775 Hints.setAlreadyVectorized(); 7776 } 7777 // Disable runtime unrolling when vectorizing the epilogue loop. 7778 if (CanonicalIVStartValue) 7779 AddRuntimeUnrollDisableMetaData(L); 7780 7781 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7782 // predication, updating analyses. 7783 ILV.fixVectorizedLoop(State); 7784 7785 ILV.printDebugTracesAtEnd(); 7786 } 7787 7788 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7789 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7790 for (const auto &Plan : VPlans) 7791 if (PrintVPlansInDotFormat) 7792 Plan->printDOT(O); 7793 else 7794 Plan->print(O); 7795 } 7796 #endif 7797 7798 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7799 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7800 7801 // We create new control-flow for the vectorized loop, so the original exit 7802 // conditions will be dead after vectorization if it's only used by the 7803 // terminator 7804 SmallVector<BasicBlock*> ExitingBlocks; 7805 OrigLoop->getExitingBlocks(ExitingBlocks); 7806 for (auto *BB : ExitingBlocks) { 7807 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 7808 if (!Cmp || !Cmp->hasOneUse()) 7809 continue; 7810 7811 // TODO: we should introduce a getUniqueExitingBlocks on Loop 7812 if (!DeadInstructions.insert(Cmp).second) 7813 continue; 7814 7815 // The operands of the icmp is often a dead trunc, used by IndUpdate. 7816 // TODO: can recurse through operands in general 7817 for (Value *Op : Cmp->operands()) { 7818 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7819 DeadInstructions.insert(cast<Instruction>(Op)); 7820 } 7821 } 7822 7823 // We create new "steps" for induction variable updates to which the original 7824 // induction variables map. An original update instruction will be dead if 7825 // all its users except the induction variable are dead. 7826 auto *Latch = OrigLoop->getLoopLatch(); 7827 for (auto &Induction : Legal->getInductionVars()) { 7828 PHINode *Ind = Induction.first; 7829 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7830 7831 // If the tail is to be folded by masking, the primary induction variable, 7832 // if exists, isn't dead: it will be used for masking. Don't kill it. 
7833 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7834 continue; 7835 7836 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7837 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7838 })) 7839 DeadInstructions.insert(IndUpdate); 7840 } 7841 } 7842 7843 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7844 7845 //===--------------------------------------------------------------------===// 7846 // EpilogueVectorizerMainLoop 7847 //===--------------------------------------------------------------------===// 7848 7849 /// This function is partially responsible for generating the control flow 7850 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7851 std::pair<BasicBlock *, Value *> 7852 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7853 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7854 Loop *Lp = createVectorLoopSkeleton(""); 7855 7856 // Generate the code to check the minimum iteration count of the vector 7857 // epilogue (see below). 7858 EPI.EpilogueIterationCountCheck = 7859 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 7860 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7861 7862 // Generate the code to check any assumptions that we've made for SCEV 7863 // expressions. 7864 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7865 7866 // Generate the code that checks at runtime if arrays overlap. We put the 7867 // checks into a separate block to make the more common case of few elements 7868 // faster. 7869 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 7870 7871 // Generate the iteration count check for the main loop, *after* the check 7872 // for the epilogue loop, so that the path-length is shorter for the case 7873 // that goes directly through the vector epilogue. The longer-path length for 7874 // the main loop is compensated for, by the gain from vectorizing the larger 7875 // trip count. Note: the branch will get updated later on when we vectorize 7876 // the epilogue. 7877 EPI.MainLoopIterationCountCheck = 7878 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 7879 7880 // Generate the induction variable. 7881 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7882 EPI.VectorTripCount = CountRoundDown; 7883 createHeaderBranch(Lp); 7884 7885 // Skip induction resume value creation here because they will be created in 7886 // the second pass. If we created them here, they wouldn't be used anyway, 7887 // because the vplan in the second pass still contains the inductions from the 7888 // original loop. 
7889 7890 return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; 7891 } 7892 7893 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7894 LLVM_DEBUG({ 7895 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7896 << "Main Loop VF:" << EPI.MainLoopVF 7897 << ", Main Loop UF:" << EPI.MainLoopUF 7898 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7899 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7900 }); 7901 } 7902 7903 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7904 DEBUG_WITH_TYPE(VerboseDebug, { 7905 dbgs() << "intermediate fn:\n" 7906 << *OrigLoop->getHeader()->getParent() << "\n"; 7907 }); 7908 } 7909 7910 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 7911 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 7912 assert(L && "Expected valid Loop."); 7913 assert(Bypass && "Expected valid bypass basic block."); 7914 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7915 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7916 Value *Count = getOrCreateTripCount(L); 7917 // Reuse existing vector loop preheader for TC checks. 7918 // Note that new preheader block is generated for vector loop. 7919 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7920 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7921 7922 // Generate code to check if the loop's trip count is less than VF * UF of the 7923 // main vector loop. 7924 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 7925 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7926 7927 Value *CheckMinIters = Builder.CreateICmp( 7928 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7929 "min.iters.check"); 7930 7931 if (!ForEpilogue) 7932 TCCheckBlock->setName("vector.main.loop.iter.check"); 7933 7934 // Create new preheader for vector loop. 7935 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7936 DT, LI, nullptr, "vector.ph"); 7937 7938 if (ForEpilogue) { 7939 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7940 DT->getNode(Bypass)->getIDom()) && 7941 "TC check is expected to dominate Bypass"); 7942 7943 // Update dominator for Bypass & LoopExit. 7944 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7945 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7946 // For loops with multiple exits, there's no edge from the middle block 7947 // to exit blocks (as the epilogue must run) and thus no need to update 7948 // the immediate dominator of the exit blocks. 7949 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7950 7951 LoopBypassBlocks.push_back(TCCheckBlock); 7952 7953 // Save the trip count so we don't have to regenerate it in the 7954 // vec.epilog.iter.check. This is safe to do because the trip count 7955 // generated here dominates the vector epilog iter check. 7956 EPI.TripCount = Count; 7957 } 7958 7959 ReplaceInstWithInst( 7960 TCCheckBlock->getTerminator(), 7961 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7962 7963 return TCCheckBlock; 7964 } 7965 7966 //===--------------------------------------------------------------------===// 7967 // EpilogueVectorizerEpilogueLoop 7968 //===--------------------------------------------------------------------===// 7969 7970 /// This function is partially responsible for generating the control flow 7971 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
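/// A rough sketch of the blocks involved across both passes (names as
/// created by this pass; the exact edges depend on which runtime checks are
/// emitted):
///   iter.check                   - can the epilogue VF run at all?
///   vector.main.loop.iter.check  - are there enough iterations for the
///                                  main VF * UF?
///   vector.ph / main vector loop - the loop vectorized with the main VF
///   vec.epilog.iter.check        - are enough iterations left for the
///                                  epilogue VF * UF?
///   vec.epilog.ph / epilogue loop- the vectorized epilogue
///   scalar preheader / loop      - the scalar remainder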
7972 std::pair<BasicBlock *, Value *> 7973 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7974 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7975 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 7976 7977 // Now, compare the remaining count and if there aren't enough iterations to 7978 // execute the vectorized epilogue skip to the scalar part. 7979 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7980 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7981 LoopVectorPreHeader = 7982 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7983 LI, nullptr, "vec.epilog.ph"); 7984 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 7985 VecEpilogueIterationCountCheck); 7986 7987 // Adjust the control flow taking the state info from the main loop 7988 // vectorization into account. 7989 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7990 "expected this to be saved from the previous pass."); 7991 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7992 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7993 7994 DT->changeImmediateDominator(LoopVectorPreHeader, 7995 EPI.MainLoopIterationCountCheck); 7996 7997 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7998 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7999 8000 if (EPI.SCEVSafetyCheck) 8001 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8002 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8003 if (EPI.MemSafetyCheck) 8004 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8005 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8006 8007 DT->changeImmediateDominator( 8008 VecEpilogueIterationCountCheck, 8009 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8010 8011 DT->changeImmediateDominator(LoopScalarPreHeader, 8012 EPI.EpilogueIterationCountCheck); 8013 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8014 // If there is an epilogue which must run, there's no edge from the 8015 // middle block to exit blocks and thus no need to update the immediate 8016 // dominator of the exit blocks. 8017 DT->changeImmediateDominator(LoopExitBlock, 8018 EPI.EpilogueIterationCountCheck); 8019 8020 // Keep track of bypass blocks, as they feed start values to the induction 8021 // phis in the scalar loop preheader. 8022 if (EPI.SCEVSafetyCheck) 8023 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8024 if (EPI.MemSafetyCheck) 8025 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8026 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8027 8028 // The vec.epilog.iter.check block may contain Phi nodes from reductions which 8029 // merge control-flow from the latch block and the middle block. Update the 8030 // incoming values here and move the Phi into the preheader. 
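  // For example (illustrative names), a reduction resume phi such as
  //   %bc.merge.rdx = phi i32 [ %rdx.main, %middle.block ], [ 0, %iter.check ]
  // keeps only the incoming values that still reach the new preheader once
  // the bypass edges have been redirected; the other incoming values are
  // removed below and the phi is moved into vec.epilog.ph.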
  SmallVector<PHINode *, 4> PhisInBlock;
  for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
    PhisInBlock.push_back(&Phi);

  for (PHINode *Phi : PhisInBlock) {
    Phi->replaceIncomingBlockWith(
        VecEpilogueIterationCountCheck->getSinglePredecessor(),
        VecEpilogueIterationCountCheck);
    Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
    if (EPI.SCEVSafetyCheck)
      Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
    if (EPI.MemSafetyCheck)
      Phi->removeIncomingValue(EPI.MemSafetyCheck);
    Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
  }

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate the induction variable.
  createHeaderBranch(Lp);

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to the iteration
  // count check, the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck,
                                   EPI.VectorTripCount} /* AdditionalBypass */);

  return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal};
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of
  // the vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8089 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8090 8091 Value *CheckMinIters = 8092 Builder.CreateICmp(P, Count, 8093 createStepForVF(Builder, Count->getType(), 8094 EPI.EpilogueVF, EPI.EpilogueUF), 8095 "min.epilog.iters.check"); 8096 8097 ReplaceInstWithInst( 8098 Insert->getTerminator(), 8099 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8100 8101 LoopBypassBlocks.push_back(Insert); 8102 return Insert; 8103 } 8104 8105 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8106 LLVM_DEBUG({ 8107 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8108 << "Epilogue Loop VF:" << EPI.EpilogueVF 8109 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8110 }); 8111 } 8112 8113 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8114 DEBUG_WITH_TYPE(VerboseDebug, { 8115 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8116 }); 8117 } 8118 8119 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8120 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8121 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8122 bool PredicateAtRangeStart = Predicate(Range.Start); 8123 8124 for (ElementCount TmpVF = Range.Start * 2; 8125 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8126 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8127 Range.End = TmpVF; 8128 break; 8129 } 8130 8131 return PredicateAtRangeStart; 8132 } 8133 8134 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8135 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8136 /// of VF's starting at a given VF and extending it as much as possible. Each 8137 /// vectorization decision can potentially shorten this sub-range during 8138 /// buildVPlan(). 8139 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8140 ElementCount MaxVF) { 8141 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8142 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8143 VFRange SubRange = {VF, MaxVFPlusOne}; 8144 VPlans.push_back(buildVPlan(SubRange)); 8145 VF = SubRange.End; 8146 } 8147 } 8148 8149 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8150 VPlanPtr &Plan) { 8151 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8152 8153 // Look for cached value. 8154 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8155 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8156 if (ECEntryIt != EdgeMaskCache.end()) 8157 return ECEntryIt->second; 8158 8159 VPValue *SrcMask = createBlockInMask(Src, Plan); 8160 8161 // The terminator has to be a branch inst! 8162 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8163 assert(BI && "Unexpected terminator found"); 8164 8165 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8166 return EdgeMaskCache[Edge] = SrcMask; 8167 8168 // If source is an exiting block, we know the exit edge is dynamically dead 8169 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8170 // adding uses of an otherwise potentially dead instruction. 
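  // Sketch of what this produces for a conditional branch
  //   br i1 %c, label %then, label %else
  // (illustrative): the edge into %then is masked by
  // "select(SrcMask, %c, false)" and the edge into %else by
  // "select(SrcMask, not(%c), false)". When Src is an exiting block the exit
  // edge is dynamically dead in the vector loop, so the unrestricted SrcMask
  // is reused directly, as done just below.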
8171 if (OrigLoop->isLoopExiting(Src)) 8172 return EdgeMaskCache[Edge] = SrcMask; 8173 8174 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8175 assert(EdgeMask && "No Edge Mask found for condition"); 8176 8177 if (BI->getSuccessor(0) != Dst) 8178 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8179 8180 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8181 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8182 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8183 // The select version does not introduce new UB if SrcMask is false and 8184 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8185 VPValue *False = Plan->getOrAddVPValue( 8186 ConstantInt::getFalse(BI->getCondition()->getType())); 8187 EdgeMask = 8188 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8189 } 8190 8191 return EdgeMaskCache[Edge] = EdgeMask; 8192 } 8193 8194 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8195 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8196 8197 // Look for cached value. 8198 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8199 if (BCEntryIt != BlockMaskCache.end()) 8200 return BCEntryIt->second; 8201 8202 // All-one mask is modelled as no-mask following the convention for masked 8203 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8204 VPValue *BlockMask = nullptr; 8205 8206 if (OrigLoop->getHeader() == BB) { 8207 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8208 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8209 8210 // Introduce the early-exit compare IV <= BTC to form header block mask. 8211 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8212 // constructing the desired canonical IV in the header block as its first 8213 // non-phi instructions. 8214 assert(CM.foldTailByMasking() && "must fold the tail"); 8215 VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); 8216 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8217 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); 8218 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); 8219 8220 VPBuilder::InsertPointGuard Guard(Builder); 8221 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8222 if (CM.TTI.emitGetActiveLaneMask()) { 8223 VPValue *TC = Plan->getOrCreateTripCount(); 8224 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); 8225 } else { 8226 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8227 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8228 } 8229 return BlockMaskCache[BB] = BlockMask; 8230 } 8231 8232 // This is the block mask. We OR all incoming edges. 8233 for (auto *Predecessor : predecessors(BB)) { 8234 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8235 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8236 return BlockMaskCache[BB] = EdgeMask; 8237 8238 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8239 BlockMask = EdgeMask; 8240 continue; 8241 } 8242 8243 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8244 } 8245 8246 return BlockMaskCache[BB] = BlockMask; 8247 } 8248 8249 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8250 ArrayRef<VPValue *> Operands, 8251 VFRange &Range, 8252 VPlanPtr &Plan) { 8253 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8254 "Must be called with either a load or store"); 8255 8256 auto willWiden = [&](ElementCount VF) -> bool { 8257 if (VF.isScalar()) 8258 return false; 8259 LoopVectorizationCostModel::InstWidening Decision = 8260 CM.getWideningDecision(I, VF); 8261 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8262 "CM decision should be taken at this point."); 8263 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8264 return true; 8265 if (CM.isScalarAfterVectorization(I, VF) || 8266 CM.isProfitableToScalarize(I, VF)) 8267 return false; 8268 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8269 }; 8270 8271 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8272 return nullptr; 8273 8274 VPValue *Mask = nullptr; 8275 if (Legal->isMaskRequired(I)) 8276 Mask = createBlockInMask(I->getParent(), Plan); 8277 8278 // Determine if the pointer operand of the access is either consecutive or 8279 // reverse consecutive. 8280 LoopVectorizationCostModel::InstWidening Decision = 8281 CM.getWideningDecision(I, Range.Start); 8282 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8283 bool Consecutive = 8284 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8285 8286 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8287 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8288 Consecutive, Reverse); 8289 8290 StoreInst *Store = cast<StoreInst>(I); 8291 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8292 Mask, Consecutive, Reverse); 8293 } 8294 8295 static VPWidenIntOrFpInductionRecipe * 8296 createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc, 8297 VPValue *Start, const InductionDescriptor &IndDesc, 8298 LoopVectorizationCostModel &CM, ScalarEvolution &SE, 8299 Loop &OrigLoop, VFRange &Range) { 8300 // Returns true if an instruction \p I should be scalarized instead of 8301 // vectorized for the chosen vectorization factor. 8302 auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { 8303 return CM.isScalarAfterVectorization(I, VF) || 8304 CM.isProfitableToScalarize(I, VF); 8305 }; 8306 8307 bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange( 8308 [&](ElementCount VF) { 8309 // Returns true if we should generate a scalar version of \p IV. 
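        // (E.g. a scalar IV is needed when the IV's only in-loop user is a
        // GEP that itself stays scalar after vectorization; a vector IV may
        // still be generated for other users. This is a sketch, not an
        // exhaustive rule.)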
8310 if (ShouldScalarizeInstruction(PhiOrTrunc, VF)) 8311 return true; 8312 auto isScalarInst = [&](User *U) -> bool { 8313 auto *I = cast<Instruction>(U); 8314 return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF); 8315 }; 8316 return any_of(PhiOrTrunc->users(), isScalarInst); 8317 }, 8318 Range); 8319 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( 8320 [&](ElementCount VF) { 8321 return ShouldScalarizeInstruction(PhiOrTrunc, VF); 8322 }, 8323 Range); 8324 assert(IndDesc.getStartValue() == 8325 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8326 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && 8327 "step must be loop invariant"); 8328 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8329 return new VPWidenIntOrFpInductionRecipe( 8330 Phi, Start, IndDesc, TruncI, NeedsScalarIV, !NeedsScalarIVOnly, SE); 8331 } 8332 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8333 return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV, 8334 !NeedsScalarIVOnly, SE); 8335 } 8336 8337 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI( 8338 PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const { 8339 8340 // Check if this is an integer or fp induction. If so, build the recipe that 8341 // produces its scalar and vector values. 8342 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8343 return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM, 8344 *PSE.getSE(), *OrigLoop, Range); 8345 8346 return nullptr; 8347 } 8348 8349 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8350 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8351 VPlan &Plan) const { 8352 // Optimize the special case where the source is a constant integer 8353 // induction variable. Notice that we can only optimize the 'trunc' case 8354 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8355 // (c) other casts depend on pointer size. 8356 8357 // Determine whether \p K is a truncation based on an induction variable that 8358 // can be optimized. 8359 auto isOptimizableIVTruncate = 8360 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8361 return [=](ElementCount VF) -> bool { 8362 return CM.isOptimizableIVTruncate(K, VF); 8363 }; 8364 }; 8365 8366 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8367 isOptimizableIVTruncate(I), Range)) { 8368 8369 auto *Phi = cast<PHINode>(I->getOperand(0)); 8370 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8371 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8372 return createWidenInductionRecipe(Phi, I, Start, II, CM, *PSE.getSE(), 8373 *OrigLoop, Range); 8374 } 8375 return nullptr; 8376 } 8377 8378 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8379 ArrayRef<VPValue *> Operands, 8380 VPlanPtr &Plan) { 8381 // If all incoming values are equal, the incoming VPValue can be used directly 8382 // instead of creating a new VPBlendRecipe. 8383 VPValue *FirstIncoming = Operands[0]; 8384 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8385 return FirstIncoming == Inc; 8386 })) { 8387 return Operands[0]; 8388 } 8389 8390 unsigned NumIncoming = Phi->getNumIncomingValues(); 8391 // For in-loop reductions, we do not need to create an additional select. 
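// A minimal sketch (assumed semantics, hypothetical names) of the
// getDecisionAndClampRange pattern used by the recipe-construction helpers
// above and below: a predicate is evaluated at Range.Start, and Range.End is
// clamped down to the first VF that would give a different answer, so one
// decision holds for every VF in the remaining range.
#include <functional>

namespace lv_vfrange_sketch {
// Simplified stand-in for VFRange: fixed-width VFs only, enumerated by
// doubling from Start (inclusive) up to End (exclusive).
struct SketchVFRange {
  unsigned Start;
  unsigned End;
};

bool getDecisionAndClampRangeSketch(
    const std::function<bool(unsigned)> &Predicate, SketchVFRange &Range) {
  bool PredicateAtStart = Predicate(Range.Start);
  for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
    if (Predicate(VF) != PredicateAtStart) {
      Range.End = VF; // VFs in [VF, old End) are handled by a later plan.
      break;
    }
  return PredicateAtStart;
}
} // namespace lv_vfrange_sketch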
8392 VPValue *InLoopVal = nullptr; 8393 for (unsigned In = 0; In < NumIncoming; In++) { 8394 PHINode *PhiOp = 8395 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue()); 8396 if (PhiOp && CM.isInLoopReduction(PhiOp)) { 8397 assert(!InLoopVal && "Found more than one in-loop reduction!"); 8398 InLoopVal = Operands[In]; 8399 } 8400 } 8401 8402 assert((!InLoopVal || NumIncoming == 2) && 8403 "Found an in-loop reduction for PHI with unexpected number of " 8404 "incoming values"); 8405 if (InLoopVal) 8406 return Operands[Operands[0] == InLoopVal ? 1 : 0]; 8407 8408 // We know that all PHIs in non-header blocks are converted into selects, so 8409 // we don't have to worry about the insertion order and we can just use the 8410 // builder. At this point we generate the predication tree. There may be 8411 // duplications since this is a simple recursive scan, but future 8412 // optimizations will clean it up. 8413 SmallVector<VPValue *, 2> OperandsWithMask; 8414 8415 for (unsigned In = 0; In < NumIncoming; In++) { 8416 VPValue *EdgeMask = 8417 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8418 assert((EdgeMask || NumIncoming == 1) && 8419 "Multiple predecessors with one having a full mask"); 8420 OperandsWithMask.push_back(Operands[In]); 8421 if (EdgeMask) 8422 OperandsWithMask.push_back(EdgeMask); 8423 } 8424 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8425 } 8426 8427 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8428 ArrayRef<VPValue *> Operands, 8429 VFRange &Range) const { 8430 8431 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8432 [this, CI](ElementCount VF) { 8433 return CM.isScalarWithPredication(CI, VF); 8434 }, 8435 Range); 8436 8437 if (IsPredicated) 8438 return nullptr; 8439 8440 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8441 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8442 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8443 ID == Intrinsic::pseudoprobe || 8444 ID == Intrinsic::experimental_noalias_scope_decl)) 8445 return nullptr; 8446 8447 auto willWiden = [&](ElementCount VF) -> bool { 8448 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8449 // The following case may be scalarized depending on the VF. 8450 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8451 // version of the instruction. 8452 // Is it beneficial to perform intrinsic call compared to lib call? 8453 bool NeedToScalarize = false; 8454 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8455 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; 8456 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8457 return UseVectorIntrinsic || !NeedToScalarize; 8458 }; 8459 8460 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8461 return nullptr; 8462 8463 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8464 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8465 } 8466 8467 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8468 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8469 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8470 // Instruction should be widened, unless it is scalar after vectorization, 8471 // scalarization is profitable or it is predicated. 
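// Sketch of the decision made by tryToWidenCall's willWiden lambda above
// (hypothetical standalone form, assumed cost inputs): prefer a vector
// intrinsic when one exists and is no more expensive than a vectorized
// library call, and otherwise widen only if the call would not have to be
// scalarized anyway.
namespace lv_call_widen_sketch {
bool willWidenCall(bool HasVectorIntrinsic, unsigned IntrinsicCost,
                   unsigned CallCost, bool CallNeedsScalarization) {
  bool UseVectorIntrinsic = HasVectorIntrinsic && IntrinsicCost <= CallCost;
  // Widen if the intrinsic wins, or if a genuine vector library call exists.
  return UseVectorIntrinsic || !CallNeedsScalarization;
}
} // namespace lv_call_widen_sketch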
8472 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8473 return CM.isScalarAfterVectorization(I, VF) || 8474 CM.isProfitableToScalarize(I, VF) || 8475 CM.isScalarWithPredication(I, VF); 8476 }; 8477 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8478 Range); 8479 } 8480 8481 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8482 ArrayRef<VPValue *> Operands) const { 8483 auto IsVectorizableOpcode = [](unsigned Opcode) { 8484 switch (Opcode) { 8485 case Instruction::Add: 8486 case Instruction::And: 8487 case Instruction::AShr: 8488 case Instruction::BitCast: 8489 case Instruction::FAdd: 8490 case Instruction::FCmp: 8491 case Instruction::FDiv: 8492 case Instruction::FMul: 8493 case Instruction::FNeg: 8494 case Instruction::FPExt: 8495 case Instruction::FPToSI: 8496 case Instruction::FPToUI: 8497 case Instruction::FPTrunc: 8498 case Instruction::FRem: 8499 case Instruction::FSub: 8500 case Instruction::ICmp: 8501 case Instruction::IntToPtr: 8502 case Instruction::LShr: 8503 case Instruction::Mul: 8504 case Instruction::Or: 8505 case Instruction::PtrToInt: 8506 case Instruction::SDiv: 8507 case Instruction::Select: 8508 case Instruction::SExt: 8509 case Instruction::Shl: 8510 case Instruction::SIToFP: 8511 case Instruction::SRem: 8512 case Instruction::Sub: 8513 case Instruction::Trunc: 8514 case Instruction::UDiv: 8515 case Instruction::UIToFP: 8516 case Instruction::URem: 8517 case Instruction::Xor: 8518 case Instruction::ZExt: 8519 return true; 8520 } 8521 return false; 8522 }; 8523 8524 if (!IsVectorizableOpcode(I->getOpcode())) 8525 return nullptr; 8526 8527 // Success: widen this instruction. 8528 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8529 } 8530 8531 void VPRecipeBuilder::fixHeaderPhis() { 8532 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8533 for (VPHeaderPHIRecipe *R : PhisToFix) { 8534 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8535 VPRecipeBase *IncR = 8536 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8537 R->addOperand(IncR->getVPSingleValue()); 8538 } 8539 } 8540 8541 VPBasicBlock *VPRecipeBuilder::handleReplication( 8542 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8543 VPlanPtr &Plan) { 8544 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8545 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8546 Range); 8547 8548 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8549 [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, 8550 Range); 8551 8552 // Even if the instruction is not marked as uniform, there are certain 8553 // intrinsic calls that can be effectively treated as such, so we check for 8554 // them here. Conservatively, we only do this for scalable vectors, since 8555 // for fixed-width VFs we can always fall back on full scalarization. 8556 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8557 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8558 case Intrinsic::assume: 8559 case Intrinsic::lifetime_start: 8560 case Intrinsic::lifetime_end: 8561 // For scalable vectors if one of the operands is variant then we still 8562 // want to mark as uniform, which will generate one instruction for just 8563 // the first lane of the vector. We can't scalarize the call in the same 8564 // way as for fixed-width vectors because we don't know how many lanes 8565 // there are. 
      //
      // The reasons for doing it this way for scalable vectors are:
      //  1. For the assume intrinsic generating the instruction for the first
      //     lane is still better than not generating any at all. For
      //     example, the input may be a splat across all lanes.
      //  2. For the lifetime start/end intrinsics the pointer operand only
      //     does anything useful when the input comes from a stack object,
      //     which suggests it should always be uniform. For non-stack objects
      //     the effect is to poison the object, which still allows us to
      //     remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }

  auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                       IsUniform, IsPredicated);
  setRecipe(I, Recipe);
  Plan->addVPValue(I, Recipe);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (VPValue *Op : Recipe->operands()) {
    auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
    if (!PredR)
      continue;
    auto *RepR =
        cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
    assert(RepR->isPredicated() &&
           "expected Replicate recipe to be predicated");
    RepR->setAlsoPack(false);
  }

  // Finalize the recipe for Instr, first if it is not predicated.
  if (!IsPredicated) {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
    VPBB->appendRecipe(Recipe);
    return VPBB;
  }
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");

  VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
  assert(SingleSucc && "VPBB must have a single successor when handling "
                       "predicated replication.");
  VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
  // Record predicated instructions for above packing optimizations.
  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
  VPBlockUtils::insertBlockAfter(Region, VPBB);
  auto *RegSucc = new VPBasicBlock();
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
  VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
  return RegSucc;
}

VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects.

  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  auto *PHIRecipe = Instr->getType()->isVoidTy()
                        ?
nullptr 8639 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8640 if (PHIRecipe) { 8641 Plan->removeVPValueFor(Instr); 8642 Plan->addVPValue(Instr, PHIRecipe); 8643 } 8644 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8645 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8646 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8647 8648 // Note: first set Entry as region entry and then connect successors starting 8649 // from it in order, to propagate the "parent" of each VPBasicBlock. 8650 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8651 VPBlockUtils::connectBlocks(Pred, Exit); 8652 8653 return Region; 8654 } 8655 8656 VPRecipeOrVPValueTy 8657 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8658 ArrayRef<VPValue *> Operands, 8659 VFRange &Range, VPlanPtr &Plan) { 8660 // First, check for specific widening recipes that deal with calls, memory 8661 // operations, inductions and Phi nodes. 8662 if (auto *CI = dyn_cast<CallInst>(Instr)) 8663 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8664 8665 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8666 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8667 8668 VPRecipeBase *Recipe; 8669 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8670 if (Phi->getParent() != OrigLoop->getHeader()) 8671 return tryToBlend(Phi, Operands, Plan); 8672 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) 8673 return toVPRecipeResult(Recipe); 8674 8675 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8676 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 8677 VPValue *StartV = Operands[0]; 8678 if (Legal->isReductionVariable(Phi)) { 8679 const RecurrenceDescriptor &RdxDesc = 8680 Legal->getReductionVars().find(Phi)->second; 8681 assert(RdxDesc.getRecurrenceStartValue() == 8682 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8683 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8684 CM.isInLoopReduction(Phi), 8685 CM.useOrderedReductions(RdxDesc)); 8686 } else { 8687 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8688 } 8689 8690 // Record the incoming value from the backedge, so we can add the incoming 8691 // value from the backedge after all recipes have been created. 8692 recordRecipeOf(cast<Instruction>( 8693 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8694 PhisToFix.push_back(PhiRecipe); 8695 } else { 8696 // TODO: record backedge value for remaining pointer induction phis. 
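// Conceptually, the replicate region built by createReplicateRegion above
// executes the predicated instruction one lane at a time under its block
// mask; a per-lane model (hypothetical standalone sketch, not LLVM API) of
// what the pred.<opcode>.entry/.if/.continue blocks do:
#include <cstddef>
#include <optional>
#include <vector>

namespace lv_replicate_region_sketch {
// Run 'ScalarOp' only for lanes whose mask bit is set; the ".continue" phi
// merges the produced value with "no value" for masked-off lanes, mirroring
// VPPredInstPHIRecipe.
template <typename Fn>
std::vector<std::optional<int>>
runReplicateRegion(const std::vector<bool> &Mask, Fn ScalarOp) {
  std::vector<std::optional<int>> Result(Mask.size());
  for (size_t Lane = 0; Lane < Mask.size(); ++Lane) {
    // pred.op.entry: branch on the lane's mask bit.
    if (Mask[Lane])
      Result[Lane] = ScalarOp(Lane); // pred.op.if: the replicated instruction.
    // pred.op.continue: phi of the result (or no value if masked off).
  }
  return Result;
}
} // namespace lv_replicate_region_sketch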
      assert(Phi->getType()->isPointerTy() &&
             "only pointer phis should be handled here");
      assert(Legal->getInductionVars().count(Phi) &&
             "Not an induction variable");
      InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
      VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
      PhiRecipe = new VPWidenPHIRecipe(Phi, Start);
    }

    return toVPRecipeResult(PhiRecipe);
  }

  if (isa<TruncInst>(Instr) &&
      (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
                                               Range, *Plan)))
    return toVPRecipeResult(Recipe);

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return toVPRecipeResult(new VPWidenGEPRecipe(
        GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return toVPRecipeResult(new VPWidenSelectRecipe(
        *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
  }

  return toVPRecipeResult(tryToWiden(Instr, Operands));
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  // Cannot sink instructions after dead instructions (there won't be any
  // recipes for them). Instead, find the first non-dead previous instruction.
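// The loop that follows walks the sink target backwards past dead
// instructions; a standalone model of that adjustment (hypothetical types,
// instructions identified by their index in the block):
#include <cassert>
#include <cstddef>
#include <set>

namespace lv_sinkafter_sketch {
// Walk backwards from 'TargetIdx' until a live instruction is found.
size_t firstLiveAtOrBefore(const std::set<size_t> &DeadInstructions,
                           size_t TargetIdx) {
  while (DeadInstructions.count(TargetIdx)) {
    assert(TargetIdx > 0 && "must find a live instruction in the block");
    --TargetIdx;
  }
  return TargetIdx;
}
} // namespace lv_sinkafter_sketch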
8758 for (auto &P : Legal->getSinkAfter()) { 8759 Instruction *SinkTarget = P.second; 8760 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 8761 (void)FirstInst; 8762 while (DeadInstructions.contains(SinkTarget)) { 8763 assert( 8764 SinkTarget != FirstInst && 8765 "Must find a live instruction (at least the one feeding the " 8766 "first-order recurrence PHI) before reaching beginning of the block"); 8767 SinkTarget = SinkTarget->getPrevNode(); 8768 assert(SinkTarget != P.first && 8769 "sink source equals target, no sinking required"); 8770 } 8771 P.second = SinkTarget; 8772 } 8773 8774 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8775 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8776 VFRange SubRange = {VF, MaxVFPlusOne}; 8777 VPlans.push_back( 8778 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8779 VF = SubRange.End; 8780 } 8781 } 8782 8783 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a 8784 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a 8785 // BranchOnCount VPInstruction to the latch. 8786 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, 8787 bool HasNUW, bool IsVPlanNative) { 8788 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8789 auto *StartV = Plan.getOrAddVPValue(StartIdx); 8790 8791 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8792 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8793 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8794 if (IsVPlanNative) 8795 Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); 8796 Header->insert(CanonicalIVPHI, Header->begin()); 8797 8798 auto *CanonicalIVIncrement = 8799 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW 8800 : VPInstruction::CanonicalIVIncrement, 8801 {CanonicalIVPHI}, DL); 8802 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8803 8804 VPBasicBlock *EB = TopRegion->getExitBasicBlock(); 8805 if (IsVPlanNative) { 8806 EB = cast<VPBasicBlock>(EB->getSinglePredecessor()); 8807 EB->setCondBit(nullptr); 8808 } 8809 EB->appendRecipe(CanonicalIVIncrement); 8810 8811 auto *BranchOnCount = 8812 new VPInstruction(VPInstruction::BranchOnCount, 8813 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8814 EB->appendRecipe(BranchOnCount); 8815 } 8816 8817 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8818 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8819 const MapVector<Instruction *, Instruction *> &SinkAfter) { 8820 8821 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8822 8823 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8824 8825 // --------------------------------------------------------------------------- 8826 // Pre-construction: record ingredients whose recipes we'll need to further 8827 // process after constructing the initial VPlan. 8828 // --------------------------------------------------------------------------- 8829 8830 // Mark instructions we'll need to sink later and their targets as 8831 // ingredients whose recipe we'll need to record. 
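// A scalar model (assumptions: fixed VF and UF, hypothetical names) of the
// loop control that addCanonicalIVRecipes above sets up: a canonical IV
// starting at 0, incremented by VF * UF once per vector iteration, with the
// latch branching on reaching the vector trip count.
#include <cstdint>

namespace lv_canonical_iv_sketch {
// Runs Body once per vector iteration. VectorTripCount is assumed to be a
// non-zero multiple of VF * UF (the scalar epilogue handles any remainder),
// matching the fact that the vector loop is only entered if it runs at least
// one full vector iteration.
template <typename Fn>
void vectorLoop(uint64_t VectorTripCount, unsigned VF, unsigned UF, Fn Body) {
  uint64_t CanonicalIV = 0; // VPCanonicalIVPHIRecipe, starts at 0.
  for (;;) {
    Body(CanonicalIV);
    CanonicalIV += uint64_t(VF) * UF;   // CanonicalIVIncrement{,NUW}.
    if (CanonicalIV == VectorTripCount) // BranchOnCount.
      break;
  }
}
} // namespace lv_canonical_iv_sketch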
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurKind Kind =
        Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
             "Only min/max recurrences allowed for inloop reductions");
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto applyIG = [IG, this](ElementCount VF) -> bool {
      return (VF.isVector() && // Query is illegal for VF == 1
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
    if (!getDecisionAndClampRange(applyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
    for (unsigned i = 0; i < IG->getFactor(); i++)
      if (Instruction *Member = IG->getMember(i))
        RecipeBuilder.recordRecipeOf(Member);
  }

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  // Create initial VPlan skeleton, with separate header and latch blocks.
  VPBasicBlock *HeaderVPBB = new VPBasicBlock();
  VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
  VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
  auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
  auto Plan = std::make_unique<VPlan>(TopRegion);

  Instruction *DLInst =
      getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
                        DLInst ? DLInst->getDebugLoc() : DebugLoc(),
                        !CM.foldTailByMasking(), false);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  VPBasicBlock *VPBB = HeaderVPBB;
  SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    VPBB->setName(BB->getName());
    Builder.setInsertPoint(VPBB);

    // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
8906 for (Instruction &I : BB->instructionsWithoutDebug()) { 8907 Instruction *Instr = &I; 8908 8909 // First filter out irrelevant instructions, to ensure no recipes are 8910 // built for them. 8911 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 8912 continue; 8913 8914 SmallVector<VPValue *, 4> Operands; 8915 auto *Phi = dyn_cast<PHINode>(Instr); 8916 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 8917 Operands.push_back(Plan->getOrAddVPValue( 8918 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 8919 } else { 8920 auto OpRange = Plan->mapToVPValues(Instr->operands()); 8921 Operands = {OpRange.begin(), OpRange.end()}; 8922 } 8923 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 8924 Instr, Operands, Range, Plan)) { 8925 // If Instr can be simplified to an existing VPValue, use it. 8926 if (RecipeOrValue.is<VPValue *>()) { 8927 auto *VPV = RecipeOrValue.get<VPValue *>(); 8928 Plan->addVPValue(Instr, VPV); 8929 // If the re-used value is a recipe, register the recipe for the 8930 // instruction, in case the recipe for Instr needs to be recorded. 8931 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 8932 RecipeBuilder.setRecipe(Instr, R); 8933 continue; 8934 } 8935 // Otherwise, add the new recipe. 8936 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 8937 for (auto *Def : Recipe->definedValues()) { 8938 auto *UV = Def->getUnderlyingValue(); 8939 Plan->addVPValue(UV, Def); 8940 } 8941 8942 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 8943 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 8944 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 8945 // of the header block. That can happen for truncates of induction 8946 // variables. Those recipes are moved to the phi section of the header 8947 // block after applying SinkAfter, which relies on the original 8948 // position of the trunc. 8949 assert(isa<TruncInst>(Instr)); 8950 InductionsToMove.push_back( 8951 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 8952 } 8953 RecipeBuilder.setRecipe(Instr, Recipe); 8954 VPBB->appendRecipe(Recipe); 8955 continue; 8956 } 8957 8958 // Otherwise, if all widening options failed, Instruction is to be 8959 // replicated. This may create a successor for VPBB. 8960 VPBasicBlock *NextVPBB = 8961 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 8962 if (NextVPBB != VPBB) { 8963 VPBB = NextVPBB; 8964 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 8965 : ""); 8966 } 8967 } 8968 8969 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 8970 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 8971 } 8972 8973 // Fold the last, empty block into its predecessor. 8974 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 8975 assert(VPBB && "expected to fold last (empty) block"); 8976 // After here, VPBB should not be used. 8977 VPBB = nullptr; 8978 8979 assert(isa<VPRegionBlock>(Plan->getEntry()) && 8980 !Plan->getEntry()->getEntryBasicBlock()->empty() && 8981 "entry block must be set to a VPRegionBlock having a non-empty entry " 8982 "VPBasicBlock"); 8983 RecipeBuilder.fixHeaderPhis(); 8984 8985 // --------------------------------------------------------------------------- 8986 // Transform initial VPlan: Apply previously taken decisions, in order, to 8987 // bring the VPlan to its final state. 8988 // --------------------------------------------------------------------------- 8989 8990 // Apply Sink-After legal constraints. 
8991 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 8992 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 8993 if (Region && Region->isReplicator()) { 8994 assert(Region->getNumSuccessors() == 1 && 8995 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 8996 assert(R->getParent()->size() == 1 && 8997 "A recipe in an original replicator region must be the only " 8998 "recipe in its block"); 8999 return Region; 9000 } 9001 return nullptr; 9002 }; 9003 for (auto &Entry : SinkAfter) { 9004 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9005 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9006 9007 auto *TargetRegion = GetReplicateRegion(Target); 9008 auto *SinkRegion = GetReplicateRegion(Sink); 9009 if (!SinkRegion) { 9010 // If the sink source is not a replicate region, sink the recipe directly. 9011 if (TargetRegion) { 9012 // The target is in a replication region, make sure to move Sink to 9013 // the block after it, not into the replication region itself. 9014 VPBasicBlock *NextBlock = 9015 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9016 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9017 } else 9018 Sink->moveAfter(Target); 9019 continue; 9020 } 9021 9022 // The sink source is in a replicate region. Unhook the region from the CFG. 9023 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9024 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9025 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9026 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9027 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9028 9029 if (TargetRegion) { 9030 // The target recipe is also in a replicate region, move the sink region 9031 // after the target region. 9032 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9033 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9034 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9035 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9036 } else { 9037 // The sink source is in a replicate region, we need to move the whole 9038 // replicate region, which should only contain a single recipe in the 9039 // main block. 9040 auto *SplitBlock = 9041 Target->getParent()->splitAt(std::next(Target->getIterator())); 9042 9043 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9044 9045 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9046 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9047 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9048 } 9049 } 9050 9051 VPlanTransforms::removeRedundantCanonicalIVs(*Plan); 9052 VPlanTransforms::removeRedundantInductionCasts(*Plan); 9053 9054 // Now that sink-after is done, move induction recipes for optimized truncates 9055 // to the phi section of the header block. 9056 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 9057 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9058 9059 // Adjust the recipes for any inloop reductions. 9060 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan, 9061 RecipeBuilder, Range.Start); 9062 9063 // Introduce a recipe to combine the incoming and previous values of a 9064 // first-order recurrence. 
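// The FirstOrderRecurrenceSplice introduced in the loop below conceptually
// shifts the recurrence by one lane: lane 0 of the result takes the last
// element of the previous vector iteration, and lane i > 0 takes element
// i - 1 of the current one. A hypothetical standalone model (plain C++, not
// the VPInstruction itself):
#include <cstddef>
#include <vector>

namespace lv_for_splice_sketch {
// splice(Prev, Cur)[0]     = Prev.back()
// splice(Prev, Cur)[i > 0] = Cur[i - 1]
// so each lane holds the scalar value from "one iteration ago".
std::vector<int> firstOrderRecurrenceSplice(const std::vector<int> &Prev,
                                            const std::vector<int> &Cur) {
  std::vector<int> Result(Cur.size());
  Result[0] = Prev.back();
  for (size_t I = 1; I < Cur.size(); ++I)
    Result[I] = Cur[I - 1];
  return Result;
}
} // namespace lv_for_splice_sketch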
9065 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9066 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9067 if (!RecurPhi) 9068 continue; 9069 9070 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9071 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 9072 auto *Region = GetReplicateRegion(PrevRecipe); 9073 if (Region) 9074 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9075 if (Region || PrevRecipe->isPhi()) 9076 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 9077 else 9078 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9079 9080 auto *RecurSplice = cast<VPInstruction>( 9081 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9082 {RecurPhi, RecurPhi->getBackedgeValue()})); 9083 9084 RecurPhi->replaceAllUsesWith(RecurSplice); 9085 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9086 // all users. 9087 RecurSplice->setOperand(0, RecurPhi); 9088 } 9089 9090 // Interleave memory: for each Interleave Group we marked earlier as relevant 9091 // for this VPlan, replace the Recipes widening its memory instructions with a 9092 // single VPInterleaveRecipe at its insertion point. 9093 for (auto IG : InterleaveGroups) { 9094 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9095 RecipeBuilder.getRecipe(IG->getInsertPos())); 9096 SmallVector<VPValue *, 4> StoredValues; 9097 for (unsigned i = 0; i < IG->getFactor(); ++i) 9098 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9099 auto *StoreR = 9100 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9101 StoredValues.push_back(StoreR->getStoredValue()); 9102 } 9103 9104 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9105 Recipe->getMask()); 9106 VPIG->insertBefore(Recipe); 9107 unsigned J = 0; 9108 for (unsigned i = 0; i < IG->getFactor(); ++i) 9109 if (Instruction *Member = IG->getMember(i)) { 9110 if (!Member->getType()->isVoidTy()) { 9111 VPValue *OriginalV = Plan->getVPValue(Member); 9112 Plan->removeVPValueFor(Member); 9113 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9114 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9115 J++; 9116 } 9117 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9118 } 9119 } 9120 9121 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9122 // in ways that accessing values using original IR values is incorrect. 9123 Plan->disableValue2VPValue(); 9124 9125 VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); 9126 VPlanTransforms::sinkScalarOperands(*Plan); 9127 VPlanTransforms::mergeReplicateRegions(*Plan); 9128 VPlanTransforms::removeDeadRecipes(*Plan, *OrigLoop); 9129 9130 std::string PlanName; 9131 raw_string_ostream RSO(PlanName); 9132 ElementCount VF = Range.Start; 9133 Plan->addVF(VF); 9134 RSO << "Initial VPlan for VF={" << VF; 9135 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9136 Plan->addVF(VF); 9137 RSO << "," << VF; 9138 } 9139 RSO << "},UF>=1"; 9140 RSO.flush(); 9141 Plan->setName(PlanName); 9142 9143 // Fold Exit block into its predecessor if possible. 9144 // TODO: Fold block earlier once all VPlan transforms properly maintain a 9145 // VPBasicBlock as exit. 
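// Sketch of what the single VPInterleaveRecipe introduced above computes for
// a load group (hypothetical standalone form, factor-F strided data): one
// wide access covers all members, and member j receives the elements found
// at stride F starting at offset j.
#include <vector>

namespace lv_interleave_sketch {
// De-interleave 'Wide' (VF * Factor consecutive elements) into one vector of
// VF elements per interleave-group member.
std::vector<std::vector<int>> deinterleave(const std::vector<int> &Wide,
                                           unsigned Factor) {
  unsigned VF = static_cast<unsigned>(Wide.size()) / Factor;
  std::vector<std::vector<int>> Members(Factor, std::vector<int>(VF));
  for (unsigned I = 0; I < VF; ++I)
    for (unsigned J = 0; J < Factor; ++J)
      Members[J][I] = Wide[I * Factor + J];
  return Members;
}
} // namespace lv_interleave_sketch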
  VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit());

  assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
  return Plan;
}

VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build VPlan
  // upfront in the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
       VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanTransforms::VPInstructionsToVPRecipes(
      OrigLoop, Plan,
      [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
      DeadInstructions, *PSE.getSE());

  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
                        true, true);
  return Plan;
}

// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi needs to be
// converted to reductions, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and live-out recipes when folding the tail.
void LoopVectorizationPlanner::adjustRecipesForReductions(
    VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
    ElementCount MinVF) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    const RecurrenceDescriptor &RdxDesc =
        Legal->getReductionVars().find(Phi)->second;
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
      continue;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
             "Only min/max recurrences allowed for inloop reductions");
      // Recognize a call to the llvm.fmuladd intrinsic.
9222 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 9223 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && 9224 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 9225 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9226 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9227 "Expected to replace a VPWidenSelectSC"); 9228 FirstOpId = 1; 9229 } else { 9230 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) || 9231 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) && 9232 "Expected to replace a VPWidenSC"); 9233 FirstOpId = 0; 9234 } 9235 unsigned VecOpId = 9236 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9237 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9238 9239 auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent()) 9240 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9241 : nullptr; 9242 9243 if (IsFMulAdd) { 9244 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9245 // need to create an fmul recipe to use as the vector operand for the 9246 // fadd reduction. 9247 VPInstruction *FMulRecipe = new VPInstruction( 9248 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9249 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9250 WidenRecipe->getParent()->insert(FMulRecipe, 9251 WidenRecipe->getIterator()); 9252 VecOp = FMulRecipe; 9253 } 9254 VPReductionRecipe *RedRecipe = 9255 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9256 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9257 Plan->removeVPValueFor(R); 9258 Plan->addVPValue(R, RedRecipe); 9259 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9260 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9261 WidenRecipe->eraseFromParent(); 9262 9263 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9264 VPRecipeBase *CompareRecipe = 9265 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9266 assert(isa<VPWidenRecipe>(CompareRecipe) && 9267 "Expected to replace a VPWidenSC"); 9268 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9269 "Expected no remaining users"); 9270 CompareRecipe->eraseFromParent(); 9271 } 9272 Chain = R; 9273 } 9274 } 9275 9276 // If tail is folded by masking, introduce selects between the phi 9277 // and the live-out instruction of each reduction, at the beginning of the 9278 // dedicated latch block. 
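// A simplified scalar model (hypothetical names, add reduction only) of the
// adjustments described here: each vector iteration of an in-loop reduction
// neutralizes masked-off lanes with the reduction's identity, reduces the
// vector operand to a scalar, and folds that scalar into the chain value.
// The llvm.fmuladd split mentioned in the code below is noted as a comment.
#include <cstddef>
#include <cstdint>
#include <vector>

namespace lv_inloop_reduction_sketch {
// One vector iteration of an in-loop add reduction under a block mask.
int64_t reduceStep(int64_t Chain, const std::vector<int64_t> &VecOp,
                   const std::vector<bool> &Mask) {
  int64_t Reduced = 0; // identity of the add recurrence
  for (size_t L = 0; L < VecOp.size(); ++L)
    Reduced += Mask[L] ? VecOp[L] : 0; // select(Mask, VecOp, Identity)
  return Chain + Reduced;
}

// llvm.fmuladd reductions are handled by splitting the intrinsic: the multiply
// becomes an ordinary widened fmul and only the add is reduced, i.e.
// fmuladd(a, b, chain) behaves like chain + reduce(a * b) per vector iteration.
} // namespace lv_inloop_reduction_sketch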
9279 if (CM.foldTailByMasking()) { 9280 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9281 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9282 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9283 if (!PhiR || PhiR->isInLoop()) 9284 continue; 9285 VPValue *Cond = 9286 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9287 VPValue *Red = PhiR->getBackedgeValue(); 9288 assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB && 9289 "reduction recipe must be defined before latch"); 9290 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9291 } 9292 } 9293 } 9294 9295 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9296 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9297 VPSlotTracker &SlotTracker) const { 9298 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9299 IG->getInsertPos()->printAsOperand(O, false); 9300 O << ", "; 9301 getAddr()->printAsOperand(O, SlotTracker); 9302 VPValue *Mask = getMask(); 9303 if (Mask) { 9304 O << ", "; 9305 Mask->printAsOperand(O, SlotTracker); 9306 } 9307 9308 unsigned OpIdx = 0; 9309 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9310 if (!IG->getMember(i)) 9311 continue; 9312 if (getNumStoreOperands() > 0) { 9313 O << "\n" << Indent << " store "; 9314 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9315 O << " to index " << i; 9316 } else { 9317 O << "\n" << Indent << " "; 9318 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9319 O << " = load from index " << i; 9320 } 9321 ++OpIdx; 9322 } 9323 } 9324 #endif 9325 9326 void VPWidenCallRecipe::execute(VPTransformState &State) { 9327 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9328 *this, State); 9329 } 9330 9331 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9332 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9333 State.ILV->setDebugLocFromInst(&I); 9334 9335 // The condition can be loop invariant but still defined inside the 9336 // loop. This means that we can't just use the original 'cond' value. 9337 // We have to take the 'vectorized' value and pick the first lane. 9338 // Instcombine will make this a no-op. 9339 auto *InvarCond = 9340 InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; 9341 9342 for (unsigned Part = 0; Part < State.UF; ++Part) { 9343 Value *Cond = InvarCond ? 
InvarCond : State.get(getOperand(0), Part); 9344 Value *Op0 = State.get(getOperand(1), Part); 9345 Value *Op1 = State.get(getOperand(2), Part); 9346 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 9347 State.set(this, Sel, Part); 9348 State.ILV->addMetadata(Sel, &I); 9349 } 9350 } 9351 9352 void VPWidenRecipe::execute(VPTransformState &State) { 9353 auto &I = *cast<Instruction>(getUnderlyingValue()); 9354 auto &Builder = State.Builder; 9355 switch (I.getOpcode()) { 9356 case Instruction::Call: 9357 case Instruction::Br: 9358 case Instruction::PHI: 9359 case Instruction::GetElementPtr: 9360 case Instruction::Select: 9361 llvm_unreachable("This instruction is handled by a different recipe."); 9362 case Instruction::UDiv: 9363 case Instruction::SDiv: 9364 case Instruction::SRem: 9365 case Instruction::URem: 9366 case Instruction::Add: 9367 case Instruction::FAdd: 9368 case Instruction::Sub: 9369 case Instruction::FSub: 9370 case Instruction::FNeg: 9371 case Instruction::Mul: 9372 case Instruction::FMul: 9373 case Instruction::FDiv: 9374 case Instruction::FRem: 9375 case Instruction::Shl: 9376 case Instruction::LShr: 9377 case Instruction::AShr: 9378 case Instruction::And: 9379 case Instruction::Or: 9380 case Instruction::Xor: { 9381 // Just widen unops and binops. 9382 State.ILV->setDebugLocFromInst(&I); 9383 9384 for (unsigned Part = 0; Part < State.UF; ++Part) { 9385 SmallVector<Value *, 2> Ops; 9386 for (VPValue *VPOp : operands()) 9387 Ops.push_back(State.get(VPOp, Part)); 9388 9389 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 9390 9391 if (auto *VecOp = dyn_cast<Instruction>(V)) { 9392 VecOp->copyIRFlags(&I); 9393 9394 // If the instruction is vectorized and was in a basic block that needed 9395 // predication, we can't propagate poison-generating flags (nuw/nsw, 9396 // exact, etc.). The control flow has been linearized and the 9397 // instruction is no longer guarded by the predicate, which could make 9398 // the flag properties to no longer hold. 9399 if (State.MayGeneratePoisonRecipes.contains(this)) 9400 VecOp->dropPoisonGeneratingFlags(); 9401 } 9402 9403 // Use this vector value for all users of the original instruction. 9404 State.set(this, V, Part); 9405 State.ILV->addMetadata(V, &I); 9406 } 9407 9408 break; 9409 } 9410 case Instruction::ICmp: 9411 case Instruction::FCmp: { 9412 // Widen compares. Generate vector compares. 9413 bool FCmp = (I.getOpcode() == Instruction::FCmp); 9414 auto *Cmp = cast<CmpInst>(&I); 9415 State.ILV->setDebugLocFromInst(Cmp); 9416 for (unsigned Part = 0; Part < State.UF; ++Part) { 9417 Value *A = State.get(getOperand(0), Part); 9418 Value *B = State.get(getOperand(1), Part); 9419 Value *C = nullptr; 9420 if (FCmp) { 9421 // Propagate fast math flags. 
9422 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9423 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9424 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9425 } else { 9426 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9427 } 9428 State.set(this, C, Part); 9429 State.ILV->addMetadata(C, &I); 9430 } 9431 9432 break; 9433 } 9434 9435 case Instruction::ZExt: 9436 case Instruction::SExt: 9437 case Instruction::FPToUI: 9438 case Instruction::FPToSI: 9439 case Instruction::FPExt: 9440 case Instruction::PtrToInt: 9441 case Instruction::IntToPtr: 9442 case Instruction::SIToFP: 9443 case Instruction::UIToFP: 9444 case Instruction::Trunc: 9445 case Instruction::FPTrunc: 9446 case Instruction::BitCast: { 9447 auto *CI = cast<CastInst>(&I); 9448 State.ILV->setDebugLocFromInst(CI); 9449 9450 /// Vectorize casts. 9451 Type *DestTy = (State.VF.isScalar()) 9452 ? CI->getType() 9453 : VectorType::get(CI->getType(), State.VF); 9454 9455 for (unsigned Part = 0; Part < State.UF; ++Part) { 9456 Value *A = State.get(getOperand(0), Part); 9457 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9458 State.set(this, Cast, Part); 9459 State.ILV->addMetadata(Cast, &I); 9460 } 9461 break; 9462 } 9463 default: 9464 // This instruction is not vectorized by simple widening. 9465 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9466 llvm_unreachable("Unhandled instruction!"); 9467 } // end of switch. 9468 } 9469 9470 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9471 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9472 // Construct a vector GEP by widening the operands of the scalar GEP as 9473 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9474 // results in a vector of pointers when at least one operand of the GEP 9475 // is vector-typed. Thus, to keep the representation compact, we only use 9476 // vector-typed operands for loop-varying values. 9477 9478 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9479 // If we are vectorizing, but the GEP has only loop-invariant operands, 9480 // the GEP we build (by only using vector-typed operands for 9481 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9482 // produce a vector of pointers, we need to either arbitrarily pick an 9483 // operand to broadcast, or broadcast a clone of the original GEP. 9484 // Here, we broadcast a clone of the original. 9485 // 9486 // TODO: If at some point we decide to scalarize instructions having 9487 // loop-invariant operands, this special case will no longer be 9488 // required. We would add the scalarization decision to 9489 // collectLoopScalars() and teach getVectorValue() to broadcast 9490 // the lane-zero scalar value. 9491 auto *Clone = State.Builder.Insert(GEP->clone()); 9492 for (unsigned Part = 0; Part < State.UF; ++Part) { 9493 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9494 State.set(this, EntryPart, Part); 9495 State.ILV->addMetadata(EntryPart, GEP); 9496 } 9497 } else { 9498 // If the GEP has at least one loop-varying operand, we are sure to 9499 // produce a vector of pointers. But if we are only unrolling, we want 9500 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9501 // produce with the code below will be scalar (if VF == 1) or vector 9502 // (otherwise). Note that for the unroll-only case, we still maintain 9503 // values in the vector mapping with initVector, as we do for other 9504 // instructions. 
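// A per-lane model (hypothetical, plain C++) of the widened GEP built in the
// loop below: loop-invariant operands stay scalar and are reused for every
// lane, while loop-varying indices supply one value per lane, so the result
// is a vector of addresses.
#include <cstddef>
#include <vector>

namespace lv_widen_gep_sketch {
// Addresses of elements IndexPerLane[L] of an array of ElemSize-byte elements
// starting at Base; Base is loop-invariant (scalar), the index is loop-varying
// (one value per lane).
std::vector<const char *> widenGEP(const char *Base, size_t ElemSize,
                                   const std::vector<size_t> &IndexPerLane) {
  std::vector<const char *> Addrs(IndexPerLane.size());
  for (size_t L = 0; L < IndexPerLane.size(); ++L)
    Addrs[L] = Base + IndexPerLane[L] * ElemSize;
  return Addrs;
}
} // namespace lv_widen_gep_sketch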
9505 for (unsigned Part = 0; Part < State.UF; ++Part) { 9506 // The pointer operand of the new GEP. If it's loop-invariant, we 9507 // won't broadcast it. 9508 auto *Ptr = IsPtrLoopInvariant 9509 ? State.get(getOperand(0), VPIteration(0, 0)) 9510 : State.get(getOperand(0), Part); 9511 9512 // Collect all the indices for the new GEP. If any index is 9513 // loop-invariant, we won't broadcast it. 9514 SmallVector<Value *, 4> Indices; 9515 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 9516 VPValue *Operand = getOperand(I); 9517 if (IsIndexLoopInvariant[I - 1]) 9518 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 9519 else 9520 Indices.push_back(State.get(Operand, Part)); 9521 } 9522 9523 // If the GEP instruction is vectorized and was in a basic block that 9524 // needed predication, we can't propagate the poison-generating 'inbounds' 9525 // flag. The control flow has been linearized and the GEP is no longer 9526 // guarded by the predicate, which could make the 'inbounds' properties to 9527 // no longer hold. 9528 bool IsInBounds = 9529 GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; 9530 9531 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 9532 // but it should be a vector, otherwise. 9533 auto *NewGEP = IsInBounds 9534 ? State.Builder.CreateInBoundsGEP( 9535 GEP->getSourceElementType(), Ptr, Indices) 9536 : State.Builder.CreateGEP(GEP->getSourceElementType(), 9537 Ptr, Indices); 9538 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 9539 "NewGEP is not a pointer vector"); 9540 State.set(this, NewGEP, Part); 9541 State.ILV->addMetadata(NewGEP, GEP); 9542 } 9543 } 9544 } 9545 9546 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9547 assert(!State.Instance && "Int or FP induction being replicated."); 9548 9549 Value *Start = getStartValue()->getLiveInIRValue(); 9550 const InductionDescriptor &ID = getInductionDescriptor(); 9551 TruncInst *Trunc = getTruncInst(); 9552 IRBuilderBase &Builder = State.Builder; 9553 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 9554 assert(State.VF.isVector() && "must have vector VF"); 9555 9556 // The value from the original loop to which we are mapping the new induction 9557 // variable. 9558 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 9559 9560 auto &DL = EntryVal->getModule()->getDataLayout(); 9561 9562 // Generate code for the induction step. Note that induction steps are 9563 // required to be loop-invariant 9564 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 9565 if (SE.isSCEVable(IV->getType())) { 9566 SCEVExpander Exp(SE, DL, "induction"); 9567 return Exp.expandCodeFor(Step, Step->getType(), 9568 State.CFG.VectorPreHeader->getTerminator()); 9569 } 9570 return cast<SCEVUnknown>(Step)->getValue(); 9571 }; 9572 9573 // Fast-math-flags propagate from the original induction instruction. 9574 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9575 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 9576 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 9577 9578 // Now do the actual transformations, and start with creating the step value. 
9579 Value *Step = CreateStepValue(ID.getStep()); 9580 9581 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 9582 "Expected either an induction phi-node or a truncate of it!"); 9583 9584 // Construct the initial value of the vector IV in the vector loop preheader 9585 auto CurrIP = Builder.saveIP(); 9586 Builder.SetInsertPoint(State.CFG.VectorPreHeader->getTerminator()); 9587 if (isa<TruncInst>(EntryVal)) { 9588 assert(Start->getType()->isIntegerTy() && 9589 "Truncation requires an integer type"); 9590 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 9591 Step = Builder.CreateTrunc(Step, TruncType); 9592 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 9593 } 9594 9595 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 9596 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 9597 Value *SteppedStart = getStepVector( 9598 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); 9599 9600 // We create vector phi nodes for both integer and floating-point induction 9601 // variables. Here, we determine the kind of arithmetic we will perform. 9602 Instruction::BinaryOps AddOp; 9603 Instruction::BinaryOps MulOp; 9604 if (Step->getType()->isIntegerTy()) { 9605 AddOp = Instruction::Add; 9606 MulOp = Instruction::Mul; 9607 } else { 9608 AddOp = ID.getInductionOpcode(); 9609 MulOp = Instruction::FMul; 9610 } 9611 9612 // Multiply the vectorization factor by the step using integer or 9613 // floating-point arithmetic as appropriate. 9614 Type *StepType = Step->getType(); 9615 Value *RuntimeVF; 9616 if (Step->getType()->isFloatingPointTy()) 9617 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 9618 else 9619 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 9620 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 9621 9622 // Create a vector splat to use in the induction update. 9623 // 9624 // FIXME: If the step is non-constant, we create the vector splat with 9625 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 9626 // handle a constant vector splat. 9627 Value *SplatVF = isa<Constant>(Mul) 9628 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 9629 : Builder.CreateVectorSplat(State.VF, Mul); 9630 Builder.restoreIP(CurrIP); 9631 9632 // We may need to add the step a number of times, depending on the unroll 9633 // factor. The last of those goes into the PHI. 9634 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 9635 &*State.CFG.PrevBB->getFirstInsertionPt()); 9636 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 9637 Instruction *LastInduction = VecInd; 9638 for (unsigned Part = 0; Part < State.UF; ++Part) { 9639 State.set(this, LastInduction, Part); 9640 9641 if (isa<TruncInst>(EntryVal)) 9642 State.ILV->addMetadata(LastInduction, EntryVal); 9643 9644 LastInduction = cast<Instruction>( 9645 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 9646 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 9647 } 9648 9649 // Move the last step to the end of the latch block. This ensures consistent 9650 // placement of all induction updates. 
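// The vector induction built above can be described per lane as
// Start + (Iteration + Part * VF + Lane) * Step; a standalone model
// (hypothetical names, integer step, fixed VF) of the value the recipe
// produces for each unrolled part:
#include <cstdint>
#include <vector>

namespace lv_widen_iv_sketch {
// Values of the widened induction for unrolled part 'Part' when the canonical
// IV (the scalar iteration index of lane 0, part 0) equals 'Iteration'.
std::vector<int64_t> widenedIVPart(int64_t Start, int64_t Step,
                                   uint64_t Iteration, unsigned VF,
                                   unsigned Part) {
  std::vector<int64_t> Lanes(VF);
  for (unsigned L = 0; L < VF; ++L)
    Lanes[L] = Start + int64_t(Iteration + uint64_t(Part) * VF + L) * Step;
  return Lanes;
}
// The loop-carried phi ("vec.ind") holds part 0; the update placed at the
// latch ("vec.ind.next") advances it by VF * UF * Step each vector iteration.
} // namespace lv_widen_iv_sketch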
9651 auto *LoopVectorLatch = 9652 State.LI->getLoopFor(State.CFG.PrevBB)->getLoopLatch(); 9653 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 9654 LastInduction->moveBefore(Br); 9655 LastInduction->setName("vec.ind.next"); 9656 9657 VecInd->addIncoming(SteppedStart, State.CFG.VectorPreHeader); 9658 VecInd->addIncoming(LastInduction, LoopVectorLatch); 9659 } 9660 9661 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 9662 assert(!State.Instance && "VPScalarIVStepsRecipe being replicated."); 9663 9664 // Fast-math-flags propagate from the original induction instruction. 9665 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9666 if (IndDesc.getInductionBinOp() && 9667 isa<FPMathOperator>(IndDesc.getInductionBinOp())) 9668 State.Builder.setFastMathFlags( 9669 IndDesc.getInductionBinOp()->getFastMathFlags()); 9670 9671 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9672 auto CreateScalarIV = [&](Value *&Step) -> Value * { 9673 Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0)); 9674 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); 9675 if (!isCanonical() || CanonicalIV->getType() != Ty) { 9676 ScalarIV = 9677 Ty->isIntegerTy() 9678 ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty) 9679 : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty); 9680 ScalarIV = emitTransformedIndex(State.Builder, ScalarIV, 9681 getStartValue()->getLiveInIRValue(), Step, 9682 IndDesc); 9683 ScalarIV->setName("offset.idx"); 9684 } 9685 if (TruncToTy) { 9686 assert(Step->getType()->isIntegerTy() && 9687 "Truncation requires an integer step"); 9688 ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy); 9689 Step = State.Builder.CreateTrunc(Step, TruncToTy); 9690 } 9691 return ScalarIV; 9692 }; 9693 9694 Value *ScalarIV = CreateScalarIV(Step); 9695 if (State.VF.isVector()) { 9696 buildScalarSteps(ScalarIV, Step, IndDesc, this, State); 9697 return; 9698 } 9699 9700 for (unsigned Part = 0; Part < State.UF; ++Part) { 9701 assert(!State.VF.isScalable() && "scalable vectors not yet supported."); 9702 Value *EntryPart; 9703 if (Step->getType()->isFloatingPointTy()) { 9704 Value *StartIdx = 9705 getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part); 9706 // Floating-point operations inherit FMF via the builder's flags. 9707 Value *MulOp = State.Builder.CreateFMul(StartIdx, Step); 9708 EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(), 9709 ScalarIV, MulOp); 9710 } else { 9711 Value *StartIdx = 9712 getRuntimeVF(State.Builder, Step->getType(), State.VF * Part); 9713 EntryPart = State.Builder.CreateAdd( 9714 ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction"); 9715 } 9716 State.set(this, EntryPart, Part); 9717 } 9718 } 9719 9720 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9721 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9722 State); 9723 } 9724 9725 void VPBlendRecipe::execute(VPTransformState &State) { 9726 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9727 // We know that all PHIs in non-header blocks are converted into 9728 // selects, so we don't have to worry about the insertion order and we 9729 // can just use the builder. 9730 // At this point we generate the predication tree. There may be 9731 // duplications since this is a simple recursive scan, but future 9732 // optimizations will clean it up. 
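// The loop below folds the incoming values of the blend into a chain of
// selects, newest mask first; a scalar-per-lane model (hypothetical
// standalone form) of the value it computes:
#include <cstddef>
#include <vector>

namespace lv_blend_sketch {
// Result = select(Mask[N-1], In[N-1], ... select(Mask[1], In[1], In[0]) ...)
// for a single lane; Mask[0] is never consulted and In[0] acts as the default.
int blendLane(const std::vector<int> &Incoming,
              const std::vector<bool> &Masks) {
  int Result = Incoming[0];
  for (size_t In = 1; In < Incoming.size(); ++In)
    Result = Masks[In] ? Incoming[In] : Result;
  return Result;
}
} // namespace lv_blend_sketch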
9733 9734 unsigned NumIncoming = getNumIncomingValues(); 9735 9736 // Generate a sequence of selects of the form: 9737 // SELECT(Mask3, In3, 9738 // SELECT(Mask2, In2, 9739 // SELECT(Mask1, In1, 9740 // In0))) 9741 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9742 // are essentially undef are taken from In0. 9743 InnerLoopVectorizer::VectorParts Entry(State.UF); 9744 for (unsigned In = 0; In < NumIncoming; ++In) { 9745 for (unsigned Part = 0; Part < State.UF; ++Part) { 9746 // We might have single edge PHIs (blocks) - use an identity 9747 // 'select' for the first PHI operand. 9748 Value *In0 = State.get(getIncomingValue(In), Part); 9749 if (In == 0) 9750 Entry[Part] = In0; // Initialize with the first incoming value. 9751 else { 9752 // Select between the current value and the previous incoming edge 9753 // based on the incoming mask. 9754 Value *Cond = State.get(getMask(In), Part); 9755 Entry[Part] = 9756 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9757 } 9758 } 9759 } 9760 for (unsigned Part = 0; Part < State.UF; ++Part) 9761 State.set(this, Entry[Part], Part); 9762 } 9763 9764 void VPInterleaveRecipe::execute(VPTransformState &State) { 9765 assert(!State.Instance && "Interleave group being replicated."); 9766 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9767 getStoredValues(), getMask()); 9768 } 9769 9770 void VPReductionRecipe::execute(VPTransformState &State) { 9771 assert(!State.Instance && "Reduction being replicated."); 9772 Value *PrevInChain = State.get(getChainOp(), 0); 9773 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9774 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9775 // Propagate the fast-math flags carried by the underlying instruction. 9776 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9777 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9778 for (unsigned Part = 0; Part < State.UF; ++Part) { 9779 Value *NewVecOp = State.get(getVecOp(), Part); 9780 if (VPValue *Cond = getCondOp()) { 9781 Value *NewCond = State.get(Cond, Part); 9782 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9783 Value *Iden = RdxDesc->getRecurrenceIdentity( 9784 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9785 Value *IdenVec = 9786 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9787 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9788 NewVecOp = Select; 9789 } 9790 Value *NewRed; 9791 Value *NextInChain; 9792 if (IsOrdered) { 9793 if (State.VF.isVector()) 9794 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9795 PrevInChain); 9796 else 9797 NewRed = State.Builder.CreateBinOp( 9798 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9799 NewVecOp); 9800 PrevInChain = NewRed; 9801 } else { 9802 PrevInChain = State.get(getChainOp(), Part); 9803 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9804 } 9805 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9806 NextInChain = 9807 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9808 NewRed, PrevInChain); 9809 } else if (IsOrdered) 9810 NextInChain = NewRed; 9811 else 9812 NextInChain = State.Builder.CreateBinOp( 9813 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9814 PrevInChain); 9815 State.set(this, NextInChain, Part); 9816 } 9817 } 9818 9819 void VPReplicateRecipe::execute(VPTransformState &State) { 9820 if (State.Instance) { // Generate a single instance. 
9821 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9822 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9823 IsPredicated, State);
9824 // Insert the scalar instance, packing it into a vector.
9825 if (AlsoPack && State.VF.isVector()) {
9826 // If we're constructing lane 0, initialize to start from poison.
9827 if (State.Instance->Lane.isFirstLane()) {
9828 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9829 Value *Poison = PoisonValue::get(
9830 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9831 State.set(this, Poison, State.Instance->Part);
9832 }
9833 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9834 }
9835 return;
9836 }
9837
9838 // Generate scalar instances for all VF lanes of all UF parts, unless the
9839 // instruction is uniform, in which case generate only the first lane for each
9840 // of the UF parts.
9841 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9842 assert((!State.VF.isScalable() || IsUniform) &&
9843 "Can't scalarize a scalable vector");
9844 for (unsigned Part = 0; Part < State.UF; ++Part)
9845 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9846 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9847 VPIteration(Part, Lane), IsPredicated,
9848 State);
9849 }
9850
9851 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9852 assert(State.Instance && "Branch on Mask works only on single instance.");
9853
9854 unsigned Part = State.Instance->Part;
9855 unsigned Lane = State.Instance->Lane.getKnownLane();
9856
9857 Value *ConditionBit = nullptr;
9858 VPValue *BlockInMask = getMask();
9859 if (BlockInMask) {
9860 ConditionBit = State.get(BlockInMask, Part);
9861 if (ConditionBit->getType()->isVectorTy())
9862 ConditionBit = State.Builder.CreateExtractElement(
9863 ConditionBit, State.Builder.getInt32(Lane));
9864 } else // Block in mask is all-one.
9865 ConditionBit = State.Builder.getTrue();
9866
9867 // Replace the temporary unreachable terminator with a new conditional branch,
9868 // whose two destinations will be set later when they are created.
9869 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9870 assert(isa<UnreachableInst>(CurrentTerminator) &&
9871 "Expected to replace unreachable terminator with conditional branch.");
9872 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9873 CondBr->setSuccessor(0, nullptr);
9874 ReplaceInstWithInst(CurrentTerminator, CondBr);
9875 }
9876
9877 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9878 assert(State.Instance && "Predicated instruction PHI works per instance.");
9879 Instruction *ScalarPredInst =
9880 cast<Instruction>(State.get(getOperand(0), *State.Instance));
9881 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9882 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9883 assert(PredicatingBB && "Predicated block has no single predecessor.");
9884 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9885 "operand must be VPReplicateRecipe");
9886
9887 // By current pack/unpack logic we need to generate only a single phi node: if
9888 // a vector value for the predicated instruction exists at this point it means
9889 // the instruction has vector users only, and a phi for the vector value is
9890 // needed. In this case the recipe of the predicated instruction is marked to
9891 // also do that packing, thereby "hoisting" the insert-element sequence.
9892 // Otherwise, a phi node for the scalar value is needed. 9893 unsigned Part = State.Instance->Part; 9894 if (State.hasVectorValue(getOperand(0), Part)) { 9895 Value *VectorValue = State.get(getOperand(0), Part); 9896 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9897 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9898 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9899 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9900 if (State.hasVectorValue(this, Part)) 9901 State.reset(this, VPhi, Part); 9902 else 9903 State.set(this, VPhi, Part); 9904 // NOTE: Currently we need to update the value of the operand, so the next 9905 // predicated iteration inserts its generated value in the correct vector. 9906 State.reset(getOperand(0), VPhi, Part); 9907 } else { 9908 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9909 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9910 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9911 PredicatingBB); 9912 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9913 if (State.hasScalarValue(this, *State.Instance)) 9914 State.reset(this, Phi, *State.Instance); 9915 else 9916 State.set(this, Phi, *State.Instance); 9917 // NOTE: Currently we need to update the value of the operand, so the next 9918 // predicated iteration inserts its generated value in the correct vector. 9919 State.reset(getOperand(0), Phi, *State.Instance); 9920 } 9921 } 9922 9923 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9924 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9925 9926 // Attempt to issue a wide load. 9927 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9928 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9929 9930 assert((LI || SI) && "Invalid Load/Store instruction"); 9931 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9932 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9933 9934 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9935 9936 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9937 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9938 bool CreateGatherScatter = !Consecutive; 9939 9940 auto &Builder = State.Builder; 9941 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9942 bool isMaskRequired = getMask(); 9943 if (isMaskRequired) 9944 for (unsigned Part = 0; Part < State.UF; ++Part) 9945 BlockInMaskParts[Part] = State.get(getMask(), Part); 9946 9947 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9948 // Calculate the pointer for the specific unroll-part. 9949 GetElementPtrInst *PartPtr = nullptr; 9950 9951 bool InBounds = false; 9952 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9953 InBounds = gep->isInBounds(); 9954 if (Reverse) { 9955 // If the address is consecutive but reversed, then the 9956 // wide store needs to start at the last vector element. 
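// For example (illustrative, assuming a fixed-width VF of 4, so
// RunTimeVF = 4):
//   Part 0 accesses Ptr[-3 .. 0]   (NumElt = 0,  LastLane = -3)
//   Part 1 accesses Ptr[-7 .. -4]  (NumElt = -4, LastLane = -3)
// and the loaded or stored vector itself is reversed separately. In general: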
9957 // RunTimeVF = VScale * VF.getKnownMinValue() 9958 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9959 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 9960 // NumElt = -Part * RunTimeVF 9961 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 9962 // LastLane = 1 - RunTimeVF 9963 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 9964 PartPtr = 9965 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 9966 PartPtr->setIsInBounds(InBounds); 9967 PartPtr = cast<GetElementPtrInst>( 9968 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 9969 PartPtr->setIsInBounds(InBounds); 9970 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 9971 BlockInMaskParts[Part] = 9972 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9973 } else { 9974 Value *Increment = 9975 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 9976 PartPtr = cast<GetElementPtrInst>( 9977 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 9978 PartPtr->setIsInBounds(InBounds); 9979 } 9980 9981 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 9982 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 9983 }; 9984 9985 // Handle Stores: 9986 if (SI) { 9987 State.ILV->setDebugLocFromInst(SI); 9988 9989 for (unsigned Part = 0; Part < State.UF; ++Part) { 9990 Instruction *NewSI = nullptr; 9991 Value *StoredVal = State.get(StoredValue, Part); 9992 if (CreateGatherScatter) { 9993 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9994 Value *VectorGep = State.get(getAddr(), Part); 9995 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9996 MaskPart); 9997 } else { 9998 if (Reverse) { 9999 // If we store to reverse consecutive memory locations, then we need 10000 // to reverse the order of elements in the stored value. 10001 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 10002 // We don't want to update the value in the map as it might be used in 10003 // another expression. So don't call resetVectorValue(StoredVal). 10004 } 10005 auto *VecPtr = 10006 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10007 if (isMaskRequired) 10008 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 10009 BlockInMaskParts[Part]); 10010 else 10011 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 10012 } 10013 State.ILV->addMetadata(NewSI, SI); 10014 } 10015 return; 10016 } 10017 10018 // Handle loads. 10019 assert(LI && "Must have a load instruction"); 10020 State.ILV->setDebugLocFromInst(LI); 10021 for (unsigned Part = 0; Part < State.UF; ++Part) { 10022 Value *NewLI; 10023 if (CreateGatherScatter) { 10024 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 10025 Value *VectorGep = State.get(getAddr(), Part); 10026 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 10027 nullptr, "wide.masked.gather"); 10028 State.ILV->addMetadata(NewLI, LI); 10029 } else { 10030 auto *VecPtr = 10031 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10032 if (isMaskRequired) 10033 NewLI = Builder.CreateMaskedLoad( 10034 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 10035 PoisonValue::get(DataTy), "wide.masked.load"); 10036 else 10037 NewLI = 10038 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 10039 10040 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
10041 State.ILV->addMetadata(NewLI, LI); 10042 if (Reverse) 10043 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 10044 } 10045 10046 State.set(this, NewLI, Part); 10047 } 10048 } 10049 10050 // Determine how to lower the scalar epilogue, which depends on 1) optimising 10051 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 10052 // predication, and 4) a TTI hook that analyses whether the loop is suitable 10053 // for predication. 10054 static ScalarEpilogueLowering getScalarEpilogueLowering( 10055 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 10056 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10057 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 10058 LoopVectorizationLegality &LVL) { 10059 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10060 // don't look at hints or options, and don't request a scalar epilogue. 10061 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10062 // LoopAccessInfo (due to code dependency and not being able to reliably get 10063 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10064 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10065 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10066 // back to the old way and vectorize with versioning when forced. See D81345.) 10067 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10068 PGSOQueryType::IRPass) && 10069 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10070 return CM_ScalarEpilogueNotAllowedOptSize; 10071 10072 // 2) If set, obey the directives 10073 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10074 switch (PreferPredicateOverEpilogue) { 10075 case PreferPredicateTy::ScalarEpilogue: 10076 return CM_ScalarEpilogueAllowed; 10077 case PreferPredicateTy::PredicateElseScalarEpilogue: 10078 return CM_ScalarEpilogueNotNeededUsePredicate; 10079 case PreferPredicateTy::PredicateOrDontVectorize: 10080 return CM_ScalarEpilogueNotAllowedUsePredicate; 10081 }; 10082 } 10083 10084 // 3) If set, obey the hints 10085 switch (Hints.getPredicate()) { 10086 case LoopVectorizeHints::FK_Enabled: 10087 return CM_ScalarEpilogueNotNeededUsePredicate; 10088 case LoopVectorizeHints::FK_Disabled: 10089 return CM_ScalarEpilogueAllowed; 10090 }; 10091 10092 // 4) if the TTI hook indicates this is profitable, request predication. 10093 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10094 LVL.getLAI())) 10095 return CM_ScalarEpilogueNotNeededUsePredicate; 10096 10097 return CM_ScalarEpilogueAllowed; 10098 } 10099 10100 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10101 // If Values have been set for this Def return the one relevant for \p Part. 10102 if (hasVectorValue(Def, Part)) 10103 return Data.PerPartOutput[Def][Part]; 10104 10105 if (!hasScalarValue(Def, {Part, 0})) { 10106 Value *IRV = Def->getLiveInIRValue(); 10107 Value *B = ILV->getBroadcastInstrs(IRV); 10108 set(Def, B, Part); 10109 return B; 10110 } 10111 10112 Value *ScalarValue = get(Def, {Part, 0}); 10113 // If we aren't vectorizing, we can just copy the scalar map values over 10114 // to the vector map. 10115 if (VF.isScalar()) { 10116 set(Def, ScalarValue, Part); 10117 return ScalarValue; 10118 } 10119 10120 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10121 bool IsUniform = RepR && RepR->isUniform(); 10122 10123 unsigned LastLane = IsUniform ? 
0 : VF.getKnownMinValue() - 1; 10124 // Check if there is a scalar value for the selected lane. 10125 if (!hasScalarValue(Def, {Part, LastLane})) { 10126 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 10127 assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) || 10128 isa<VPScalarIVStepsRecipe>(Def->getDef())) && 10129 "unexpected recipe found to be invariant"); 10130 IsUniform = true; 10131 LastLane = 0; 10132 } 10133 10134 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10135 // Set the insert point after the last scalarized instruction or after the 10136 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10137 // will directly follow the scalar definitions. 10138 auto OldIP = Builder.saveIP(); 10139 auto NewIP = 10140 isa<PHINode>(LastInst) 10141 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10142 : std::next(BasicBlock::iterator(LastInst)); 10143 Builder.SetInsertPoint(&*NewIP); 10144 10145 // However, if we are vectorizing, we need to construct the vector values. 10146 // If the value is known to be uniform after vectorization, we can just 10147 // broadcast the scalar value corresponding to lane zero for each unroll 10148 // iteration. Otherwise, we construct the vector values using 10149 // insertelement instructions. Since the resulting vectors are stored in 10150 // State, we will only generate the insertelements once. 10151 Value *VectorValue = nullptr; 10152 if (IsUniform) { 10153 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10154 set(Def, VectorValue, Part); 10155 } else { 10156 // Initialize packing with insertelements to start from undef. 10157 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10158 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10159 set(Def, Undef, Part); 10160 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10161 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10162 VectorValue = get(Def, Part); 10163 } 10164 Builder.restoreIP(OldIP); 10165 return VectorValue; 10166 } 10167 10168 // Process the loop in the VPlan-native vectorization path. This path builds 10169 // VPlan upfront in the vectorization pipeline, which allows to apply 10170 // VPlan-to-VPlan transformations from the very beginning without modifying the 10171 // input LLVM IR. 10172 static bool processLoopInVPlanNativePath( 10173 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10174 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10175 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10176 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10177 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10178 LoopVectorizationRequirements &Requirements) { 10179 10180 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10181 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10182 return false; 10183 } 10184 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10185 Function *F = L->getHeader()->getParent(); 10186 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10187 10188 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10189 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10190 10191 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10192 &Hints, IAI); 10193 // Use the planner for outer loop vectorization. 10194 // TODO: CM is not used at this point inside the planner. 
Turn CM into an 10195 // optional argument if we don't need it in the future. 10196 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10197 Requirements, ORE); 10198 10199 // Get user vectorization factor. 10200 ElementCount UserVF = Hints.getWidth(); 10201 10202 CM.collectElementTypesForWidening(); 10203 10204 // Plan how to best vectorize, return the best VF and its cost. 10205 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10206 10207 // If we are stress testing VPlan builds, do not attempt to generate vector 10208 // code. Masked vector code generation support will follow soon. 10209 // Also, do not attempt to vectorize if no vector code will be produced. 10210 if (VPlanBuildStressTest || EnableVPlanPredication || 10211 VectorizationFactor::Disabled() == VF) 10212 return false; 10213 10214 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10215 10216 { 10217 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10218 F->getParent()->getDataLayout()); 10219 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10220 &CM, BFI, PSI, Checks); 10221 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10222 << L->getHeader()->getParent()->getName() << "\"\n"); 10223 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); 10224 } 10225 10226 // Mark the loop as already vectorized to avoid vectorizing again. 10227 Hints.setAlreadyVectorized(); 10228 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10229 return true; 10230 } 10231 10232 // Emit a remark if there are stores to floats that required a floating point 10233 // extension. If the vectorized loop was generated with floating point there 10234 // will be a performance penalty from the conversion overhead and the change in 10235 // the vector width. 10236 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10237 SmallVector<Instruction *, 4> Worklist; 10238 for (BasicBlock *BB : L->getBlocks()) { 10239 for (Instruction &Inst : *BB) { 10240 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10241 if (S->getValueOperand()->getType()->isFloatTy()) 10242 Worklist.push_back(S); 10243 } 10244 } 10245 } 10246 10247 // Traverse the floating point stores upwards searching, for floating point 10248 // conversions. 10249 SmallPtrSet<const Instruction *, 4> Visited; 10250 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10251 while (!Worklist.empty()) { 10252 auto *I = Worklist.pop_back_val(); 10253 if (!L->contains(I)) 10254 continue; 10255 if (!Visited.insert(I).second) 10256 continue; 10257 10258 // Emit a remark if the floating point store required a floating 10259 // point conversion. 10260 // TODO: More work could be done to identify the root cause such as a 10261 // constant or a function return type and point the user to it. 10262 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10263 ORE->emit([&]() { 10264 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10265 I->getDebugLoc(), L->getHeader()) 10266 << "floating point conversion changes vector width. 
" 10267 << "Mixed floating point precision requires an up/down " 10268 << "cast that will negatively impact performance."; 10269 }); 10270 10271 for (Use &Op : I->operands()) 10272 if (auto *OpI = dyn_cast<Instruction>(Op)) 10273 Worklist.push_back(OpI); 10274 } 10275 } 10276 10277 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10278 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10279 !EnableLoopInterleaving), 10280 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10281 !EnableLoopVectorization) {} 10282 10283 bool LoopVectorizePass::processLoop(Loop *L) { 10284 assert((EnableVPlanNativePath || L->isInnermost()) && 10285 "VPlan-native path is not enabled. Only process inner loops."); 10286 10287 #ifndef NDEBUG 10288 const std::string DebugLocStr = getDebugLocString(L); 10289 #endif /* NDEBUG */ 10290 10291 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 10292 << L->getHeader()->getParent()->getName() << "' from " 10293 << DebugLocStr << "\n"); 10294 10295 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10296 10297 LLVM_DEBUG( 10298 dbgs() << "LV: Loop hints:" 10299 << " force=" 10300 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10301 ? "disabled" 10302 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10303 ? "enabled" 10304 : "?")) 10305 << " width=" << Hints.getWidth() 10306 << " interleave=" << Hints.getInterleave() << "\n"); 10307 10308 // Function containing loop 10309 Function *F = L->getHeader()->getParent(); 10310 10311 // Looking at the diagnostic output is the only way to determine if a loop 10312 // was vectorized (other than looking at the IR or machine code), so it 10313 // is important to generate an optimization remark for each loop. Most of 10314 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10315 // generated as OptimizationRemark and OptimizationRemarkMissed are 10316 // less verbose reporting vectorized loops and unvectorized loops that may 10317 // benefit from vectorization, respectively. 10318 10319 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10320 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10321 return false; 10322 } 10323 10324 PredicatedScalarEvolution PSE(*SE, *L); 10325 10326 // Check if it is legal to vectorize the loop. 10327 LoopVectorizationRequirements Requirements; 10328 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10329 &Requirements, &Hints, DB, AC, BFI, PSI); 10330 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10331 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10332 Hints.emitRemarkWithHints(); 10333 return false; 10334 } 10335 10336 // Check the function attributes and profiles to find out if this function 10337 // should be optimized for size. 10338 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10339 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10340 10341 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10342 // here. They may require CFG and instruction level transformations before 10343 // even evaluating whether vectorization is profitable. Since we cannot modify 10344 // the incoming IR, we need to build VPlan upfront in the vectorization 10345 // pipeline. 
10346 if (!L->isInnermost()) 10347 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10348 ORE, BFI, PSI, Hints, Requirements); 10349 10350 assert(L->isInnermost() && "Inner loop expected."); 10351 10352 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10353 // count by optimizing for size, to minimize overheads. 10354 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10355 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10356 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10357 << "This loop is worth vectorizing only if no scalar " 10358 << "iteration overheads are incurred."); 10359 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10360 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10361 else { 10362 LLVM_DEBUG(dbgs() << "\n"); 10363 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10364 } 10365 } 10366 10367 // Check the function attributes to see if implicit floats are allowed. 10368 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10369 // an integer loop and the vector instructions selected are purely integer 10370 // vector instructions? 10371 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10372 reportVectorizationFailure( 10373 "Can't vectorize when the NoImplicitFloat attribute is used", 10374 "loop not vectorized due to NoImplicitFloat attribute", 10375 "NoImplicitFloat", ORE, L); 10376 Hints.emitRemarkWithHints(); 10377 return false; 10378 } 10379 10380 // Check if the target supports potentially unsafe FP vectorization. 10381 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10382 // for the target we're vectorizing for, to make sure none of the 10383 // additional fp-math flags can help. 10384 if (Hints.isPotentiallyUnsafe() && 10385 TTI->isFPVectorizationPotentiallyUnsafe()) { 10386 reportVectorizationFailure( 10387 "Potentially unsafe FP op prevents vectorization", 10388 "loop not vectorized due to unsafe FP support.", 10389 "UnsafeFP", ORE, L); 10390 Hints.emitRemarkWithHints(); 10391 return false; 10392 } 10393 10394 bool AllowOrderedReductions; 10395 // If the flag is set, use that instead and override the TTI behaviour. 10396 if (ForceOrderedReductions.getNumOccurrences() > 0) 10397 AllowOrderedReductions = ForceOrderedReductions; 10398 else 10399 AllowOrderedReductions = TTI->enableOrderedReductions(); 10400 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10401 ORE->emit([&]() { 10402 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10403 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10404 ExactFPMathInst->getDebugLoc(), 10405 ExactFPMathInst->getParent()) 10406 << "loop not vectorized: cannot prove it is safe to reorder " 10407 "floating-point operations"; 10408 }); 10409 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10410 "reorder floating-point operations\n"); 10411 Hints.emitRemarkWithHints(); 10412 return false; 10413 } 10414 10415 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10416 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10417 10418 // If an override option has been passed in for interleaved accesses, use it. 10419 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10420 UseInterleaved = EnableInterleavedMemAccesses; 10421 10422 // Analyze interleaved memory accesses. 
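// For example (illustrative): the two strided accesses
//   ... = A[2 * i];      // member 0
//   ... = A[2 * i + 1];  // member 1
// form an interleave group with factor 2, which can later be vectorized as a
// single wide load plus shuffles rather than two gathers.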
10423 if (UseInterleaved) {
10424 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10425 }
10426
10427 // Use the cost model.
10428 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10429 F, &Hints, IAI);
10430 CM.collectValuesToIgnore();
10431 CM.collectElementTypesForWidening();
10432
10433 // Use the planner for vectorization.
10434 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10435 Requirements, ORE);
10436
10437 // Get user vectorization factor and interleave count.
10438 ElementCount UserVF = Hints.getWidth();
10439 unsigned UserIC = Hints.getInterleave();
10440
10441 // Plan how to best vectorize, return the best VF and its cost.
10442 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10443
10444 VectorizationFactor VF = VectorizationFactor::Disabled();
10445 unsigned IC = 1;
10446
10447 if (MaybeVF) {
10448 VF = *MaybeVF;
10449 // Select the interleave count.
10450 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10451 }
10452
10453 // Identify the diagnostic messages that should be produced.
10454 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10455 bool VectorizeLoop = true, InterleaveLoop = true;
10456 if (VF.Width.isScalar()) {
10457 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10458 VecDiagMsg = std::make_pair(
10459 "VectorizationNotBeneficial",
10460 "the cost-model indicates that vectorization is not beneficial");
10461 VectorizeLoop = false;
10462 }
10463
10464 if (!MaybeVF && UserIC > 1) {
10465 // Tell the user interleaving was avoided up-front, despite being explicitly
10466 // requested.
10467 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10468 "interleaving should be avoided up front\n");
10469 IntDiagMsg = std::make_pair(
10470 "InterleavingAvoided",
10471 "Ignoring UserIC, because interleaving was avoided up front");
10472 InterleaveLoop = false;
10473 } else if (IC == 1 && UserIC <= 1) {
10474 // Tell the user interleaving is not beneficial.
10475 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10476 IntDiagMsg = std::make_pair(
10477 "InterleavingNotBeneficial",
10478 "the cost-model indicates that interleaving is not beneficial");
10479 InterleaveLoop = false;
10480 if (UserIC == 1) {
10481 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10482 IntDiagMsg.second +=
10483 " and is explicitly disabled or interleave count is set to 1";
10484 }
10485 } else if (IC > 1 && UserIC == 1) {
10486 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10487 LLVM_DEBUG(
10488 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10489 IntDiagMsg = std::make_pair(
10490 "InterleavingBeneficialButDisabled",
10491 "the cost-model indicates that interleaving is beneficial "
10492 "but is explicitly disabled or interleave count is set to 1");
10493 InterleaveLoop = false;
10494 }
10495
10496 // Override IC if user provided an interleave count.
10497 IC = UserIC > 0 ? UserIC : IC;
10498
10499 // Emit diagnostic messages, if any.
10500 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10501 if (!VectorizeLoop && !InterleaveLoop) {
10502 // Do not vectorize or interleave the loop.
10503 ORE->emit([&]() { 10504 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10505 L->getStartLoc(), L->getHeader()) 10506 << VecDiagMsg.second; 10507 }); 10508 ORE->emit([&]() { 10509 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10510 L->getStartLoc(), L->getHeader()) 10511 << IntDiagMsg.second; 10512 }); 10513 return false; 10514 } else if (!VectorizeLoop && InterleaveLoop) { 10515 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10516 ORE->emit([&]() { 10517 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10518 L->getStartLoc(), L->getHeader()) 10519 << VecDiagMsg.second; 10520 }); 10521 } else if (VectorizeLoop && !InterleaveLoop) { 10522 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10523 << ") in " << DebugLocStr << '\n'); 10524 ORE->emit([&]() { 10525 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10526 L->getStartLoc(), L->getHeader()) 10527 << IntDiagMsg.second; 10528 }); 10529 } else if (VectorizeLoop && InterleaveLoop) { 10530 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10531 << ") in " << DebugLocStr << '\n'); 10532 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10533 } 10534 10535 bool DisableRuntimeUnroll = false; 10536 MDNode *OrigLoopID = L->getLoopID(); 10537 { 10538 // Optimistically generate runtime checks. Drop them if they turn out to not 10539 // be profitable. Limit the scope of Checks, so the cleanup happens 10540 // immediately after vector codegeneration is done. 10541 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10542 F->getParent()->getDataLayout()); 10543 if (!VF.Width.isScalar() || IC > 1) 10544 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate()); 10545 10546 using namespace ore; 10547 if (!VectorizeLoop) { 10548 assert(IC > 1 && "interleave count should not be 1 or 0"); 10549 // If we decided that it is not legal to vectorize the loop, then 10550 // interleave it. 10551 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10552 &CM, BFI, PSI, Checks); 10553 10554 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10555 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); 10556 10557 ORE->emit([&]() { 10558 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10559 L->getHeader()) 10560 << "interleaved loop (interleaved count: " 10561 << NV("InterleaveCount", IC) << ")"; 10562 }); 10563 } else { 10564 // If we decided that it is *legal* to vectorize the loop, then do it. 10565 10566 // Consider vectorizing the epilogue too if it's profitable. 10567 VectorizationFactor EpilogueVF = 10568 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10569 if (EpilogueVF.Width.isVector()) { 10570 10571 // The first pass vectorizes the main loop and creates a scalar epilogue 10572 // to be vectorized by executing the plan (potentially with a different 10573 // factor) again shortly afterwards. 10574 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10575 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10576 EPI, &LVL, &CM, BFI, PSI, Checks); 10577 10578 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10579 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10580 DT); 10581 ++LoopsVectorized; 10582 10583 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10584 formLCSSARecursively(*L, *DT, LI, SE); 10585 10586 // Second pass vectorizes the epilogue and adjusts the control flow 10587 // edges from the first pass. 
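// Illustrative example (the numbers are only a sketch): if the main loop was
// vectorized with VF.Width = 8 and IC = 2, it consumes 16 iterations per
// vector iteration; with EpilogueVF.Width = 4 this second pass vectorizes the
// remaining iterations with a VF of 4 and a UF of 1, and whatever is still
// left over runs in the scalar remainder loop.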
10588 EPI.MainLoopVF = EPI.EpilogueVF; 10589 EPI.MainLoopUF = EPI.EpilogueUF; 10590 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10591 ORE, EPI, &LVL, &CM, BFI, PSI, 10592 Checks); 10593 10594 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10595 10596 // Ensure that the start values for any VPReductionPHIRecipes are 10597 // updated before vectorising the epilogue loop. 10598 VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock(); 10599 for (VPRecipeBase &R : Header->phis()) { 10600 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10601 if (auto *Resume = MainILV.getReductionResumeValue( 10602 ReductionPhi->getRecurrenceDescriptor())) { 10603 VPValue *StartVal = new VPValue(Resume); 10604 BestEpiPlan.addExternalDef(StartVal); 10605 ReductionPhi->setOperand(0, StartVal); 10606 } 10607 } 10608 } 10609 10610 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10611 DT); 10612 ++LoopsEpilogueVectorized; 10613 10614 if (!MainILV.areSafetyChecksAdded()) 10615 DisableRuntimeUnroll = true; 10616 } else { 10617 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10618 &LVL, &CM, BFI, PSI, Checks); 10619 10620 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10621 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10622 ++LoopsVectorized; 10623 10624 // Add metadata to disable runtime unrolling a scalar loop when there 10625 // are no runtime checks about strides and memory. A scalar loop that is 10626 // rarely used is not worth unrolling. 10627 if (!LB.areSafetyChecksAdded()) 10628 DisableRuntimeUnroll = true; 10629 } 10630 // Report the vectorization decision. 10631 ORE->emit([&]() { 10632 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10633 L->getHeader()) 10634 << "vectorized loop (vectorization width: " 10635 << NV("VectorizationFactor", VF.Width) 10636 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10637 }); 10638 } 10639 10640 if (ORE->allowExtraAnalysis(LV_NAME)) 10641 checkMixedPrecision(L, ORE); 10642 } 10643 10644 Optional<MDNode *> RemainderLoopID = 10645 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10646 LLVMLoopVectorizeFollowupEpilogue}); 10647 if (RemainderLoopID.hasValue()) { 10648 L->setLoopID(RemainderLoopID.getValue()); 10649 } else { 10650 if (DisableRuntimeUnroll) 10651 AddRuntimeUnrollDisableMetaData(L); 10652 10653 // Mark the loop as already vectorized to avoid vectorizing again. 10654 Hints.setAlreadyVectorized(); 10655 } 10656 10657 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10658 return true; 10659 } 10660 10661 LoopVectorizeResult LoopVectorizePass::runImpl( 10662 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10663 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10664 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10665 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10666 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10667 SE = &SE_; 10668 LI = &LI_; 10669 TTI = &TTI_; 10670 DT = &DT_; 10671 BFI = &BFI_; 10672 TLI = TLI_; 10673 AA = &AA_; 10674 AC = &AC_; 10675 GetLAA = &GetLAA_; 10676 DB = &DB_; 10677 ORE = &ORE_; 10678 PSI = PSI_; 10679 10680 // Don't attempt if 10681 // 1. the target claims to have no vector registers, and 10682 // 2. interleaving won't help ILP. 
10683 //
10684 // The second condition is necessary because, even if the target has no
10685 // vector registers, loop vectorization may still enable scalar
10686 // interleaving.
10687 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10688 TTI->getMaxInterleaveFactor(1) < 2)
10689 return LoopVectorizeResult(false, false);
10690
10691 bool Changed = false, CFGChanged = false;
10692
10693 // The vectorizer requires loops to be in simplified form.
10694 // Since simplification may add new inner loops, it has to run before the
10695 // legality and profitability checks. This means running the loop vectorizer
10696 // will simplify all loops, regardless of whether anything ends up being
10697 // vectorized.
10698 for (auto &L : *LI)
10699 Changed |= CFGChanged |=
10700 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10701
10702 // Build up a worklist of inner loops to vectorize. This is necessary as
10703 // the act of vectorizing or partially unrolling a loop creates new loops
10704 // and can invalidate iterators across the loops.
10705 SmallVector<Loop *, 8> Worklist;
10706
10707 for (Loop *L : *LI)
10708 collectSupportedLoops(*L, LI, ORE, Worklist);
10709
10710 LoopsAnalyzed += Worklist.size();
10711
10712 // Now walk the identified inner loops.
10713 while (!Worklist.empty()) {
10714 Loop *L = Worklist.pop_back_val();
10715
10716 // For the inner loops we actually process, form LCSSA to simplify the
10717 // transform.
10718 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10719
10720 Changed |= CFGChanged |= processLoop(L);
10721 }
10722
10723 // Process each loop nest in the function.
10724 return LoopVectorizeResult(Changed, CFGChanged);
10725 }
10726
10727 PreservedAnalyses LoopVectorizePass::run(Function &F,
10728 FunctionAnalysisManager &AM) {
10729 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10730 auto &LI = AM.getResult<LoopAnalysis>(F);
10731 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10732 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10733 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10734 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10735 auto &AA = AM.getResult<AAManager>(F);
10736 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10737 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10738 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10739
10740 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10741 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10742 [&](Loop &L) -> const LoopAccessInfo & {
10743 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
10744 TLI, TTI, nullptr, nullptr, nullptr};
10745 return LAM.getResult<LoopAccessAnalysis>(L, AR);
10746 };
10747 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10748 ProfileSummaryInfo *PSI =
10749 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10750 LoopVectorizeResult Result =
10751 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10752 if (!Result.MadeAnyChange)
10753 return PreservedAnalyses::all();
10754 PreservedAnalyses PA;
10755
10756 // We currently do not preserve loopinfo/dominator analyses with outer loop
10757 // vectorization. Until this is addressed, mark these analyses as preserved
10758 // only for non-VPlan-native path.
10759 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10760 if (!EnableVPlanNativePath) {
10761 PA.preserve<LoopAnalysis>();
10762 PA.preserve<DominatorTreeAnalysis>();
10763 }
10764
10765 if (Result.MadeCFGChange) {
10766 // Making CFG changes likely means a loop got vectorized. Indicate that
10767 // extra simplification passes should be run.
10768 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10769 // be run if runtime checks have been added.
10770 AM.getResult<ShouldRunExtraVectorPasses>(F);
10771 PA.preserve<ShouldRunExtraVectorPasses>();
10772 } else {
10773 PA.preserveSet<CFGAnalyses>();
10774 }
10775 return PA;
10776 }
10777
10778 void LoopVectorizePass::printPipeline(
10779 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10780 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10781 OS, MapClassName2PassName);
10782
10783 OS << "<";
10784 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10785 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10786 OS << ">";
10787 }
10788
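// For reference (illustrative, assuming default pass options): the base
// printPipeline call above prints the mapped pass name, and the code that
// follows appends the parameter string
//   <no-interleave-forced-only;no-vectorize-forced-only;>
// when the new pass manager dumps the pipeline.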