//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
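//
// As a rough illustration of the widening transformation described above (a
// simplified sketch; the IR actually generated differs), a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     A[i] = B[i] + C[i];
//
// is rewritten so that the induction variable advances by the chosen vector
// width VF (here 4) and each iteration performs one wide SIMD operation:
//
//   for (int i = 0; i + 4 <= n; i += 4)
//     A[i..i+3] = B[i..i+3] + C[i..i+3]; // one <4 x i32> load/add/store
//   // remaining iterations run in a scalar epilogue loop
//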
53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanHCFGBuilder.h" 61 #include "VPlanTransforms.h" 62 #include "llvm/ADT/APInt.h" 63 #include "llvm/ADT/ArrayRef.h" 64 #include "llvm/ADT/DenseMap.h" 65 #include "llvm/ADT/DenseMapInfo.h" 66 #include "llvm/ADT/Hashing.h" 67 #include "llvm/ADT/MapVector.h" 68 #include "llvm/ADT/None.h" 69 #include "llvm/ADT/Optional.h" 70 #include "llvm/ADT/STLExtras.h" 71 #include "llvm/ADT/SmallPtrSet.h" 72 #include "llvm/ADT/SmallSet.h" 73 #include "llvm/ADT/SmallVector.h" 74 #include "llvm/ADT/Statistic.h" 75 #include "llvm/ADT/StringRef.h" 76 #include "llvm/ADT/Twine.h" 77 #include "llvm/ADT/iterator_range.h" 78 #include "llvm/Analysis/AssumptionCache.h" 79 #include "llvm/Analysis/BasicAliasAnalysis.h" 80 #include "llvm/Analysis/BlockFrequencyInfo.h" 81 #include "llvm/Analysis/CFG.h" 82 #include "llvm/Analysis/CodeMetrics.h" 83 #include "llvm/Analysis/DemandedBits.h" 84 #include "llvm/Analysis/GlobalsModRef.h" 85 #include "llvm/Analysis/LoopAccessAnalysis.h" 86 #include "llvm/Analysis/LoopAnalysisManager.h" 87 #include "llvm/Analysis/LoopInfo.h" 88 #include "llvm/Analysis/LoopIterator.h" 89 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 90 #include "llvm/Analysis/ProfileSummaryInfo.h" 91 #include "llvm/Analysis/ScalarEvolution.h" 92 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 93 #include "llvm/Analysis/TargetLibraryInfo.h" 94 #include "llvm/Analysis/TargetTransformInfo.h" 95 #include "llvm/Analysis/VectorUtils.h" 96 #include "llvm/IR/Attributes.h" 97 #include "llvm/IR/BasicBlock.h" 98 #include "llvm/IR/CFG.h" 99 #include "llvm/IR/Constant.h" 100 #include "llvm/IR/Constants.h" 101 #include "llvm/IR/DataLayout.h" 102 #include "llvm/IR/DebugInfoMetadata.h" 103 #include "llvm/IR/DebugLoc.h" 104 #include "llvm/IR/DerivedTypes.h" 105 #include "llvm/IR/DiagnosticInfo.h" 106 #include "llvm/IR/Dominators.h" 107 #include "llvm/IR/Function.h" 108 #include "llvm/IR/IRBuilder.h" 109 #include "llvm/IR/InstrTypes.h" 110 #include "llvm/IR/Instruction.h" 111 #include "llvm/IR/Instructions.h" 112 #include "llvm/IR/IntrinsicInst.h" 113 #include "llvm/IR/Intrinsics.h" 114 #include "llvm/IR/Metadata.h" 115 #include "llvm/IR/Module.h" 116 #include "llvm/IR/Operator.h" 117 #include "llvm/IR/PatternMatch.h" 118 #include "llvm/IR/Type.h" 119 #include "llvm/IR/Use.h" 120 #include "llvm/IR/User.h" 121 #include "llvm/IR/Value.h" 122 #include "llvm/IR/ValueHandle.h" 123 #include "llvm/IR/Verifier.h" 124 #include "llvm/InitializePasses.h" 125 #include "llvm/Pass.h" 126 #include "llvm/Support/Casting.h" 127 #include "llvm/Support/CommandLine.h" 128 #include "llvm/Support/Compiler.h" 129 #include "llvm/Support/Debug.h" 130 #include "llvm/Support/ErrorHandling.h" 131 #include "llvm/Support/InstructionCost.h" 132 #include "llvm/Support/MathExtras.h" 133 #include "llvm/Support/raw_ostream.h" 134 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 135 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 136 #include "llvm/Transforms/Utils/LoopSimplify.h" 137 #include "llvm/Transforms/Utils/LoopUtils.h" 138 #include "llvm/Transforms/Utils/LoopVersioning.h" 139 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 140 #include "llvm/Transforms/Utils/SizeOpts.h" 141 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 142 #include <algorithm> 143 #include 
<cassert> 144 #include <cstdint> 145 #include <functional> 146 #include <iterator> 147 #include <limits> 148 #include <map> 149 #include <memory> 150 #include <string> 151 #include <tuple> 152 #include <utility> 153 154 using namespace llvm; 155 156 #define LV_NAME "loop-vectorize" 157 #define DEBUG_TYPE LV_NAME 158 159 #ifndef NDEBUG 160 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 161 #endif 162 163 /// @{ 164 /// Metadata attribute names 165 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 166 const char LLVMLoopVectorizeFollowupVectorized[] = 167 "llvm.loop.vectorize.followup_vectorized"; 168 const char LLVMLoopVectorizeFollowupEpilogue[] = 169 "llvm.loop.vectorize.followup_epilogue"; 170 /// @} 171 172 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 173 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 174 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 175 176 static cl::opt<bool> EnableEpilogueVectorization( 177 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 178 cl::desc("Enable vectorization of epilogue loops.")); 179 180 static cl::opt<unsigned> EpilogueVectorizationForceVF( 181 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 182 cl::desc("When epilogue vectorization is enabled, and a value greater than " 183 "1 is specified, forces the given VF for all applicable epilogue " 184 "loops.")); 185 186 static cl::opt<unsigned> EpilogueVectorizationMinVF( 187 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 188 cl::desc("Only loops with vectorization factor equal to or larger than " 189 "the specified value are considered for epilogue vectorization.")); 190 191 /// Loops with a known constant trip count below this number are vectorized only 192 /// if no scalar iteration overheads are incurred. 193 static cl::opt<unsigned> TinyTripCountVectorThreshold( 194 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 195 cl::desc("Loops with a constant trip count that is smaller than this " 196 "value are vectorized only if no scalar iteration overheads " 197 "are incurred.")); 198 199 static cl::opt<unsigned> VectorizeMemoryCheckThreshold( 200 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 201 cl::desc("The maximum allowed number of runtime memory checks")); 202 203 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 204 // that predication is preferred, and this lists all options. I.e., the 205 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 206 // and predicate the instructions accordingly. 
// If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(
        clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue",
                   "Don't tail-predicate loops, create scalar epilogue"),
        clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                   "predicate-else-scalar-epilogue",
                   "prefer tail-folding, create scalar epilogue if tail "
                   "folding fails."),
        clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                   "predicate-dont-vectorize",
                   "prefer tail-folding, don't attempt vectorization if "
                   "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting the vectorization factor, "
             "which will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a "
             "loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
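// A typical invocation for such a stress test might look as follows
// (illustrative only):
//
//   opt -passes=loop-vectorize -enable-vplan-native-path \
//       -vplan-build-stress-test -disable-output input.ll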
350 static cl::opt<bool> VPlanBuildStressTest( 351 "vplan-build-stress-test", cl::init(false), cl::Hidden, 352 cl::desc( 353 "Build VPlan for every supported loop nest in the function and bail " 354 "out right after the build (stress test the VPlan H-CFG construction " 355 "in the VPlan-native vectorization path).")); 356 357 cl::opt<bool> llvm::EnableLoopInterleaving( 358 "interleave-loops", cl::init(true), cl::Hidden, 359 cl::desc("Enable loop interleaving in Loop vectorization passes")); 360 cl::opt<bool> llvm::EnableLoopVectorization( 361 "vectorize-loops", cl::init(true), cl::Hidden, 362 cl::desc("Run the Loop vectorization passes")); 363 364 cl::opt<bool> PrintVPlansInDotFormat( 365 "vplan-print-in-dot-format", cl::init(false), cl::Hidden, 366 cl::desc("Use dot format instead of plain text when dumping VPlans")); 367 368 /// A helper function that returns true if the given type is irregular. The 369 /// type is irregular if its allocated size doesn't equal the store size of an 370 /// element of the corresponding vector type. 371 static bool hasIrregularType(Type *Ty, const DataLayout &DL) { 372 // Determine if an array of N elements of type Ty is "bitcast compatible" 373 // with a <N x Ty> vector. 374 // This is only true if there is no padding between the array elements. 375 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); 376 } 377 378 /// A helper function that returns the reciprocal of the block probability of 379 /// predicated blocks. If we return X, we are assuming the predicated block 380 /// will execute once for every X iterations of the loop header. 381 /// 382 /// TODO: We should use actual block probability here, if available. Currently, 383 /// we always assume predicated blocks have a 50% chance of executing. 384 static unsigned getReciprocalPredBlockProb() { return 2; } 385 386 /// A helper function that returns an integer or floating-point constant with 387 /// value C. 388 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { 389 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) 390 : ConstantFP::get(Ty, C); 391 } 392 393 /// Returns "best known" trip count for the specified loop \p L as defined by 394 /// the following procedure: 395 /// 1) Returns exact trip count if it is known. 396 /// 2) Returns expected trip count according to profile data if any. 397 /// 3) Returns upper bound estimate if it is known. 398 /// 4) Returns None if all of the above failed. 399 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { 400 // Check if exact trip count is known. 401 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) 402 return ExpectedTC; 403 404 // Check if there is an expected trip count available from profile data. 405 if (LoopVectorizeWithBlockFrequency) 406 if (auto EstimatedTC = getLoopEstimatedTripCount(L)) 407 return EstimatedTC; 408 409 // Check if upper bound estimate is known. 410 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) 411 return ExpectedTC; 412 413 return None; 414 } 415 416 // Forward declare GeneratedRTChecks. 417 class GeneratedRTChecks; 418 419 namespace llvm { 420 421 AnalysisKey ShouldRunExtraVectorPasses::Key; 422 423 /// InnerLoopVectorizer vectorizes loops which contain only one basic 424 /// block to a specified vectorization factor (VF). 425 /// This class performs the widening of scalars into vectors, or multiple 426 /// scalars. 
/// This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      ElementCount MinProfitableTripCount,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);

    if (MinProfitableTripCount.isZero())
      this->MinProfitableTripCount = VecWidth;
    else
      this->MinProfitableTripCount = MinProfitableTripCount;
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops.
  virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop.
491 /// Generates a sequence of scalar instances for each lane between \p MinLane 492 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 493 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p 494 /// Instr's operands. 495 void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe, 496 const VPIteration &Instance, bool IfPredicateInstr, 497 VPTransformState &State); 498 499 /// Construct the vector value of a scalarized value \p V one lane at a time. 500 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, 501 VPTransformState &State); 502 503 /// Try to vectorize interleaved access group \p Group with the base address 504 /// given in \p Addr, optionally masking the vector operations if \p 505 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 506 /// values in the vectorized loop. 507 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 508 ArrayRef<VPValue *> VPDefs, 509 VPTransformState &State, VPValue *Addr, 510 ArrayRef<VPValue *> StoredValues, 511 VPValue *BlockInMask = nullptr); 512 513 /// Fix the non-induction PHIs in \p Plan. 514 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State); 515 516 /// Returns true if the reordering of FP operations is not allowed, but we are 517 /// able to vectorize with strict in-order reductions for the given RdxDesc. 518 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); 519 520 /// Create a broadcast instruction. This method generates a broadcast 521 /// instruction (shuffle) for loop invariant values and for the induction 522 /// value. If this is the induction variable then we extend it to N, N+1, ... 523 /// this is needed because each iteration in the loop corresponds to a SIMD 524 /// element. 525 virtual Value *getBroadcastInstrs(Value *V); 526 527 // Returns the resume value (bc.merge.rdx) for a reduction as 528 // generated by fixReduction. 529 PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc); 530 531 protected: 532 friend class LoopVectorizationPlanner; 533 534 /// A small list of PHINodes. 535 using PhiVector = SmallVector<PHINode *, 4>; 536 537 /// A type for scalarized values in the new loop. Each value from the 538 /// original loop, when scalarized, is represented by UF x VF scalar values 539 /// in the new unrolled loop, where UF is the unroll factor and VF is the 540 /// vectorization factor. 541 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 542 543 /// Set up the values of the IVs correctly when exiting the vector loop. 544 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 545 Value *VectorTripCount, Value *EndValue, 546 BasicBlock *MiddleBlock, BasicBlock *VectorHeader, 547 VPlan &Plan); 548 549 /// Handle all cross-iteration phis in the header. 550 void fixCrossIterationPHIs(VPTransformState &State); 551 552 /// Create the exit value of first order recurrences in the middle block and 553 /// update their users. 554 void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, 555 VPTransformState &State); 556 557 /// Create code for the loop exit value of the reduction. 558 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); 559 560 /// Clear NSW/NUW flags from reduction instructions if necessary. 
561 void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, 562 VPTransformState &State); 563 564 /// Iteratively sink the scalarized operands of a predicated instruction into 565 /// the block that was created for it. 566 void sinkScalarOperands(Instruction *PredInst); 567 568 /// Shrinks vector element sizes to the smallest bitwidth they can be legally 569 /// represented as. 570 void truncateToMinimalBitwidths(VPTransformState &State); 571 572 /// Returns (and creates if needed) the original loop trip count. 573 Value *getOrCreateTripCount(BasicBlock *InsertBlock); 574 575 /// Returns (and creates if needed) the trip count of the widened loop. 576 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); 577 578 /// Returns a bitcasted value to the requested vector type. 579 /// Also handles bitcasts of vector<float> <-> vector<pointer> types. 580 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, 581 const DataLayout &DL); 582 583 /// Emit a bypass check to see if the vector trip count is zero, including if 584 /// it overflows. 585 void emitIterationCountCheck(BasicBlock *Bypass); 586 587 /// Emit a bypass check to see if all of the SCEV assumptions we've 588 /// had to make are correct. Returns the block containing the checks or 589 /// nullptr if no checks have been added. 590 BasicBlock *emitSCEVChecks(BasicBlock *Bypass); 591 592 /// Emit bypass checks to check any memory assumptions we may have made. 593 /// Returns the block containing the checks or nullptr if no checks have been 594 /// added. 595 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass); 596 597 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 598 /// vector loop preheader, middle block and scalar preheader. 599 void createVectorLoopSkeleton(StringRef Prefix); 600 601 /// Create new phi nodes for the induction variables to resume iteration count 602 /// in the scalar epilogue, from where the vectorized loop left off. 603 /// In cases where the loop skeleton is more complicated (eg. epilogue 604 /// vectorization) and the resume values can come from an additional bypass 605 /// block, the \p AdditionalBypass pair provides information about the bypass 606 /// block and the end value on the edge from bypass to this loop. 607 void createInductionResumeValues( 608 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); 609 610 /// Complete the loop skeleton by adding debug MDs, creating appropriate 611 /// conditional branches in the middle block, preparing the builder and 612 /// running the verifier. Return the preheader of the completed vector loop. 613 BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID); 614 615 /// Collect poison-generating recipes that may generate a poison value that is 616 /// used after vectorization, even when their operands are not poison. Those 617 /// recipes meet the following conditions: 618 /// * Contribute to the address computation of a recipe generating a widen 619 /// memory load/store (VPWidenMemoryInstructionRecipe or 620 /// VPInterleaveRecipe). 621 /// * Such a widen memory load/store has at least one underlying Instruction 622 /// that is in a basic block that needs predication and after vectorization 623 /// the generated instruction won't be predicated. 624 void collectPoisonGeneratingRecipes(VPTransformState &State); 625 626 /// Allow subclasses to override and print debug traces before/after vplan 627 /// execution, when trace information is requested. 
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  ElementCount MinProfitableTripCount;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle block between the vector and the scalar loops.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  /// BFI and PSI are used to check for profile-guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
730 SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4> 731 ReductionResumeValues; 732 }; 733 734 class InnerLoopUnroller : public InnerLoopVectorizer { 735 public: 736 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 737 LoopInfo *LI, DominatorTree *DT, 738 const TargetLibraryInfo *TLI, 739 const TargetTransformInfo *TTI, AssumptionCache *AC, 740 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, 741 LoopVectorizationLegality *LVL, 742 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 743 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) 744 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 745 ElementCount::getFixed(1), 746 ElementCount::getFixed(1), UnrollFactor, LVL, CM, 747 BFI, PSI, Check) {} 748 749 private: 750 Value *getBroadcastInstrs(Value *V) override; 751 }; 752 753 /// Encapsulate information regarding vectorization of a loop and its epilogue. 754 /// This information is meant to be updated and used across two stages of 755 /// epilogue vectorization. 756 struct EpilogueLoopVectorizationInfo { 757 ElementCount MainLoopVF = ElementCount::getFixed(0); 758 unsigned MainLoopUF = 0; 759 ElementCount EpilogueVF = ElementCount::getFixed(0); 760 unsigned EpilogueUF = 0; 761 BasicBlock *MainLoopIterationCountCheck = nullptr; 762 BasicBlock *EpilogueIterationCountCheck = nullptr; 763 BasicBlock *SCEVSafetyCheck = nullptr; 764 BasicBlock *MemSafetyCheck = nullptr; 765 Value *TripCount = nullptr; 766 Value *VectorTripCount = nullptr; 767 768 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, 769 ElementCount EVF, unsigned EUF) 770 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) { 771 assert(EUF == 1 && 772 "A high UF for the epilogue loop is likely not beneficial."); 773 } 774 }; 775 776 /// An extension of the inner loop vectorizer that creates a skeleton for a 777 /// vectorized loop that has its epilogue (residual) also vectorized. 778 /// The idea is to run the vplan on a given loop twice, firstly to setup the 779 /// skeleton and vectorize the main loop, and secondly to complete the skeleton 780 /// from the first step and vectorize the epilogue. This is achieved by 781 /// deriving two concrete strategy classes from this base class and invoking 782 /// them in succession from the loop vectorizer planner. 783 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { 784 public: 785 InnerLoopAndEpilogueVectorizer( 786 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 787 DominatorTree *DT, const TargetLibraryInfo *TLI, 788 const TargetTransformInfo *TTI, AssumptionCache *AC, 789 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 790 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 791 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 792 GeneratedRTChecks &Checks) 793 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 794 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL, 795 CM, BFI, PSI, Checks), 796 EPI(EPI) {} 797 798 // Override this function to handle the more complex control flow around the 799 // three loops. 800 std::pair<BasicBlock *, Value *> 801 createVectorizedLoopSkeleton() final override { 802 return createEpilogueVectorizedLoopSkeleton(); 803 } 804 805 /// The interface for creating a vectorized skeleton using one of two 806 /// different strategies, each corresponding to one execution of the vplan 807 /// as described above. 
808 virtual std::pair<BasicBlock *, Value *> 809 createEpilogueVectorizedLoopSkeleton() = 0; 810 811 /// Holds and updates state information required to vectorize the main loop 812 /// and its epilogue in two separate passes. This setup helps us avoid 813 /// regenerating and recomputing runtime safety checks. It also helps us to 814 /// shorten the iteration-count-check path length for the cases where the 815 /// iteration count of the loop is so small that the main vector loop is 816 /// completely skipped. 817 EpilogueLoopVectorizationInfo &EPI; 818 }; 819 820 /// A specialized derived class of inner loop vectorizer that performs 821 /// vectorization of *main* loops in the process of vectorizing loops and their 822 /// epilogues. 823 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { 824 public: 825 EpilogueVectorizerMainLoop( 826 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 827 DominatorTree *DT, const TargetLibraryInfo *TLI, 828 const TargetTransformInfo *TTI, AssumptionCache *AC, 829 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 830 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 831 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 832 GeneratedRTChecks &Check) 833 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 834 EPI, LVL, CM, BFI, PSI, Check) {} 835 /// Implements the interface for creating a vectorized skeleton using the 836 /// *main loop* strategy (ie the first pass of vplan execution). 837 std::pair<BasicBlock *, Value *> 838 createEpilogueVectorizedLoopSkeleton() final override; 839 840 protected: 841 /// Emits an iteration count bypass check once for the main loop (when \p 842 /// ForEpilogue is false) and once for the epilogue loop (when \p 843 /// ForEpilogue is true). 844 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue); 845 void printDebugTracesAtStart() override; 846 void printDebugTracesAtEnd() override; 847 }; 848 849 // A specialized derived class of inner loop vectorizer that performs 850 // vectorization of *epilogue* loops in the process of vectorizing loops and 851 // their epilogues. 852 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { 853 public: 854 EpilogueVectorizerEpilogueLoop( 855 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 856 DominatorTree *DT, const TargetLibraryInfo *TLI, 857 const TargetTransformInfo *TTI, AssumptionCache *AC, 858 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 859 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 860 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 861 GeneratedRTChecks &Checks) 862 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 863 EPI, LVL, CM, BFI, PSI, Checks) { 864 TripCount = EPI.TripCount; 865 } 866 /// Implements the interface for creating a vectorized skeleton using the 867 /// *epilogue loop* strategy (ie the second pass of vplan execution). 868 std::pair<BasicBlock *, Value *> 869 createEpilogueVectorizedLoopSkeleton() final override; 870 871 protected: 872 /// Emits an iteration count bypass check after the main vector loop has 873 /// finished to see if there are any iterations left to execute by either 874 /// the vector epilogue or the scalar epilogue. 
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ?
B.CreateVScale(EC) : EC; 954 } 955 956 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, 957 ElementCount VF) { 958 assert(FTy->isFloatingPointTy() && "Expected floating point type!"); 959 Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits()); 960 Value *RuntimeVF = getRuntimeVF(B, IntTy, VF); 961 return B.CreateUIToFP(RuntimeVF, FTy); 962 } 963 964 void reportVectorizationFailure(const StringRef DebugMsg, 965 const StringRef OREMsg, const StringRef ORETag, 966 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 967 Instruction *I) { 968 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); 969 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 970 ORE->emit( 971 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 972 << "loop not vectorized: " << OREMsg); 973 } 974 975 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, 976 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 977 Instruction *I) { 978 LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); 979 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 980 ORE->emit( 981 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 982 << Msg); 983 } 984 985 } // end namespace llvm 986 987 #ifndef NDEBUG 988 /// \return string containing a file name and a line # for the given loop. 989 static std::string getDebugLocString(const Loop *L) { 990 std::string Result; 991 if (L) { 992 raw_string_ostream OS(Result); 993 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 994 LoopDbgLoc.print(OS); 995 else 996 // Just print the module name. 997 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 998 OS.flush(); 999 } 1000 return Result; 1001 } 1002 #endif 1003 1004 void InnerLoopVectorizer::collectPoisonGeneratingRecipes( 1005 VPTransformState &State) { 1006 1007 // Collect recipes in the backward slice of `Root` that may generate a poison 1008 // value that is used after vectorization. 1009 SmallPtrSet<VPRecipeBase *, 16> Visited; 1010 auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { 1011 SmallVector<VPRecipeBase *, 16> Worklist; 1012 Worklist.push_back(Root); 1013 1014 // Traverse the backward slice of Root through its use-def chain. 1015 while (!Worklist.empty()) { 1016 VPRecipeBase *CurRec = Worklist.back(); 1017 Worklist.pop_back(); 1018 1019 if (!Visited.insert(CurRec).second) 1020 continue; 1021 1022 // Prune search if we find another recipe generating a widen memory 1023 // instruction. Widen memory instructions involved in address computation 1024 // will lead to gather/scatter instructions, which don't need to be 1025 // handled. 1026 if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || 1027 isa<VPInterleaveRecipe>(CurRec) || 1028 isa<VPScalarIVStepsRecipe>(CurRec) || 1029 isa<VPCanonicalIVPHIRecipe>(CurRec) || 1030 isa<VPActiveLaneMaskPHIRecipe>(CurRec)) 1031 continue; 1032 1033 // This recipe contributes to the address computation of a widen 1034 // load/store. Collect recipe if its underlying instruction has 1035 // poison-generating flags. 1036 Instruction *Instr = CurRec->getUnderlyingInstr(); 1037 if (Instr && Instr->hasPoisonGeneratingFlags()) 1038 State.MayGeneratePoisonRecipes.insert(CurRec); 1039 1040 // Add new definitions to the worklist. 
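      // Operands that are not defined by a recipe (e.g. live-in values) have
      // no defining VPDef and therefore terminate the backward slice here.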
1041 for (VPValue *operand : CurRec->operands()) 1042 if (VPDef *OpDef = operand->getDef()) 1043 Worklist.push_back(cast<VPRecipeBase>(OpDef)); 1044 } 1045 }); 1046 1047 // Traverse all the recipes in the VPlan and collect the poison-generating 1048 // recipes in the backward slice starting at the address of a VPWidenRecipe or 1049 // VPInterleaveRecipe. 1050 auto Iter = depth_first( 1051 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry())); 1052 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 1053 for (VPRecipeBase &Recipe : *VPBB) { 1054 if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) { 1055 Instruction &UnderlyingInstr = WidenRec->getIngredient(); 1056 VPDef *AddrDef = WidenRec->getAddr()->getDef(); 1057 if (AddrDef && WidenRec->isConsecutive() && 1058 Legal->blockNeedsPredication(UnderlyingInstr.getParent())) 1059 collectPoisonGeneratingInstrsInBackwardSlice( 1060 cast<VPRecipeBase>(AddrDef)); 1061 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) { 1062 VPDef *AddrDef = InterleaveRec->getAddr()->getDef(); 1063 if (AddrDef) { 1064 // Check if any member of the interleave group needs predication. 1065 const InterleaveGroup<Instruction> *InterGroup = 1066 InterleaveRec->getInterleaveGroup(); 1067 bool NeedPredication = false; 1068 for (int I = 0, NumMembers = InterGroup->getNumMembers(); 1069 I < NumMembers; ++I) { 1070 Instruction *Member = InterGroup->getMember(I); 1071 if (Member) 1072 NeedPredication |= 1073 Legal->blockNeedsPredication(Member->getParent()); 1074 } 1075 1076 if (NeedPredication) 1077 collectPoisonGeneratingInstrsInBackwardSlice( 1078 cast<VPRecipeBase>(AddrDef)); 1079 } 1080 } 1081 } 1082 } 1083 } 1084 1085 PHINode *InnerLoopVectorizer::getReductionResumeValue( 1086 const RecurrenceDescriptor &RdxDesc) { 1087 auto It = ReductionResumeValues.find(&RdxDesc); 1088 assert(It != ReductionResumeValues.end() && 1089 "Expected to find a resume value for the reduction."); 1090 return It->second; 1091 } 1092 1093 namespace llvm { 1094 1095 // Loop vectorization cost-model hints how the scalar epilogue loop should be 1096 // lowered. 1097 enum ScalarEpilogueLowering { 1098 1099 // The default: allowing scalar epilogues. 1100 CM_ScalarEpilogueAllowed, 1101 1102 // Vectorization with OptForSize: don't allow epilogues. 1103 CM_ScalarEpilogueNotAllowedOptSize, 1104 1105 // A special case of vectorisation with OptForSize: loops with a very small 1106 // trip count are considered for vectorization under OptForSize, thereby 1107 // making sure the cost of their loop body is dominant, free of runtime 1108 // guards and scalar iteration overheads. 1109 CM_ScalarEpilogueNotAllowedLowTripLoop, 1110 1111 // Loop hint predicate indicating an epilogue is undesired. 1112 CM_ScalarEpilogueNotNeededUsePredicate, 1113 1114 // Directive indicating we must either tail fold or not vectorize 1115 CM_ScalarEpilogueNotAllowedUsePredicate 1116 }; 1117 1118 /// ElementCountComparator creates a total ordering for ElementCount 1119 /// for the purposes of using it in a set structure. 
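/// For example, all fixed counts order before all scalable ones:
/// 2 < 4 < 8 < vscale x 2 < vscale x 4 (illustrative).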
1120 struct ElementCountComparator { 1121 bool operator()(const ElementCount &LHS, const ElementCount &RHS) const { 1122 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < 1123 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); 1124 } 1125 }; 1126 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>; 1127 1128 /// LoopVectorizationCostModel - estimates the expected speedups due to 1129 /// vectorization. 1130 /// In many cases vectorization is not profitable. This can happen because of 1131 /// a number of reasons. In this class we mainly attempt to predict the 1132 /// expected speedup/slowdowns due to the supported instruction set. We use the 1133 /// TargetTransformInfo to query the different backends for the cost of 1134 /// different operations. 1135 class LoopVectorizationCostModel { 1136 public: 1137 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 1138 PredicatedScalarEvolution &PSE, LoopInfo *LI, 1139 LoopVectorizationLegality *Legal, 1140 const TargetTransformInfo &TTI, 1141 const TargetLibraryInfo *TLI, DemandedBits *DB, 1142 AssumptionCache *AC, 1143 OptimizationRemarkEmitter *ORE, const Function *F, 1144 const LoopVectorizeHints *Hints, 1145 InterleavedAccessInfo &IAI) 1146 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 1147 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 1148 Hints(Hints), InterleaveInfo(IAI) {} 1149 1150 /// \return An upper bound for the vectorization factors (both fixed and 1151 /// scalable). If the factors are 0, vectorization and interleaving should be 1152 /// avoided up front. 1153 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC); 1154 1155 /// \return True if runtime checks are required for vectorization, and false 1156 /// otherwise. 1157 bool runtimeChecksRequired(); 1158 1159 /// \return The most profitable vectorization factor and the cost of that VF. 1160 /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO 1161 /// then this vectorization factor will be selected if vectorization is 1162 /// possible. 1163 VectorizationFactor 1164 selectVectorizationFactor(const ElementCountSet &CandidateVFs); 1165 1166 VectorizationFactor 1167 selectEpilogueVectorizationFactor(const ElementCount MaxVF, 1168 const LoopVectorizationPlanner &LVP); 1169 1170 /// Setup cost-based decisions for user vectorization factor. 1171 /// \return true if the UserVF is a feasible VF to be chosen. 1172 bool selectUserVectorizationFactor(ElementCount UserVF) { 1173 collectUniformsAndScalars(UserVF); 1174 collectInstsToScalarize(UserVF); 1175 return expectedCost(UserVF).first.isValid(); 1176 } 1177 1178 /// \return The size (in bits) of the smallest and widest types in the code 1179 /// that needs to be vectorized. We ignore values that remain scalar such as 1180 /// 64 bit loop indices. 1181 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1182 1183 /// \return The desired interleave count. 1184 /// If interleave count has been specified by metadata it will be returned. 1185 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1186 /// are the selected vectorization factor and the cost of the selected VF. 1187 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); 1188 1189 /// Memory access instruction may be vectorized in more than one way. 1190 /// Form of instruction after vectorization depends on cost. 
1191 /// This function takes cost-based decisions for Load/Store instructions 1192 /// and collects them in a map. This decisions map is used for building 1193 /// the lists of loop-uniform and loop-scalar instructions. 1194 /// The calculated cost is saved with widening decision in order to 1195 /// avoid redundant calculations. 1196 void setCostBasedWideningDecision(ElementCount VF); 1197 1198 /// A struct that represents some properties of the register usage 1199 /// of a loop. 1200 struct RegisterUsage { 1201 /// Holds the number of loop invariant values that are used in the loop. 1202 /// The key is ClassID of target-provided register class. 1203 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1204 /// Holds the maximum number of concurrent live intervals in the loop. 1205 /// The key is ClassID of target-provided register class. 1206 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1207 }; 1208 1209 /// \return Returns information about the register usages of the loop for the 1210 /// given vectorization factors. 1211 SmallVector<RegisterUsage, 8> 1212 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1213 1214 /// Collect values we want to ignore in the cost model. 1215 void collectValuesToIgnore(); 1216 1217 /// Collect all element types in the loop for which widening is needed. 1218 void collectElementTypesForWidening(); 1219 1220 /// Split reductions into those that happen in the loop, and those that happen 1221 /// outside. In loop reductions are collected into InLoopReductionChains. 1222 void collectInLoopReductions(); 1223 1224 /// Returns true if we should use strict in-order reductions for the given 1225 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1226 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1227 /// of FP operations. 1228 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const { 1229 return !Hints->allowReordering() && RdxDesc.isOrdered(); 1230 } 1231 1232 /// \returns The smallest bitwidth each instruction can be represented with. 1233 /// The vector equivalents of these instructions should be truncated to this 1234 /// type. 1235 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1236 return MinBWs; 1237 } 1238 1239 /// \returns True if it is more profitable to scalarize instruction \p I for 1240 /// vectorization factor \p VF. 1241 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1242 assert(VF.isVector() && 1243 "Profitable to scalarize relevant only for VF > 1."); 1244 1245 // Cost model is not run in the VPlan-native path - return conservative 1246 // result until this changes. 1247 if (EnableVPlanNativePath) 1248 return false; 1249 1250 auto Scalars = InstsToScalarize.find(VF); 1251 assert(Scalars != InstsToScalarize.end() && 1252 "VF not yet analyzed for scalarization profitability"); 1253 return Scalars->second.find(I) != Scalars->second.end(); 1254 } 1255 1256 /// Returns true if \p I is known to be uniform after vectorization. 1257 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1258 if (VF.isScalar()) 1259 return true; 1260 1261 // Cost model is not run in the VPlan-native path - return conservative 1262 // result until this changes. 
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
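  /// A widening decision must already have been recorded for \p I at \p VF
  /// (see setWideningDecision); otherwise an assertion fires.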
1351 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1352 assert(VF.isVector() && "Expected VF >=2"); 1353 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1354 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1355 "The cost is not calculated"); 1356 return WideningDecisions[InstOnVF].second; 1357 } 1358 1359 /// Return True if instruction \p I is an optimizable truncate whose operand 1360 /// is an induction variable. Such a truncate will be removed by adding a new 1361 /// induction variable with the destination type. 1362 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1363 // If the instruction is not a truncate, return false. 1364 auto *Trunc = dyn_cast<TruncInst>(I); 1365 if (!Trunc) 1366 return false; 1367 1368 // Get the source and destination types of the truncate. 1369 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1370 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1371 1372 // If the truncate is free for the given types, return false. Replacing a 1373 // free truncate with an induction variable would add an induction variable 1374 // update instruction to each iteration of the loop. We exclude from this 1375 // check the primary induction variable since it will need an update 1376 // instruction regardless. 1377 Value *Op = Trunc->getOperand(0); 1378 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1379 return false; 1380 1381 // If the truncated value is not an induction variable, return false. 1382 return Legal->isInductionPhi(Op); 1383 } 1384 1385 /// Collects the instructions to scalarize for each predicated instruction in 1386 /// the loop. 1387 void collectInstsToScalarize(ElementCount VF); 1388 1389 /// Collect Uniform and Scalar values for the given \p VF. 1390 /// The sets depend on CM decision for Load/Store instructions 1391 /// that may be vectorized as interleave, gather-scatter or scalarized. 1392 void collectUniformsAndScalars(ElementCount VF) { 1393 // Do the analysis once. 1394 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1395 return; 1396 setCostBasedWideningDecision(VF); 1397 collectLoopUniforms(VF); 1398 collectLoopScalars(VF); 1399 } 1400 1401 /// Returns true if the target machine supports masked store operation 1402 /// for the given \p DataType and kind of access to \p Ptr. 1403 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1404 return Legal->isConsecutivePtr(DataType, Ptr) && 1405 TTI.isLegalMaskedStore(DataType, Alignment); 1406 } 1407 1408 /// Returns true if the target machine supports masked load operation 1409 /// for the given \p DataType and kind of access to \p Ptr. 1410 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1411 return Legal->isConsecutivePtr(DataType, Ptr) && 1412 TTI.isLegalMaskedLoad(DataType, Alignment); 1413 } 1414 1415 /// Returns true if the target machine can represent \p V as a masked gather 1416 /// or scatter operation. 
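  /// For example (illustrative): an indexed access such as
  ///   for (i = 0; i < n; ++i) sum += A[B[i]];
  /// has no consecutive pointer, so widening it requires a gather; this hook
  /// asks TTI whether such a masked gather (or scatter, for stores) is legal
  /// for the widened type.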
1417 bool isLegalGatherOrScatter(Value *V, 1418 ElementCount VF = ElementCount::getFixed(1)) { 1419 bool LI = isa<LoadInst>(V); 1420 bool SI = isa<StoreInst>(V); 1421 if (!LI && !SI) 1422 return false; 1423 auto *Ty = getLoadStoreType(V); 1424 Align Align = getLoadStoreAlignment(V); 1425 if (VF.isVector()) 1426 Ty = VectorType::get(Ty, VF); 1427 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1428 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1429 } 1430 1431 /// Returns true if the target machine supports all of the reduction 1432 /// variables found for the given VF. 1433 bool canVectorizeReductions(ElementCount VF) const { 1434 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1435 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1436 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1437 })); 1438 } 1439 1440 /// Returns true if \p I is an instruction that will be scalarized with 1441 /// predication when vectorizing \p I with vectorization factor \p VF. Such 1442 /// instructions include conditional stores and instructions that may divide 1443 /// by zero. 1444 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1445 1446 // Returns true if \p I is an instruction that will be predicated either 1447 // through scalar predication or masked load/store or masked gather/scatter. 1448 // \p VF is the vectorization factor that will be used to vectorize \p I. 1449 // Superset of instructions that return true for isScalarWithPredication. 1450 bool isPredicatedInst(Instruction *I, ElementCount VF, 1451 bool IsKnownUniform = false) { 1452 // When we know the load is uniform and the original scalar loop was not 1453 // predicated we don't need to mark it as a predicated instruction. Any 1454 // vectorised blocks created when tail-folding are something artificial we 1455 // have introduced and we know there is always at least one active lane. 1456 // That's why we call Legal->blockNeedsPredication here because it doesn't 1457 // query tail-folding. 1458 if (IsKnownUniform && isa<LoadInst>(I) && 1459 !Legal->blockNeedsPredication(I->getParent())) 1460 return false; 1461 if (!blockNeedsPredicationForAnyReason(I->getParent())) 1462 return false; 1463 // Loads and stores that need some form of masked operation are predicated 1464 // instructions. 1465 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1466 return Legal->isMaskRequired(I); 1467 return isScalarWithPredication(I, VF); 1468 } 1469 1470 /// Returns true if \p I is a memory instruction with consecutive memory 1471 /// access that can be widened. 1472 bool 1473 memoryInstructionCanBeWidened(Instruction *I, 1474 ElementCount VF = ElementCount::getFixed(1)); 1475 1476 /// Returns true if \p I is a memory instruction in an interleaved-group 1477 /// of memory accesses that can be vectorized with wide vector loads/stores 1478 /// and shuffles. 1479 bool 1480 interleavedAccessCanBeWidened(Instruction *I, 1481 ElementCount VF = ElementCount::getFixed(1)); 1482 1483 /// Check if \p Instr belongs to any interleaved access group. 1484 bool isAccessInterleaved(Instruction *Instr) { 1485 return InterleaveInfo.isInterleaved(Instr); 1486 } 1487 1488 /// Get the interleaved access group that \p Instr belongs to. 1489 const InterleaveGroup<Instruction> * 1490 getInterleavedAccessGroup(Instruction *Instr) { 1491 return InterleaveInfo.getInterleaveGroup(Instr); 1492 } 1493 1494 /// Returns true if we're required to use a scalar epilogue for at least 1495 /// the final iteration of the original loop. 
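  /// For example, an interleave group with a gap must not read past the
  /// original trip count with its wide accesses, so the final iteration(s)
  /// are peeled into a scalar epilogue; the same applies when the loop may
  /// exit from a block other than the latch.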
  bool requiresScalarEpilogue(ElementCount VF) const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the loop tail.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  /// Returns true if we're tail-folding and want to use the active lane mask
  /// for vector loop control flow.
  bool useActiveLaneMaskForControlFlow() const {
    return FoldTailByMasking &&
           TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
  }

  /// Returns true if the instructions in this block require predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
  bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either no vector version is available or the vector
  /// version is too expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

  /// Convenience function that returns the value of vscale_range iff
  /// vscale_range.min == vscale_range.max or otherwise returns the value
  /// returned by the corresponding TLI method.
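  /// For example (illustrative): a function carrying vscale_range(2,2) yields
  /// 2 here, while a wider range such as vscale_range(1,16) falls back to the
  /// target's tuning hook.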
1573 Optional<unsigned> getVScaleForTuning() const; 1574 1575 private: 1576 unsigned NumPredStores = 0; 1577 1578 /// \return An upper bound for the vectorization factors for both 1579 /// fixed and scalable vectorization, where the minimum-known number of 1580 /// elements is a power-of-2 larger than zero. If scalable vectorization is 1581 /// disabled or unsupported, then the scalable part will be equal to 1582 /// ElementCount::getScalable(0). 1583 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, 1584 ElementCount UserVF, 1585 bool FoldTailByMasking); 1586 1587 /// \return the maximized element count based on the targets vector 1588 /// registers and the loop trip-count, but limited to a maximum safe VF. 1589 /// This is a helper function of computeFeasibleMaxVF. 1590 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, 1591 unsigned SmallestType, 1592 unsigned WidestType, 1593 ElementCount MaxSafeVF, 1594 bool FoldTailByMasking); 1595 1596 /// \return the maximum legal scalable VF, based on the safe max number 1597 /// of elements. 1598 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1599 1600 /// The vectorization cost is a combination of the cost itself and a boolean 1601 /// indicating whether any of the contributing operations will actually 1602 /// operate on vector values after type legalization in the backend. If this 1603 /// latter value is false, then all operations will be scalarized (i.e. no 1604 /// vectorization has actually taken place). 1605 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1606 1607 /// Returns the expected execution cost. The unit of the cost does 1608 /// not matter because we use the 'cost' units to compare different 1609 /// vector widths. The cost that is returned is *not* normalized by 1610 /// the factor width. If \p Invalid is not nullptr, this function 1611 /// will add a pair(Instruction*, ElementCount) to \p Invalid for 1612 /// each instruction that has an Invalid cost for the given VF. 1613 using InstructionVFPair = std::pair<Instruction *, ElementCount>; 1614 VectorizationCostTy 1615 expectedCost(ElementCount VF, 1616 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); 1617 1618 /// Returns the execution time cost of an instruction for a given vector 1619 /// width. Vector width of one means scalar. 1620 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1621 1622 /// The cost-computation logic from getInstructionCost which provides 1623 /// the vector type as an output parameter. 1624 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1625 Type *&VectorTy); 1626 1627 /// Return the cost of instructions in an inloop reduction pattern, if I is 1628 /// part of that pattern. 1629 Optional<InstructionCost> 1630 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, 1631 TTI::TargetCostKind CostKind); 1632 1633 /// Calculate vectorization cost of memory instruction \p I. 1634 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1635 1636 /// The cost computation for scalarized memory instruction. 1637 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1638 1639 /// The cost computation for interleaving group of memory instructions. 1640 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1641 1642 /// The cost computation for Gather/Scatter instruction. 
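  /// For example (illustrative), a strided access like A[3 * i] that is not
  /// part of an interleave group may end up costed as a gather.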
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set, per vectorization factor, containing all BasicBlocks that are
  /// known to be present after vectorization as a predicated block.
  DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
      PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
1711 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1712 1713 /// PHINodes of the reductions that should be expanded in-loop along with 1714 /// their associated chains of reduction operations, in program order from top 1715 /// (PHI) to bottom 1716 ReductionChainMap InLoopReductionChains; 1717 1718 /// A Map of inloop reduction operations and their immediate chain operand. 1719 /// FIXME: This can be removed once reductions can be costed correctly in 1720 /// vplan. This was added to allow quick lookup to the inloop operations, 1721 /// without having to loop through InLoopReductionChains. 1722 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1723 1724 /// Returns the expected difference in cost from scalarizing the expression 1725 /// feeding a predicated instruction \p PredInst. The instructions to 1726 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1727 /// non-negative return value implies the expression will be scalarized. 1728 /// Currently, only single-use chains are considered for scalarization. 1729 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1730 ElementCount VF); 1731 1732 /// Collect the instructions that are uniform after vectorization. An 1733 /// instruction is uniform if we represent it with a single scalar value in 1734 /// the vectorized loop corresponding to each vector iteration. Examples of 1735 /// uniform instructions include pointer operands of consecutive or 1736 /// interleaved memory accesses. Note that although uniformity implies an 1737 /// instruction will be scalar, the reverse is not true. In general, a 1738 /// scalarized instruction will be represented by VF scalar values in the 1739 /// vectorized loop, each corresponding to an iteration of the original 1740 /// scalar loop. 1741 void collectLoopUniforms(ElementCount VF); 1742 1743 /// Collect the instructions that are scalar after vectorization. An 1744 /// instruction is scalar if it is known to be uniform or will be scalarized 1745 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1746 /// to the list if they are used by a load/store instruction that is marked as 1747 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1748 /// VF values in the vectorized loop, each corresponding to an iteration of 1749 /// the original scalar loop. 1750 void collectLoopScalars(ElementCount VF); 1751 1752 /// Keeps cost model vectorization decision and cost for instructions. 1753 /// Right now it is used for memory instructions only. 1754 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1755 std::pair<InstWidening, InstructionCost>>; 1756 1757 DecisionList WideningDecisions; 1758 1759 /// Returns true if \p V is expected to be vectorized and it needs to be 1760 /// extracted. 1761 bool needsExtract(Value *V, ElementCount VF) const { 1762 Instruction *I = dyn_cast<Instruction>(V); 1763 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1764 TheLoop->isLoopInvariant(I)) 1765 return false; 1766 1767 // Assume we can vectorize V (and hence we need extraction) if the 1768 // scalars are not computed yet. This can happen, because it is called 1769 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1770 // the scalars are collected. That should be a safe assumption in most 1771 // cases, because we check if the operands have vectorizable types 1772 // beforehand in LoopVectorizationLegality. 
1773 return Scalars.find(VF) == Scalars.end() || 1774 !isScalarAfterVectorization(I, VF); 1775 }; 1776 1777 /// Returns a range containing only operands needing to be extracted. 1778 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1779 ElementCount VF) const { 1780 return SmallVector<Value *, 4>(make_filter_range( 1781 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1782 } 1783 1784 /// Determines if we have the infrastructure to vectorize loop \p L and its 1785 /// epilogue, assuming the main loop is vectorized by \p VF. 1786 bool isCandidateForEpilogueVectorization(const Loop &L, 1787 const ElementCount VF) const; 1788 1789 /// Returns true if epilogue vectorization is considered profitable, and 1790 /// false otherwise. 1791 /// \p VF is the vectorization factor chosen for the original loop. 1792 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1793 1794 public: 1795 /// The loop that we evaluate. 1796 Loop *TheLoop; 1797 1798 /// Predicated scalar evolution analysis. 1799 PredicatedScalarEvolution &PSE; 1800 1801 /// Loop Info analysis. 1802 LoopInfo *LI; 1803 1804 /// Vectorization legality. 1805 LoopVectorizationLegality *Legal; 1806 1807 /// Vector target information. 1808 const TargetTransformInfo &TTI; 1809 1810 /// Target Library Info. 1811 const TargetLibraryInfo *TLI; 1812 1813 /// Demanded bits analysis. 1814 DemandedBits *DB; 1815 1816 /// Assumption cache. 1817 AssumptionCache *AC; 1818 1819 /// Interface to emit optimization remarks. 1820 OptimizationRemarkEmitter *ORE; 1821 1822 const Function *TheFunction; 1823 1824 /// Loop Vectorize Hint. 1825 const LoopVectorizeHints *Hints; 1826 1827 /// The interleave access information contains groups of interleaved accesses 1828 /// with the same stride and close to each other. 1829 InterleavedAccessInfo &InterleaveInfo; 1830 1831 /// Values to ignore in the cost model. 1832 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1833 1834 /// Values to ignore in the cost model when VF > 1. 1835 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1836 1837 /// All element types found in the loop. 1838 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1839 1840 /// Profitable vector factors. 1841 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1842 }; 1843 } // end namespace llvm 1844 1845 /// Helper struct to manage generating runtime checks for vectorization. 1846 /// 1847 /// The runtime checks are created up-front in temporary blocks to allow better 1848 /// estimating the cost and un-linked from the existing IR. After deciding to 1849 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1850 /// temporary blocks are completely removed. 1851 class GeneratedRTChecks { 1852 /// Basic block which contains the generated SCEV checks, if any. 1853 BasicBlock *SCEVCheckBlock = nullptr; 1854 1855 /// The value representing the result of the generated SCEV checks. If it is 1856 /// nullptr, either no SCEV checks have been generated or they have been used. 1857 Value *SCEVCheckCond = nullptr; 1858 1859 /// Basic block which contains the generated memory runtime checks, if any. 1860 BasicBlock *MemCheckBlock = nullptr; 1861 1862 /// The value representing the result of the generated memory runtime checks. 1863 /// If it is nullptr, either no memory runtime checks have been generated or 1864 /// they have been used. 
1865 Value *MemRuntimeCheckCond = nullptr; 1866 1867 DominatorTree *DT; 1868 LoopInfo *LI; 1869 TargetTransformInfo *TTI; 1870 1871 SCEVExpander SCEVExp; 1872 SCEVExpander MemCheckExp; 1873 1874 bool CostTooHigh = false; 1875 1876 public: 1877 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1878 TargetTransformInfo *TTI, const DataLayout &DL) 1879 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"), 1880 MemCheckExp(SE, DL, "scev.check") {} 1881 1882 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1883 /// accurately estimate the cost of the runtime checks. The blocks are 1884 /// un-linked from the IR and is added back during vector code generation. If 1885 /// there is no vector code generation, the check blocks are removed 1886 /// completely. 1887 void Create(Loop *L, const LoopAccessInfo &LAI, 1888 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { 1889 1890 // Hard cutoff to limit compile-time increase in case a very large number of 1891 // runtime checks needs to be generated. 1892 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to 1893 // profile info. 1894 CostTooHigh = 1895 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; 1896 if (CostTooHigh) 1897 return; 1898 1899 BasicBlock *LoopHeader = L->getHeader(); 1900 BasicBlock *Preheader = L->getLoopPreheader(); 1901 1902 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1903 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1904 // may be used by SCEVExpander. The blocks will be un-linked from their 1905 // predecessors and removed from LI & DT at the end of the function. 1906 if (!UnionPred.isAlwaysTrue()) { 1907 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1908 nullptr, "vector.scevcheck"); 1909 1910 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1911 &UnionPred, SCEVCheckBlock->getTerminator()); 1912 } 1913 1914 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1915 if (RtPtrChecking.Need) { 1916 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1917 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1918 "vector.memcheck"); 1919 1920 auto DiffChecks = RtPtrChecking.getDiffChecks(); 1921 if (DiffChecks) { 1922 Value *RuntimeVF = nullptr; 1923 MemRuntimeCheckCond = addDiffRuntimeChecks( 1924 MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp, 1925 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) { 1926 if (!RuntimeVF) 1927 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF); 1928 return RuntimeVF; 1929 }, 1930 IC); 1931 } else { 1932 MemRuntimeCheckCond = 1933 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1934 RtPtrChecking.getChecks(), MemCheckExp); 1935 } 1936 assert(MemRuntimeCheckCond && 1937 "no RT checks generated although RtPtrChecking " 1938 "claimed checks are required"); 1939 } 1940 1941 if (!MemCheckBlock && !SCEVCheckBlock) 1942 return; 1943 1944 // Unhook the temporary block with the checks, update various places 1945 // accordingly. 
1946 if (SCEVCheckBlock) 1947 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1948 if (MemCheckBlock) 1949 MemCheckBlock->replaceAllUsesWith(Preheader); 1950 1951 if (SCEVCheckBlock) { 1952 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1953 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1954 Preheader->getTerminator()->eraseFromParent(); 1955 } 1956 if (MemCheckBlock) { 1957 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1958 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1959 Preheader->getTerminator()->eraseFromParent(); 1960 } 1961 1962 DT->changeImmediateDominator(LoopHeader, Preheader); 1963 if (MemCheckBlock) { 1964 DT->eraseNode(MemCheckBlock); 1965 LI->removeBlock(MemCheckBlock); 1966 } 1967 if (SCEVCheckBlock) { 1968 DT->eraseNode(SCEVCheckBlock); 1969 LI->removeBlock(SCEVCheckBlock); 1970 } 1971 } 1972 1973 InstructionCost getCost() { 1974 if (SCEVCheckBlock || MemCheckBlock) 1975 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n"); 1976 1977 if (CostTooHigh) { 1978 InstructionCost Cost; 1979 Cost.setInvalid(); 1980 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n"); 1981 return Cost; 1982 } 1983 1984 InstructionCost RTCheckCost = 0; 1985 if (SCEVCheckBlock) 1986 for (Instruction &I : *SCEVCheckBlock) { 1987 if (SCEVCheckBlock->getTerminator() == &I) 1988 continue; 1989 InstructionCost C = 1990 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 1991 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 1992 RTCheckCost += C; 1993 } 1994 if (MemCheckBlock) 1995 for (Instruction &I : *MemCheckBlock) { 1996 if (MemCheckBlock->getTerminator() == &I) 1997 continue; 1998 InstructionCost C = 1999 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 2000 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 2001 RTCheckCost += C; 2002 } 2003 2004 if (SCEVCheckBlock || MemCheckBlock) 2005 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost 2006 << "\n"); 2007 2008 return RTCheckCost; 2009 } 2010 2011 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2012 /// unused. 2013 ~GeneratedRTChecks() { 2014 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2015 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2016 if (!SCEVCheckCond) 2017 SCEVCleaner.markResultUsed(); 2018 2019 if (!MemRuntimeCheckCond) 2020 MemCheckCleaner.markResultUsed(); 2021 2022 if (MemRuntimeCheckCond) { 2023 auto &SE = *MemCheckExp.getSE(); 2024 // Memory runtime check generation creates compares that use expanded 2025 // values. Remove them before running the SCEVExpanderCleaners. 2026 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2027 if (MemCheckExp.isInsertedInstruction(&I)) 2028 continue; 2029 SE.forgetValue(&I); 2030 I.eraseFromParent(); 2031 } 2032 } 2033 MemCheckCleaner.cleanup(); 2034 SCEVCleaner.cleanup(); 2035 2036 if (SCEVCheckCond) 2037 SCEVCheckBlock->eraseFromParent(); 2038 if (MemRuntimeCheckCond) 2039 MemCheckBlock->eraseFromParent(); 2040 } 2041 2042 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2043 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2044 /// depending on the generated condition. 2045 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2046 BasicBlock *LoopVectorPreHeader, 2047 BasicBlock *LoopExitBlock) { 2048 if (!SCEVCheckCond) 2049 return nullptr; 2050 2051 Value *Cond = SCEVCheckCond; 2052 // Mark the check as used, to prevent it from being removed during cleanup. 
2053 SCEVCheckCond = nullptr; 2054 if (auto *C = dyn_cast<ConstantInt>(Cond)) 2055 if (C->isZero()) 2056 return nullptr; 2057 2058 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2059 2060 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2061 // Create new preheader for vector loop. 2062 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2063 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2064 2065 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2066 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2067 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2068 SCEVCheckBlock); 2069 2070 DT->addNewBlock(SCEVCheckBlock, Pred); 2071 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2072 2073 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), 2074 BranchInst::Create(Bypass, LoopVectorPreHeader, Cond)); 2075 return SCEVCheckBlock; 2076 } 2077 2078 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2079 /// the branches to branch to the vector preheader or \p Bypass, depending on 2080 /// the generated condition. 2081 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2082 BasicBlock *LoopVectorPreHeader) { 2083 // Check if we generated code that checks in runtime if arrays overlap. 2084 if (!MemRuntimeCheckCond) 2085 return nullptr; 2086 2087 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2088 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2089 MemCheckBlock); 2090 2091 DT->addNewBlock(MemCheckBlock, Pred); 2092 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2093 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2094 2095 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2096 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2097 2098 ReplaceInstWithInst( 2099 MemCheckBlock->getTerminator(), 2100 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2101 MemCheckBlock->getTerminator()->setDebugLoc( 2102 Pred->getTerminator()->getDebugLoc()); 2103 2104 // Mark the check as used, to prevent it from being removed during cleanup. 2105 MemRuntimeCheckCond = nullptr; 2106 return MemCheckBlock; 2107 } 2108 }; 2109 2110 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2111 // vectorization. The loop needs to be annotated with #pragma omp simd 2112 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2113 // vector length information is not provided, vectorization is not considered 2114 // explicit. Interleave hints are not allowed either. These limitations will be 2115 // relaxed in the future. 2116 // Please, note that we are currently forced to abuse the pragma 'clang 2117 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2118 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2119 // provides *explicit vectorization hints* (LV can bypass legal checks and 2120 // assume that vectorization is legal). However, both hints are implemented 2121 // using the same metadata (llvm.loop.vectorize, processed by 2122 // LoopVectorizeHints). This will be fixed in the future when the native IR 2123 // representation for pragma 'omp simd' is introduced. 2124 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2125 OptimizationRemarkEmitter *ORE) { 2126 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2127 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2128 2129 // Only outer loops with an explicit vectorization hint are supported. 
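  // (For example, an outer loop annotated with
  //    #pragma clang loop vectorize(enable) vectorize_width(4)
  //  carries an explicit width hint -- illustrative only.)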
2130 // Unannotated outer loops are ignored. 2131 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2132 return false; 2133 2134 Function *Fn = OuterLp->getHeader()->getParent(); 2135 if (!Hints.allowVectorization(Fn, OuterLp, 2136 true /*VectorizeOnlyWhenForced*/)) { 2137 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2138 return false; 2139 } 2140 2141 if (Hints.getInterleave() > 1) { 2142 // TODO: Interleave support is future work. 2143 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2144 "outer loops.\n"); 2145 Hints.emitRemarkWithHints(); 2146 return false; 2147 } 2148 2149 return true; 2150 } 2151 2152 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2153 OptimizationRemarkEmitter *ORE, 2154 SmallVectorImpl<Loop *> &V) { 2155 // Collect inner loops and outer loops without irreducible control flow. For 2156 // now, only collect outer loops that have explicit vectorization hints. If we 2157 // are stress testing the VPlan H-CFG construction, we collect the outermost 2158 // loop of every loop nest. 2159 if (L.isInnermost() || VPlanBuildStressTest || 2160 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2161 LoopBlocksRPO RPOT(&L); 2162 RPOT.perform(LI); 2163 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2164 V.push_back(&L); 2165 // TODO: Collect inner loops inside marked outer loops in case 2166 // vectorization fails for the outer loop. Do not invoke 2167 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2168 // already known to be reducible. We can use an inherited attribute for 2169 // that. 2170 return; 2171 } 2172 } 2173 for (Loop *InnerL : L) 2174 collectSupportedLoops(*InnerL, LI, ORE, V); 2175 } 2176 2177 namespace { 2178 2179 /// The LoopVectorize Pass. 2180 struct LoopVectorize : public FunctionPass { 2181 /// Pass identification, replacement for typeid 2182 static char ID; 2183 2184 LoopVectorizePass Impl; 2185 2186 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2187 bool VectorizeOnlyWhenForced = false) 2188 : FunctionPass(ID), 2189 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2190 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2191 } 2192 2193 bool runOnFunction(Function &F) override { 2194 if (skipFunction(F)) 2195 return false; 2196 2197 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2198 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2199 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2200 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2201 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2202 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2203 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2204 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2205 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2206 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2207 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2208 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2209 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2210 2211 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2212 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2213 2214 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2215 GetLAA, *ORE, PSI).MadeAnyChange; 2216 } 2217 2218 void getAnalysisUsage(AnalysisUsage &AU) const override { 2219 AU.addRequired<AssumptionCacheTracker>(); 2220 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2221 AU.addRequired<DominatorTreeWrapperPass>(); 2222 AU.addRequired<LoopInfoWrapperPass>(); 2223 AU.addRequired<ScalarEvolutionWrapperPass>(); 2224 AU.addRequired<TargetTransformInfoWrapperPass>(); 2225 AU.addRequired<AAResultsWrapperPass>(); 2226 AU.addRequired<LoopAccessLegacyAnalysis>(); 2227 AU.addRequired<DemandedBitsWrapperPass>(); 2228 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2229 AU.addRequired<InjectTLIMappingsLegacy>(); 2230 2231 // We currently do not preserve loopinfo/dominator analyses with outer loop 2232 // vectorization. Until this is addressed, mark these analyses as preserved 2233 // only for non-VPlan-native path. 2234 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2235 if (!EnableVPlanNativePath) { 2236 AU.addPreserved<LoopInfoWrapperPass>(); 2237 AU.addPreserved<DominatorTreeWrapperPass>(); 2238 } 2239 2240 AU.addPreserved<BasicAAWrapperPass>(); 2241 AU.addPreserved<GlobalsAAWrapperPass>(); 2242 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2243 } 2244 }; 2245 2246 } // end anonymous namespace 2247 2248 //===----------------------------------------------------------------------===// 2249 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2250 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2251 //===----------------------------------------------------------------------===// 2252 2253 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2254 // We need to place the broadcast of invariant variables outside the loop, 2255 // but only if it's proven safe to do so. Else, broadcast will be inside 2256 // vector loop body. 2257 Instruction *Instr = dyn_cast<Instruction>(V); 2258 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2259 (!Instr || 2260 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2261 // Place the code for broadcasting invariant variables in the new preheader. 2262 IRBuilder<>::InsertPointGuard Guard(Builder); 2263 if (SafeToHoist) 2264 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2265 2266 // Broadcast the scalar into all locations in the vector. 2267 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2268 2269 return Shuf; 2270 } 2271 2272 /// This function adds 2273 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2274 /// to each vector element of Val. The sequence starts at StartIndex. 2275 /// \p Opcode is relevant for FP induction variable. 
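/// For example (illustrative, fixed VF = 4): with StartIdx = 0, Step = 2 and
/// Val = <10, 10, 10, 10>, the result is <10, 12, 14, 16>, i.e. lane i holds
/// Val[i] + (StartIdx + i) * Step.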
2276 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2277 Instruction::BinaryOps BinOp, ElementCount VF, 2278 IRBuilderBase &Builder) { 2279 assert(VF.isVector() && "only vector VFs are supported"); 2280 2281 // Create and check the types. 2282 auto *ValVTy = cast<VectorType>(Val->getType()); 2283 ElementCount VLen = ValVTy->getElementCount(); 2284 2285 Type *STy = Val->getType()->getScalarType(); 2286 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2287 "Induction Step must be an integer or FP"); 2288 assert(Step->getType() == STy && "Step has wrong type"); 2289 2290 SmallVector<Constant *, 8> Indices; 2291 2292 // Create a vector of consecutive numbers from zero to VF. 2293 VectorType *InitVecValVTy = ValVTy; 2294 if (STy->isFloatingPointTy()) { 2295 Type *InitVecValSTy = 2296 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2297 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2298 } 2299 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2300 2301 // Splat the StartIdx 2302 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2303 2304 if (STy->isIntegerTy()) { 2305 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2306 Step = Builder.CreateVectorSplat(VLen, Step); 2307 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2308 // FIXME: The newly created binary instructions should contain nsw/nuw 2309 // flags, which can be found from the original scalar operations. 2310 Step = Builder.CreateMul(InitVec, Step); 2311 return Builder.CreateAdd(Val, Step, "induction"); 2312 } 2313 2314 // Floating point induction. 2315 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2316 "Binary Opcode should be specified for FP induction"); 2317 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2318 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2319 2320 Step = Builder.CreateVectorSplat(VLen, Step); 2321 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2322 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2323 } 2324 2325 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 2326 /// variable on which to base the steps, \p Step is the size of the step. 2327 static void buildScalarSteps(Value *ScalarIV, Value *Step, 2328 const InductionDescriptor &ID, VPValue *Def, 2329 VPTransformState &State) { 2330 IRBuilderBase &Builder = State.Builder; 2331 // We shouldn't have to build scalar steps if we aren't vectorizing. 2332 assert(State.VF.isVector() && "VF should be greater than one"); 2333 // Get the value type and ensure it and the step have the same integer type. 2334 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2335 assert(ScalarIVTy == Step->getType() && 2336 "Val and Step should have the same type"); 2337 2338 // We build scalar steps for both integer and floating-point induction 2339 // variables. Here, we determine the kind of arithmetic we will perform. 2340 Instruction::BinaryOps AddOp; 2341 Instruction::BinaryOps MulOp; 2342 if (ScalarIVTy->isIntegerTy()) { 2343 AddOp = Instruction::Add; 2344 MulOp = Instruction::Mul; 2345 } else { 2346 AddOp = ID.getInductionOpcode(); 2347 MulOp = Instruction::FMul; 2348 } 2349 2350 // Determine the number of scalars we need to generate for each unroll 2351 // iteration. 2352 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); 2353 unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 2354 // Compute the scalar steps and save the results in State. 
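  // For example (illustrative, fixed VF = 4, UF = 2, Step = 1): lane L of
  // part P receives ScalarIV + (P * 4 + L), so part 0 covers ScalarIV + 0 ..
  // ScalarIV + 3 and part 1 covers ScalarIV + 4 .. ScalarIV + 7.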
2355 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2356 ScalarIVTy->getScalarSizeInBits()); 2357 Type *VecIVTy = nullptr; 2358 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2359 if (!FirstLaneOnly && State.VF.isScalable()) { 2360 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2361 UnitStepVec = 2362 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2363 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2364 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2365 } 2366 2367 for (unsigned Part = 0; Part < State.UF; ++Part) { 2368 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2369 2370 if (!FirstLaneOnly && State.VF.isScalable()) { 2371 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2372 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2373 if (ScalarIVTy->isFloatingPointTy()) 2374 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2375 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2376 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2377 State.set(Def, Add, Part); 2378 // It's useful to record the lane values too for the known minimum number 2379 // of elements so we do those below. This improves the code quality when 2380 // trying to extract the first element, for example. 2381 } 2382 2383 if (ScalarIVTy->isFloatingPointTy()) 2384 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2385 2386 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2387 Value *StartIdx = Builder.CreateBinOp( 2388 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2389 // The step returned by `createStepForVF` is a runtime-evaluated value 2390 // when VF is scalable. Otherwise, it should be folded into a Constant. 2391 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2392 "Expected StartIdx to be folded to a constant when VF is not " 2393 "scalable"); 2394 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2395 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2396 State.set(Def, Add, VPIteration(Part, Lane)); 2397 } 2398 } 2399 } 2400 2401 // Generate code for the induction step. Note that induction steps are 2402 // required to be loop-invariant 2403 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, 2404 Instruction *InsertBefore, 2405 Loop *OrigLoop = nullptr) { 2406 const DataLayout &DL = SE.getDataLayout(); 2407 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && 2408 "Induction step should be loop invariant"); 2409 if (auto *E = dyn_cast<SCEVUnknown>(Step)) 2410 return E->getValue(); 2411 2412 SCEVExpander Exp(SE, DL, "induction"); 2413 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); 2414 } 2415 2416 /// Compute the transformed value of Index at offset StartValue using step 2417 /// StepValue. 2418 /// For integer induction, returns StartValue + Index * StepValue. 2419 /// For pointer induction, returns StartValue[Index * StepValue]. 2420 /// FIXME: The newly created binary instructions should contain nsw/nuw 2421 /// flags, which can be found from the original scalar operations. 2422 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, 2423 Value *StartValue, Value *Step, 2424 const InductionDescriptor &ID) { 2425 assert(Index->getType()->getScalarType() == Step->getType() && 2426 "Index scalar type does not match StepValue type"); 2427 2428 // Note: the IR at this point is broken. 
We cannot use SE to create any new 2429 // SCEV and then expand it, hoping that SCEV's simplification will give us 2430 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2431 // lead to various SCEV crashes. So all we can do is to use builder and rely 2432 // on InstCombine for future simplifications. Here we handle some trivial 2433 // cases only. 2434 auto CreateAdd = [&B](Value *X, Value *Y) { 2435 assert(X->getType() == Y->getType() && "Types don't match!"); 2436 if (auto *CX = dyn_cast<ConstantInt>(X)) 2437 if (CX->isZero()) 2438 return Y; 2439 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2440 if (CY->isZero()) 2441 return X; 2442 return B.CreateAdd(X, Y); 2443 }; 2444 2445 // We allow X to be a vector type, in which case Y will potentially be 2446 // splatted into a vector with the same element count. 2447 auto CreateMul = [&B](Value *X, Value *Y) { 2448 assert(X->getType()->getScalarType() == Y->getType() && 2449 "Types don't match!"); 2450 if (auto *CX = dyn_cast<ConstantInt>(X)) 2451 if (CX->isOne()) 2452 return Y; 2453 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2454 if (CY->isOne()) 2455 return X; 2456 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2457 if (XVTy && !isa<VectorType>(Y->getType())) 2458 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2459 return B.CreateMul(X, Y); 2460 }; 2461 2462 switch (ID.getKind()) { 2463 case InductionDescriptor::IK_IntInduction: { 2464 assert(!isa<VectorType>(Index->getType()) && 2465 "Vector indices not supported for integer inductions yet"); 2466 assert(Index->getType() == StartValue->getType() && 2467 "Index type does not match StartValue type"); 2468 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2469 return B.CreateSub(StartValue, Index); 2470 auto *Offset = CreateMul(Index, Step); 2471 return CreateAdd(StartValue, Offset); 2472 } 2473 case InductionDescriptor::IK_PtrInduction: { 2474 assert(isa<Constant>(Step) && 2475 "Expected constant step for pointer induction"); 2476 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); 2477 } 2478 case InductionDescriptor::IK_FpInduction: { 2479 assert(!isa<VectorType>(Index->getType()) && 2480 "Vector indices not supported for FP inductions yet"); 2481 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2482 auto InductionBinOp = ID.getInductionBinOp(); 2483 assert(InductionBinOp && 2484 (InductionBinOp->getOpcode() == Instruction::FAdd || 2485 InductionBinOp->getOpcode() == Instruction::FSub) && 2486 "Original bin op should be defined for FP induction"); 2487 2488 Value *MulExp = B.CreateFMul(Step, Index); 2489 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2490 "induction"); 2491 } 2492 case InductionDescriptor::IK_NoInduction: 2493 return nullptr; 2494 } 2495 llvm_unreachable("invalid enum"); 2496 } 2497 2498 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2499 const VPIteration &Instance, 2500 VPTransformState &State) { 2501 Value *ScalarInst = State.get(Def, Instance); 2502 Value *VectorValue = State.get(Def, Instance.Part); 2503 VectorValue = Builder.CreateInsertElement( 2504 VectorValue, ScalarInst, 2505 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2506 State.set(Def, VectorValue, Instance.Part); 2507 } 2508 2509 // Return whether we allow using masked interleave-groups (for dealing with 2510 // strided loads/stores that reside in predicated blocks, or for dealing 2511 // with gaps). 
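// For example (illustrative), a group that accesses A[3 * i] and A[3 * i + 2]
// but never A[3 * i + 1] has a gap; a wide store for such a group needs a mask
// so the missing member is not written.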
2512 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2513 // If an override option has been passed in for interleaved accesses, use it. 2514 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2515 return EnableMaskedInterleavedMemAccesses; 2516 2517 return TTI.enableMaskedInterleavedAccessVectorization(); 2518 } 2519 2520 // Try to vectorize the interleave group that \p Instr belongs to. 2521 // 2522 // E.g. Translate following interleaved load group (factor = 3): 2523 // for (i = 0; i < N; i+=3) { 2524 // R = Pic[i]; // Member of index 0 2525 // G = Pic[i+1]; // Member of index 1 2526 // B = Pic[i+2]; // Member of index 2 2527 // ... // do something to R, G, B 2528 // } 2529 // To: 2530 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2531 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2532 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2533 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2534 // 2535 // Or translate following interleaved store group (factor = 3): 2536 // for (i = 0; i < N; i+=3) { 2537 // ... do something to R, G, B 2538 // Pic[i] = R; // Member of index 0 2539 // Pic[i+1] = G; // Member of index 1 2540 // Pic[i+2] = B; // Member of index 2 2541 // } 2542 // To: 2543 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2544 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2545 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2546 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2547 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2548 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2549 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2550 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2551 VPValue *BlockInMask) { 2552 Instruction *Instr = Group->getInsertPos(); 2553 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2554 2555 // Prepare for the vector type of the interleaved load/store. 2556 Type *ScalarTy = getLoadStoreType(Instr); 2557 unsigned InterleaveFactor = Group->getFactor(); 2558 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2559 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2560 2561 // Prepare for the new pointers. 2562 SmallVector<Value *, 2> AddrParts; 2563 unsigned Index = Group->getIndex(Instr); 2564 2565 // TODO: extend the masked interleaved-group support to reversed access. 2566 assert((!BlockInMask || !Group->isReverse()) && 2567 "Reversed masked interleave-group not supported."); 2568 2569 // If the group is reverse, adjust the index to refer to the last vector lane 2570 // instead of the first. We adjust the index from the first vector lane, 2571 // rather than directly getting the pointer for lane VF - 1, because the 2572 // pointer operand of the interleaved access is supposed to be uniform. For 2573 // uniform instructions, we're only required to generate a value for the 2574 // first vector lane in each unroll iteration. 2575 if (Group->isReverse()) 2576 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2577 2578 for (unsigned Part = 0; Part < UF; Part++) { 2579 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2580 State.setDebugLocFromInst(AddrPart); 2581 2582 // Notice current instruction could be any index. Need to adjust the address 2583 // to the member of index 0. 2584 // 2585 // E.g. 
a = A[i+1]; // Member of index 1 (Current instruction) 2586 // b = A[i]; // Member of index 0 2587 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2588 // 2589 // E.g. A[i+1] = a; // Member of index 1 2590 // A[i] = b; // Member of index 0 2591 // A[i+2] = c; // Member of index 2 (Current instruction) 2592 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2593 2594 bool InBounds = false; 2595 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2596 InBounds = gep->isInBounds(); 2597 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2598 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2599 2600 // Cast to the vector pointer type. 2601 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2602 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2603 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2604 } 2605 2606 State.setDebugLocFromInst(Instr); 2607 Value *PoisonVec = PoisonValue::get(VecTy); 2608 2609 Value *MaskForGaps = nullptr; 2610 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2611 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2612 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2613 } 2614 2615 // Vectorize the interleaved load group. 2616 if (isa<LoadInst>(Instr)) { 2617 // For each unroll part, create a wide load for the group. 2618 SmallVector<Value *, 2> NewLoads; 2619 for (unsigned Part = 0; Part < UF; Part++) { 2620 Instruction *NewLoad; 2621 if (BlockInMask || MaskForGaps) { 2622 assert(useMaskedInterleavedAccesses(*TTI) && 2623 "masked interleaved groups are not allowed."); 2624 Value *GroupMask = MaskForGaps; 2625 if (BlockInMask) { 2626 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2627 Value *ShuffledMask = Builder.CreateShuffleVector( 2628 BlockInMaskPart, 2629 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2630 "interleaved.mask"); 2631 GroupMask = MaskForGaps 2632 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2633 MaskForGaps) 2634 : ShuffledMask; 2635 } 2636 NewLoad = 2637 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2638 GroupMask, PoisonVec, "wide.masked.vec"); 2639 } 2640 else 2641 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2642 Group->getAlign(), "wide.vec"); 2643 Group->addMetadata(NewLoad); 2644 NewLoads.push_back(NewLoad); 2645 } 2646 2647 // For each member in the group, shuffle out the appropriate data from the 2648 // wide loads. 2649 unsigned J = 0; 2650 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2651 Instruction *Member = Group->getMember(I); 2652 2653 // Skip the gaps in the group. 2654 if (!Member) 2655 continue; 2656 2657 auto StrideMask = 2658 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2659 for (unsigned Part = 0; Part < UF; Part++) { 2660 Value *StridedVec = Builder.CreateShuffleVector( 2661 NewLoads[Part], StrideMask, "strided.vec"); 2662 2663 // If this member has different type, cast the result type. 
2664 if (Member->getType() != ScalarTy) { 2665 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2666 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2667 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2668 } 2669 2670 if (Group->isReverse()) 2671 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2672 2673 State.set(VPDefs[J], StridedVec, Part); 2674 } 2675 ++J; 2676 } 2677 return; 2678 } 2679 2680 // The sub vector type for current instruction. 2681 auto *SubVT = VectorType::get(ScalarTy, VF); 2682 2683 // Vectorize the interleaved store group. 2684 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2685 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2686 "masked interleaved groups are not allowed."); 2687 assert((!MaskForGaps || !VF.isScalable()) && 2688 "masking gaps for scalable vectors is not yet supported."); 2689 for (unsigned Part = 0; Part < UF; Part++) { 2690 // Collect the stored vector from each member. 2691 SmallVector<Value *, 4> StoredVecs; 2692 for (unsigned i = 0; i < InterleaveFactor; i++) { 2693 assert((Group->getMember(i) || MaskForGaps) && 2694 "Fail to get a member from an interleaved store group"); 2695 Instruction *Member = Group->getMember(i); 2696 2697 // Skip the gaps in the group. 2698 if (!Member) { 2699 Value *Undef = PoisonValue::get(SubVT); 2700 StoredVecs.push_back(Undef); 2701 continue; 2702 } 2703 2704 Value *StoredVec = State.get(StoredValues[i], Part); 2705 2706 if (Group->isReverse()) 2707 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2708 2709 // If this member has different type, cast it to a unified type. 2710 2711 if (StoredVec->getType() != SubVT) 2712 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2713 2714 StoredVecs.push_back(StoredVec); 2715 } 2716 2717 // Concatenate all vectors into a wide vector. 2718 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2719 2720 // Interleave the elements in the wide vector. 2721 Value *IVec = Builder.CreateShuffleVector( 2722 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2723 "interleaved.vec"); 2724 2725 Instruction *NewStoreInstr; 2726 if (BlockInMask || MaskForGaps) { 2727 Value *GroupMask = MaskForGaps; 2728 if (BlockInMask) { 2729 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2730 Value *ShuffledMask = Builder.CreateShuffleVector( 2731 BlockInMaskPart, 2732 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2733 "interleaved.mask"); 2734 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2735 ShuffledMask, MaskForGaps) 2736 : ShuffledMask; 2737 } 2738 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2739 Group->getAlign(), GroupMask); 2740 } else 2741 NewStoreInstr = 2742 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2743 2744 Group->addMetadata(NewStoreInstr); 2745 } 2746 } 2747 2748 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2749 VPReplicateRecipe *RepRecipe, 2750 const VPIteration &Instance, 2751 bool IfPredicateInstr, 2752 VPTransformState &State) { 2753 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2754 2755 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2756 // the first lane and part. 2757 if (isa<NoAliasScopeDeclInst>(Instr)) 2758 if (!Instance.isFirstIteration()) 2759 return; 2760 2761 // Does this instruction return a value ? 
2762 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2763 2764 Instruction *Cloned = Instr->clone(); 2765 if (!IsVoidRetTy) 2766 Cloned->setName(Instr->getName() + ".cloned"); 2767 2768 // If the scalarized instruction contributes to the address computation of a 2769 // widen masked load/store which was in a basic block that needed predication 2770 // and is not predicated after vectorization, we can't propagate 2771 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2772 // instruction could feed a poison value to the base address of the widen 2773 // load/store. 2774 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 2775 Cloned->dropPoisonGeneratingFlags(); 2776 2777 if (Instr->getDebugLoc()) 2778 State.setDebugLocFromInst(Instr); 2779 2780 // Replace the operands of the cloned instructions with their scalar 2781 // equivalents in the new loop. 2782 for (auto &I : enumerate(RepRecipe->operands())) { 2783 auto InputInstance = Instance; 2784 VPValue *Operand = I.value(); 2785 VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand); 2786 if (OperandR && OperandR->isUniform()) 2787 InputInstance.Lane = VPLane::getFirstLane(); 2788 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2789 } 2790 State.addNewMetadata(Cloned, Instr); 2791 2792 // Place the cloned scalar in the new loop. 2793 State.Builder.Insert(Cloned); 2794 2795 State.set(RepRecipe, Cloned, Instance); 2796 2797 // If we just cloned a new assumption, add it the assumption cache. 2798 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2799 AC->registerAssumption(II); 2800 2801 // End if-block. 2802 if (IfPredicateInstr) 2803 PredicatedInstructions.push_back(Cloned); 2804 } 2805 2806 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) { 2807 if (TripCount) 2808 return TripCount; 2809 2810 assert(InsertBlock); 2811 IRBuilder<> Builder(InsertBlock->getTerminator()); 2812 // Find the loop boundaries. 2813 ScalarEvolution *SE = PSE.getSE(); 2814 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2815 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 2816 "Invalid loop count"); 2817 2818 Type *IdxTy = Legal->getWidestInductionType(); 2819 assert(IdxTy && "No type for induction"); 2820 2821 // The exit count might have the type of i64 while the phi is i32. This can 2822 // happen if we have an induction variable that is sign extended before the 2823 // compare. The only way that we get a backedge taken count is that the 2824 // induction variable was signed and as such will not overflow. In such a case 2825 // truncation is legal. 2826 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2827 IdxTy->getPrimitiveSizeInBits()) 2828 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2829 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2830 2831 // Get the total trip count from the count by adding 1. 2832 const SCEV *ExitCount = SE->getAddExpr( 2833 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2834 2835 const DataLayout &DL = InsertBlock->getModule()->getDataLayout(); 2836 2837 // Expand the trip count and place the new instructions in the preheader. 2838 // Notice that the pre-header does not change, only the loop body. 2839 SCEVExpander Exp(*SE, DL, "induction"); 2840 2841 // Count holds the overall loop count (N). 
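// For example (a sketch): for `for (i = 0; i != n; ++i)` the backedge-taken
// count is `n - 1`, so the trip count expanded below is `(n - 1) + 1`, which
// SCEV normally folds back to plain `n`.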
2842 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2843 InsertBlock->getTerminator()); 2844 2845 if (TripCount->getType()->isPointerTy()) 2846 TripCount = 2847 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2848 InsertBlock->getTerminator()); 2849 2850 return TripCount; 2851 } 2852 2853 Value * 2854 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2855 if (VectorTripCount) 2856 return VectorTripCount; 2857 2858 Value *TC = getOrCreateTripCount(InsertBlock); 2859 IRBuilder<> Builder(InsertBlock->getTerminator()); 2860 2861 Type *Ty = TC->getType(); 2862 // This is where we can make the step a runtime constant. 2863 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2864 2865 // If the tail is to be folded by masking, round the number of iterations N 2866 // up to a multiple of Step instead of rounding down. This is done by first 2867 // adding Step-1 and then rounding down. Note that it's ok if this addition 2868 // overflows: the vector induction variable will eventually wrap to zero given 2869 // that it starts at zero and its Step is a power of two; the loop will then 2870 // exit, with the last early-exit vector comparison also producing all-true. 2871 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2872 // is accounted for in emitIterationCountCheck that adds an overflow check. 2873 if (Cost->foldTailByMasking()) { 2874 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2875 "VF*UF must be a power of 2 when folding tail by masking"); 2876 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 2877 TC = Builder.CreateAdd( 2878 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 2879 } 2880 2881 // Now we need to generate the expression for the part of the loop that the 2882 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2883 // iterations are not required for correctness, or N - Step, otherwise. Step 2884 // is equal to the vectorization factor (number of SIMD elements) times the 2885 // unroll factor (number of SIMD instructions). 2886 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2887 2888 // There are cases where we *must* run at least one iteration in the remainder 2889 // loop. See the cost model for when this can happen. If the step evenly 2890 // divides the trip count, we set the remainder to be equal to the step. If 2891 // the step does not evenly divide the trip count, no adjustment is necessary 2892 // since there will already be scalar iterations. Note that the minimum 2893 // iterations check ensures that N >= Step. 2894 if (Cost->requiresScalarEpilogue(VF)) { 2895 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2896 R = Builder.CreateSelect(IsZero, Step, R); 2897 } 2898 2899 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2900 2901 return VectorTripCount; 2902 } 2903 2904 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2905 const DataLayout &DL) { 2906 // Verify that V is a vector type with same number of elements as DstVTy. 
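// (Illustrative sketch of the two cases handled below: <4 x i32> -> <4 x float>
// is a single bitcast, whereas <2 x double> -> <2 x i8*> on a 64-bit target
// goes through <2 x i64>, i.e. a bitcast followed by an inttoptr.)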
2907 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 2908 unsigned VF = DstFVTy->getNumElements(); 2909 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 2910 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2911 Type *SrcElemTy = SrcVecTy->getElementType(); 2912 Type *DstElemTy = DstFVTy->getElementType(); 2913 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2914 "Vector elements must have same size"); 2915 2916 // Do a direct cast if element types are castable. 2917 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2918 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2919 } 2920 // V cannot be directly casted to desired vector type. 2921 // May happen when V is a floating point vector but DstVTy is a vector of 2922 // pointers or vice-versa. Handle this using a two-step bitcast using an 2923 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2924 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2925 "Only one type should be a pointer type"); 2926 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2927 "Only one type should be a floating point type"); 2928 Type *IntTy = 2929 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2930 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2931 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2932 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2933 } 2934 2935 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2936 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 2937 // Reuse existing vector loop preheader for TC checks. 2938 // Note that new preheader block is generated for vector loop. 2939 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2940 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2941 2942 // Generate code to check if the loop's trip count is less than VF * UF, or 2943 // equal to it in case a scalar epilogue is required; this implies that the 2944 // vector trip count is zero. This check also covers the case where adding one 2945 // to the backedge-taken count overflowed leading to an incorrect trip count 2946 // of zero. In this case we will also jump to the scalar loop. 2947 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 2948 : ICmpInst::ICMP_ULT; 2949 2950 // If tail is to be folded, vector loop takes care of all iterations. 2951 Type *CountTy = Count->getType(); 2952 Value *CheckMinIters = Builder.getFalse(); 2953 auto CreateStep = [&]() -> Value * { 2954 // Create step with max(MinProTripCount, UF * VF). 2955 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue()) 2956 return createStepForVF(Builder, CountTy, VF, UF); 2957 2958 Value *MinProfTC = 2959 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); 2960 if (!VF.isScalable()) 2961 return MinProfTC; 2962 return Builder.CreateBinaryIntrinsic( 2963 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); 2964 }; 2965 2966 if (!Cost->foldTailByMasking()) 2967 CheckMinIters = 2968 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); 2969 else if (VF.isScalable()) { 2970 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2971 // an overflow to zero when updating induction variables and so an 2972 // additional overflow check is required before entering the vector loop. 2973 2974 // Get the maximum unsigned value for the type. 
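// (Sketch of the problem being guarded against: for scalable vectors VF * UF
// need not be a power of two, so e.g. an i8 trip count of 254 with VF * UF = 6
// would wrap past 255 when rounded up for tail folding; such loops must take
// the scalar path.)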
2975 Value *MaxUIntTripCount = 2976 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 2977 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 2978 2979 // Don't execute the vector loop if (UMax - n) < (VF * UF). 2980 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); 2981 } 2982 2983 // Create new preheader for vector loop. 2984 LoopVectorPreHeader = 2985 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2986 "vector.ph"); 2987 2988 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2989 DT->getNode(Bypass)->getIDom()) && 2990 "TC check is expected to dominate Bypass"); 2991 2992 // Update dominator for Bypass & LoopExit (if needed). 2993 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2994 if (!Cost->requiresScalarEpilogue(VF)) 2995 // If there is an epilogue which must run, there's no edge from the 2996 // middle block to exit blocks and thus no need to update the immediate 2997 // dominator of the exit blocks. 2998 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2999 3000 ReplaceInstWithInst( 3001 TCCheckBlock->getTerminator(), 3002 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3003 LoopBypassBlocks.push_back(TCCheckBlock); 3004 } 3005 3006 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 3007 BasicBlock *const SCEVCheckBlock = 3008 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); 3009 if (!SCEVCheckBlock) 3010 return nullptr; 3011 3012 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3013 (OptForSizeBasedOnProfile && 3014 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3015 "Cannot SCEV check stride or overflow when optimizing for size"); 3016 3017 3018 // Update dominator only if this is first RT check. 3019 if (LoopBypassBlocks.empty()) { 3020 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3021 if (!Cost->requiresScalarEpilogue(VF)) 3022 // If there is an epilogue which must run, there's no edge from the 3023 // middle block to exit blocks and thus no need to update the immediate 3024 // dominator of the exit blocks. 3025 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3026 } 3027 3028 LoopBypassBlocks.push_back(SCEVCheckBlock); 3029 AddedSafetyChecks = true; 3030 return SCEVCheckBlock; 3031 } 3032 3033 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { 3034 // VPlan-native path does not do any analysis for runtime checks currently. 3035 if (EnableVPlanNativePath) 3036 return nullptr; 3037 3038 BasicBlock *const MemCheckBlock = 3039 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); 3040 3041 // Check if we generated code that checks in runtime if arrays overlap. We put 3042 // the checks into a separate block to make the more common case of few 3043 // elements faster. 
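// A typical emitted check looks roughly like (illustrative sketch):
//   %bound0 = icmp ult ptr %a.start, %b.end
//   %bound1 = icmp ult ptr %b.start, %a.end
//   %found.conflict = and i1 %bound0, %bound1
// with a conflict branching to the scalar loop.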
3044 if (!MemCheckBlock)
3045 return nullptr;
3046
3047 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3048 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3049 "Cannot emit memory checks when optimizing for size, unless forced "
3050 "to vectorize.");
3051 ORE->emit([&]() {
3052 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3053 OrigLoop->getStartLoc(),
3054 OrigLoop->getHeader())
3055 << "Code-size may be reduced by not forcing "
3056 "vectorization, or by source-code modifications "
3057 "eliminating the need for runtime checks "
3058 "(e.g., adding 'restrict').";
3059 });
3060 }
3061
3062 LoopBypassBlocks.push_back(MemCheckBlock);
3063
3064 AddedSafetyChecks = true;
3065
3066 return MemCheckBlock;
3067 }
3068
3069 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3070 LoopScalarBody = OrigLoop->getHeader();
3071 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3072 assert(LoopVectorPreHeader && "Invalid loop structure");
3073 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3074 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3075 "multiple exit loop without required epilogue?");
3076
3077 LoopMiddleBlock =
3078 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3079 LI, nullptr, Twine(Prefix) + "middle.block");
3080 LoopScalarPreHeader =
3081 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3082 nullptr, Twine(Prefix) + "scalar.ph");
3083
3084 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3085
3086 // Set up the middle block terminator. Two cases:
3087 // 1) If we know that we must execute the scalar epilogue, emit an
3088 // unconditional branch.
3089 // 2) Otherwise, we must have a single unique exit block (due to how we
3090 // implement the multiple exit case). In this case, set up a conditional
3091 // branch from the middle block to the loop scalar preheader, and the
3092 // exit block. completeLoopSkeleton will update the condition to use an
3093 // iteration check, if required to decide whether to execute the remainder.
3094 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3095 BranchInst::Create(LoopScalarPreHeader) :
3096 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3097 Builder.getTrue());
3098 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3099 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3100
3101 // Update dominator for loop exit. During skeleton creation, only the vector
3102 // pre-header and the middle block are created. The vector loop is entirely
3103 // created during VPlan execution.
3104 if (!Cost->requiresScalarEpilogue(VF))
3105 // If there is an epilogue which must run, there's no edge from the
3106 // middle block to exit blocks and thus no need to update the immediate
3107 // dominator of the exit blocks.
3108 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3109 }
3110
3111 void InnerLoopVectorizer::createInductionResumeValues(
3112 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3113 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3114 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3115 "Inconsistent information about additional bypass.");
3116
3117 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3118 assert(VectorTripCount && "Expected valid arguments");
3119 // We are going to resume the execution of the scalar loop.
3120 // Go over all of the induction variables that we found and fix the 3121 // PHIs that are left in the scalar version of the loop. 3122 // The starting values of PHI nodes depend on the counter of the last 3123 // iteration in the vectorized loop. 3124 // If we come from a bypass edge then we need to start from the original 3125 // start value. 3126 Instruction *OldInduction = Legal->getPrimaryInduction(); 3127 for (auto &InductionEntry : Legal->getInductionVars()) { 3128 PHINode *OrigPhi = InductionEntry.first; 3129 InductionDescriptor II = InductionEntry.second; 3130 3131 Value *&EndValue = IVEndValues[OrigPhi]; 3132 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3133 if (OrigPhi == OldInduction) { 3134 // We know what the end value is. 3135 EndValue = VectorTripCount; 3136 } else { 3137 IRBuilder<> B(LoopVectorPreHeader->getTerminator()); 3138 3139 // Fast-math-flags propagate from the original induction instruction. 3140 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3141 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3142 3143 Type *StepType = II.getStep()->getType(); 3144 Instruction::CastOps CastOp = 3145 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3146 Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc"); 3147 Value *Step = 3148 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3149 EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); 3150 EndValue->setName("ind.end"); 3151 3152 // Compute the end value for the additional bypass (if applicable). 3153 if (AdditionalBypass.first) { 3154 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3155 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3156 StepType, true); 3157 Value *Step = 3158 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3159 VTC = 3160 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc"); 3161 EndValueFromAdditionalBypass = 3162 emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); 3163 EndValueFromAdditionalBypass->setName("ind.end"); 3164 } 3165 } 3166 3167 // Create phi nodes to merge from the backedge-taken check block. 3168 PHINode *BCResumeVal = 3169 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3170 LoopScalarPreHeader->getTerminator()); 3171 // Copy original phi DL over to the new one. 3172 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3173 3174 // The new PHI merges the original incoming value, in case of a bypass, 3175 // or the value at the end of the vectorized loop. 3176 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3177 3178 // Fix the scalar body counter (PHI node). 3179 // The old induction's phi node in the scalar body needs the truncated 3180 // value. 3181 for (BasicBlock *BB : LoopBypassBlocks) 3182 BCResumeVal->addIncoming(II.getStartValue(), BB); 3183 3184 if (AdditionalBypass.first) 3185 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3186 EndValueFromAdditionalBypass); 3187 3188 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3189 } 3190 } 3191 3192 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) { 3193 // The trip counts should be cached by now. 
3194 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 3195 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3196 3197 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3198 3199 // Add a check in the middle block to see if we have completed 3200 // all of the iterations in the first vector loop. Three cases: 3201 // 1) If we require a scalar epilogue, there is no conditional branch as 3202 // we unconditionally branch to the scalar preheader. Do nothing. 3203 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3204 // Thus if tail is to be folded, we know we don't need to run the 3205 // remainder and we can use the previous value for the condition (true). 3206 // 3) Otherwise, construct a runtime check. 3207 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3208 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3209 Count, VectorTripCount, "cmp.n", 3210 LoopMiddleBlock->getTerminator()); 3211 3212 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3213 // of the corresponding compare because they may have ended up with 3214 // different line numbers and we want to avoid awkward line stepping while 3215 // debugging. Eg. if the compare has got a line number inside the loop. 3216 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3217 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3218 } 3219 3220 #ifdef EXPENSIVE_CHECKS 3221 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3222 #endif 3223 3224 return LoopVectorPreHeader; 3225 } 3226 3227 std::pair<BasicBlock *, Value *> 3228 InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3229 /* 3230 In this function we generate a new loop. The new loop will contain 3231 the vectorized instructions while the old loop will continue to run the 3232 scalar remainder. 3233 3234 [ ] <-- loop iteration number check. 3235 / | 3236 / v 3237 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3238 | / | 3239 | / v 3240 || [ ] <-- vector pre header. 3241 |/ | 3242 | v 3243 | [ ] \ 3244 | [ ]_| <-- vector loop (created during VPlan execution). 3245 | | 3246 | v 3247 \ -[ ] <--- middle-block. 3248 \/ | 3249 /\ v 3250 | ->[ ] <--- new preheader. 3251 | | 3252 (opt) v <-- edge from middle to exit iff epilogue is not required. 3253 | [ ] \ 3254 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3255 \ | 3256 \ v 3257 >[ ] <-- exit block(s). 3258 ... 3259 */ 3260 3261 // Get the metadata of the original loop before it gets modified. 3262 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3263 3264 // Workaround! Compute the trip count of the original loop and cache it 3265 // before we start modifying the CFG. This code has a systemic problem 3266 // wherein it tries to run analysis over partially constructed IR; this is 3267 // wrong, and not simply for SCEV. The trip count of the original loop 3268 // simply happens to be prone to hitting this in practice. In theory, we 3269 // can hit the same issue for any SCEV, or ValueTracking query done during 3270 // mutation. See PR49900. 3271 getOrCreateTripCount(OrigLoop->getLoopPreheader()); 3272 3273 // Create an empty vector loop, and prepare basic blocks for the runtime 3274 // checks. 3275 createVectorLoopSkeleton(""); 3276 3277 // Now, compare the new count to zero. If it is zero skip the vector loop and 3278 // jump to the scalar loop. 
This check also covers the case where the 3279 // backedge-taken count is uint##_max: adding one to it will overflow leading 3280 // to an incorrect trip count of zero. In this (rare) case we will also jump 3281 // to the scalar loop. 3282 emitIterationCountCheck(LoopScalarPreHeader); 3283 3284 // Generate the code to check any assumptions that we've made for SCEV 3285 // expressions. 3286 emitSCEVChecks(LoopScalarPreHeader); 3287 3288 // Generate the code that checks in runtime if arrays overlap. We put the 3289 // checks into a separate block to make the more common case of few elements 3290 // faster. 3291 emitMemRuntimeChecks(LoopScalarPreHeader); 3292 3293 // Emit phis for the new starting index of the scalar loop. 3294 createInductionResumeValues(); 3295 3296 return {completeLoopSkeleton(OrigLoopID), nullptr}; 3297 } 3298 3299 // Fix up external users of the induction variable. At this point, we are 3300 // in LCSSA form, with all external PHIs that use the IV having one input value, 3301 // coming from the remainder loop. We need those PHIs to also have a correct 3302 // value for the IV when arriving directly from the middle block. 3303 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3304 const InductionDescriptor &II, 3305 Value *VectorTripCount, Value *EndValue, 3306 BasicBlock *MiddleBlock, 3307 BasicBlock *VectorHeader, VPlan &Plan) { 3308 // There are two kinds of external IV usages - those that use the value 3309 // computed in the last iteration (the PHI) and those that use the penultimate 3310 // value (the value that feeds into the phi from the loop latch). 3311 // We allow both, but they, obviously, have different values. 3312 3313 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3314 3315 DenseMap<Value *, Value *> MissingVals; 3316 3317 // An external user of the last iteration's value should see the value that 3318 // the remainder loop uses to initialize its own IV. 3319 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3320 for (User *U : PostInc->users()) { 3321 Instruction *UI = cast<Instruction>(U); 3322 if (!OrigLoop->contains(UI)) { 3323 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3324 MissingVals[UI] = EndValue; 3325 } 3326 } 3327 3328 // An external user of the penultimate value need to see EndValue - Step. 3329 // The simplest way to get this is to recompute it from the constituent SCEVs, 3330 // that is Start + (Step * (CRD - 1)). 3331 for (User *U : OrigPhi->users()) { 3332 auto *UI = cast<Instruction>(U); 3333 if (!OrigLoop->contains(UI)) { 3334 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3335 3336 IRBuilder<> B(MiddleBlock->getTerminator()); 3337 3338 // Fast-math-flags propagate from the original induction instruction. 3339 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3340 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3341 3342 Value *CountMinusOne = B.CreateSub( 3343 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); 3344 Value *CMO = 3345 !II.getStep()->getType()->isIntegerTy() 3346 ? 
B.CreateCast(Instruction::SIToFP, CountMinusOne, 3347 II.getStep()->getType()) 3348 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3349 CMO->setName("cast.cmo"); 3350 3351 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 3352 VectorHeader->getTerminator()); 3353 Value *Escape = 3354 emitTransformedIndex(B, CMO, II.getStartValue(), Step, II); 3355 Escape->setName("ind.escape"); 3356 MissingVals[UI] = Escape; 3357 } 3358 } 3359 3360 for (auto &I : MissingVals) { 3361 PHINode *PHI = cast<PHINode>(I.first); 3362 // One corner case we have to handle is two IVs "chasing" each-other, 3363 // that is %IV2 = phi [...], [ %IV1, %latch ] 3364 // In this case, if IV1 has an external use, we need to avoid adding both 3365 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3366 // don't already have an incoming value for the middle block. 3367 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) { 3368 PHI->addIncoming(I.second, MiddleBlock); 3369 Plan.removeLiveOut(PHI); 3370 } 3371 } 3372 } 3373 3374 namespace { 3375 3376 struct CSEDenseMapInfo { 3377 static bool canHandle(const Instruction *I) { 3378 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3379 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3380 } 3381 3382 static inline Instruction *getEmptyKey() { 3383 return DenseMapInfo<Instruction *>::getEmptyKey(); 3384 } 3385 3386 static inline Instruction *getTombstoneKey() { 3387 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3388 } 3389 3390 static unsigned getHashValue(const Instruction *I) { 3391 assert(canHandle(I) && "Unknown instruction!"); 3392 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3393 I->value_op_end())); 3394 } 3395 3396 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3397 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3398 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3399 return LHS == RHS; 3400 return LHS->isIdenticalTo(RHS); 3401 } 3402 }; 3403 3404 } // end anonymous namespace 3405 3406 ///Perform cse of induction variable instructions. 3407 static void cse(BasicBlock *BB) { 3408 // Perform simple cse. 3409 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3410 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3411 if (!CSEDenseMapInfo::canHandle(&In)) 3412 continue; 3413 3414 // Check if we can replace this instruction with any of the 3415 // visited instructions. 3416 if (Instruction *V = CSEMap.lookup(&In)) { 3417 In.replaceAllUsesWith(V); 3418 In.eraseFromParent(); 3419 continue; 3420 } 3421 3422 CSEMap[&In] = &In; 3423 } 3424 } 3425 3426 InstructionCost 3427 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3428 bool &NeedToScalarize) const { 3429 Function *F = CI->getCalledFunction(); 3430 Type *ScalarRetTy = CI->getType(); 3431 SmallVector<Type *, 4> Tys, ScalarTys; 3432 for (auto &ArgOp : CI->args()) 3433 ScalarTys.push_back(ArgOp->getType()); 3434 3435 // Estimate cost of scalarized vector call. The source operands are assumed 3436 // to be vectors, so we need to extract individual elements from there, 3437 // execute VF scalar calls, and then gather the result into the vector return 3438 // value. 3439 InstructionCost ScalarCallCost = 3440 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3441 if (VF.isScalar()) 3442 return ScalarCallCost; 3443 3444 // Compute corresponding vector type for return value and arguments. 
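// (Illustrative sketch of the decision below: for a call to `sinf` at VF = 4,
// the scalarized cost is 4 * the scalar call cost plus extract/insert
// overhead; if the VFDatabase knows a 4-lane vector variant, its call cost is
// used instead whenever it is cheaper.)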
3445 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3446 for (Type *ScalarTy : ScalarTys) 3447 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3448 3449 // Compute costs of unpacking argument values for the scalar calls and 3450 // packing the return values to a vector. 3451 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3452 3453 InstructionCost Cost = 3454 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3455 3456 // If we can't emit a vector call for this function, then the currently found 3457 // cost is the cost we need to return. 3458 NeedToScalarize = true; 3459 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3460 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3461 3462 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3463 return Cost; 3464 3465 // If the corresponding vector cost is cheaper, return its cost. 3466 InstructionCost VectorCallCost = 3467 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3468 if (VectorCallCost < Cost) { 3469 NeedToScalarize = false; 3470 Cost = VectorCallCost; 3471 } 3472 return Cost; 3473 } 3474 3475 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3476 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3477 return Elt; 3478 return VectorType::get(Elt, VF); 3479 } 3480 3481 InstructionCost 3482 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3483 ElementCount VF) const { 3484 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3485 assert(ID && "Expected intrinsic call!"); 3486 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3487 FastMathFlags FMF; 3488 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3489 FMF = FPMO->getFastMathFlags(); 3490 3491 SmallVector<const Value *> Arguments(CI->args()); 3492 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3493 SmallVector<Type *> ParamTys; 3494 std::transform(FTy->param_begin(), FTy->param_end(), 3495 std::back_inserter(ParamTys), 3496 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3497 3498 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3499 dyn_cast<IntrinsicInst>(CI)); 3500 return TTI.getIntrinsicInstrCost(CostAttrs, 3501 TargetTransformInfo::TCK_RecipThroughput); 3502 } 3503 3504 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3505 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3506 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3507 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3508 } 3509 3510 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3511 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3512 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3513 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3514 } 3515 3516 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3517 // For every instruction `I` in MinBWs, truncate the operands, create a 3518 // truncated version of `I` and reextend its result. InstCombine runs 3519 // later and will remove any ext/trunc pairs. 3520 SmallPtrSet<Value *, 4> Erased; 3521 for (const auto &KV : Cost->getMinimalBitwidths()) { 3522 // If the value wasn't vectorized, we must maintain the original scalar 3523 // type. The absence of the value from State indicates that it 3524 // wasn't vectorized. 3525 // FIXME: Should not rely on getVPValue at this point. 
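// (Illustrative sketch of the rewrite performed below: if MinBWs records that
// an i32 add only needs 8 bits, a <4 x i32> add is re-emitted as truncs of its
// operands to <4 x i8>, a <4 x i8> add, and a zext of the result back to
// <4 x i32>; InstCombine later removes redundant ext/trunc pairs.)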
3526 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3527 if (!State.hasAnyVectorValue(Def)) 3528 continue; 3529 for (unsigned Part = 0; Part < UF; ++Part) { 3530 Value *I = State.get(Def, Part); 3531 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3532 continue; 3533 Type *OriginalTy = I->getType(); 3534 Type *ScalarTruncatedTy = 3535 IntegerType::get(OriginalTy->getContext(), KV.second); 3536 auto *TruncatedTy = VectorType::get( 3537 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3538 if (TruncatedTy == OriginalTy) 3539 continue; 3540 3541 IRBuilder<> B(cast<Instruction>(I)); 3542 auto ShrinkOperand = [&](Value *V) -> Value * { 3543 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3544 if (ZI->getSrcTy() == TruncatedTy) 3545 return ZI->getOperand(0); 3546 return B.CreateZExtOrTrunc(V, TruncatedTy); 3547 }; 3548 3549 // The actual instruction modification depends on the instruction type, 3550 // unfortunately. 3551 Value *NewI = nullptr; 3552 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3553 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3554 ShrinkOperand(BO->getOperand(1))); 3555 3556 // Any wrapping introduced by shrinking this operation shouldn't be 3557 // considered undefined behavior. So, we can't unconditionally copy 3558 // arithmetic wrapping flags to NewI. 3559 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3560 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3561 NewI = 3562 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3563 ShrinkOperand(CI->getOperand(1))); 3564 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3565 NewI = B.CreateSelect(SI->getCondition(), 3566 ShrinkOperand(SI->getTrueValue()), 3567 ShrinkOperand(SI->getFalseValue())); 3568 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3569 switch (CI->getOpcode()) { 3570 default: 3571 llvm_unreachable("Unhandled cast!"); 3572 case Instruction::Trunc: 3573 NewI = ShrinkOperand(CI->getOperand(0)); 3574 break; 3575 case Instruction::SExt: 3576 NewI = B.CreateSExtOrTrunc( 3577 CI->getOperand(0), 3578 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3579 break; 3580 case Instruction::ZExt: 3581 NewI = B.CreateZExtOrTrunc( 3582 CI->getOperand(0), 3583 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3584 break; 3585 } 3586 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3587 auto Elements0 = 3588 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3589 auto *O0 = B.CreateZExtOrTrunc( 3590 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3591 auto Elements1 = 3592 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3593 auto *O1 = B.CreateZExtOrTrunc( 3594 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3595 3596 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3597 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3598 // Don't do anything with the operands, just extend the result. 
3599 continue; 3600 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3601 auto Elements = 3602 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3603 auto *O0 = B.CreateZExtOrTrunc( 3604 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3605 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3606 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3607 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3608 auto Elements = 3609 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3610 auto *O0 = B.CreateZExtOrTrunc( 3611 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3612 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3613 } else { 3614 // If we don't know what to do, be conservative and don't do anything. 3615 continue; 3616 } 3617 3618 // Lastly, extend the result. 3619 NewI->takeName(cast<Instruction>(I)); 3620 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3621 I->replaceAllUsesWith(Res); 3622 cast<Instruction>(I)->eraseFromParent(); 3623 Erased.insert(I); 3624 State.reset(Def, Res, Part); 3625 } 3626 } 3627 3628 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3629 for (const auto &KV : Cost->getMinimalBitwidths()) { 3630 // If the value wasn't vectorized, we must maintain the original scalar 3631 // type. The absence of the value from State indicates that it 3632 // wasn't vectorized. 3633 // FIXME: Should not rely on getVPValue at this point. 3634 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3635 if (!State.hasAnyVectorValue(Def)) 3636 continue; 3637 for (unsigned Part = 0; Part < UF; ++Part) { 3638 Value *I = State.get(Def, Part); 3639 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3640 if (Inst && Inst->use_empty()) { 3641 Value *NewI = Inst->getOperand(0); 3642 Inst->eraseFromParent(); 3643 State.reset(Def, NewI, Part); 3644 } 3645 } 3646 } 3647 } 3648 3649 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, 3650 VPlan &Plan) { 3651 // Insert truncates and extends for any truncated instructions as hints to 3652 // InstCombine. 3653 if (VF.isVector()) 3654 truncateToMinimalBitwidths(State); 3655 3656 // Fix widened non-induction PHIs by setting up the PHI operands. 3657 if (EnableVPlanNativePath) 3658 fixNonInductionPHIs(Plan, State); 3659 3660 // At this point every instruction in the original loop is widened to a 3661 // vector form. Now we need to fix the recurrences in the loop. These PHI 3662 // nodes are currently empty because we did not want to introduce cycles. 3663 // This is the second stage of vectorizing recurrences. 3664 fixCrossIterationPHIs(State); 3665 3666 // Forget the original basic block. 3667 PSE.getSE()->forgetLoop(OrigLoop); 3668 3669 VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); 3670 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); 3671 if (Cost->requiresScalarEpilogue(VF)) { 3672 // No edge from the middle block to the unique exit block has been inserted 3673 // and there is nothing to fix from vector loop; phis should have incoming 3674 // from scalar loop only. 3675 Plan.clearLiveOuts(); 3676 } else { 3677 // If we inserted an edge from the middle block to the unique exit block, 3678 // update uses outside the loop (phis) to account for the newly inserted 3679 // edge. 3680 3681 // Fix-up external users of the induction variables. 
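// (E.g., a sketch: for `for (i = 0; i < n; ++i) { ... } use(i);` the LCSSA phi
// for `i` in the exit block must also be given the end value computed by the
// vector loop for the case where the scalar remainder is skipped.)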
3682 for (auto &Entry : Legal->getInductionVars()) 3683 fixupIVUsers(Entry.first, Entry.second, 3684 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), 3685 IVEndValues[Entry.first], LoopMiddleBlock, 3686 VectorLoop->getHeader(), Plan); 3687 } 3688 3689 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated 3690 // in the exit block, so update the builder. 3691 State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI()); 3692 for (auto &KV : Plan.getLiveOuts()) 3693 KV.second->fixPhi(Plan, State); 3694 3695 for (Instruction *PI : PredicatedInstructions) 3696 sinkScalarOperands(&*PI); 3697 3698 // Remove redundant induction instructions. 3699 cse(VectorLoop->getHeader()); 3700 3701 // Set/update profile weights for the vector and remainder loops as original 3702 // loop iterations are now distributed among them. Note that original loop 3703 // represented by LoopScalarBody becomes remainder loop after vectorization. 3704 // 3705 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3706 // end up getting slightly roughened result but that should be OK since 3707 // profile is not inherently precise anyway. Note also possible bypass of 3708 // vector code caused by legality checks is ignored, assigning all the weight 3709 // to the vector loop, optimistically. 3710 // 3711 // For scalable vectorization we can't know at compile time how many iterations 3712 // of the loop are handled in one vector iteration, so instead assume a pessimistic 3713 // vscale of '1'. 3714 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop, 3715 LI->getLoopFor(LoopScalarBody), 3716 VF.getKnownMinValue() * UF); 3717 } 3718 3719 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 3720 // In order to support recurrences we need to be able to vectorize Phi nodes. 3721 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3722 // stage #2: We now need to fix the recurrences by adding incoming edges to 3723 // the currently empty PHI nodes. At this point every instruction in the 3724 // original loop is widened to a vector form so we can use them to construct 3725 // the incoming edges. 3726 VPBasicBlock *Header = 3727 State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); 3728 for (VPRecipeBase &R : Header->phis()) { 3729 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 3730 fixReduction(ReductionPhi, State); 3731 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 3732 fixFirstOrderRecurrence(FOR, State); 3733 } 3734 } 3735 3736 void InnerLoopVectorizer::fixFirstOrderRecurrence( 3737 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { 3738 // This is the second phase of vectorizing first-order recurrences. An 3739 // overview of the transformation is described below. Suppose we have the 3740 // following loop. 3741 // 3742 // for (int i = 0; i < n; ++i) 3743 // b[i] = a[i] - a[i - 1]; 3744 // 3745 // There is a first-order recurrence on "a". For this loop, the shorthand 3746 // scalar IR looks like: 3747 // 3748 // scalar.ph: 3749 // s_init = a[-1] 3750 // br scalar.body 3751 // 3752 // scalar.body: 3753 // i = phi [0, scalar.ph], [i+1, scalar.body] 3754 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3755 // s2 = a[i] 3756 // b[i] = s2 - s1 3757 // br cond, scalar.body, ... 3758 // 3759 // In this example, s1 is a recurrence because it's value depends on the 3760 // previous iteration. 
In the first phase of vectorization, we created a 3761 // vector phi v1 for s1. We now complete the vectorization and produce the 3762 // shorthand vector IR shown below (for VF = 4, UF = 1). 3763 // 3764 // vector.ph: 3765 // v_init = vector(..., ..., ..., a[-1]) 3766 // br vector.body 3767 // 3768 // vector.body 3769 // i = phi [0, vector.ph], [i+4, vector.body] 3770 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3771 // v2 = a[i, i+1, i+2, i+3]; 3772 // v3 = vector(v1(3), v2(0, 1, 2)) 3773 // b[i, i+1, i+2, i+3] = v2 - v3 3774 // br cond, vector.body, middle.block 3775 // 3776 // middle.block: 3777 // x = v2(3) 3778 // br scalar.ph 3779 // 3780 // scalar.ph: 3781 // s_init = phi [x, middle.block], [a[-1], otherwise] 3782 // br scalar.body 3783 // 3784 // After execution completes the vector loop, we extract the next value of 3785 // the recurrence (x) to use as the initial value in the scalar loop. 3786 3787 // Extract the last vector element in the middle block. This will be the 3788 // initial value for the recurrence when jumping to the scalar loop. 3789 VPValue *PreviousDef = PhiR->getBackedgeValue(); 3790 Value *Incoming = State.get(PreviousDef, UF - 1); 3791 auto *ExtractForScalar = Incoming; 3792 auto *IdxTy = Builder.getInt32Ty(); 3793 if (VF.isVector()) { 3794 auto *One = ConstantInt::get(IdxTy, 1); 3795 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3796 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 3797 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 3798 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 3799 "vector.recur.extract"); 3800 } 3801 // Extract the second last element in the middle block if the 3802 // Phi is used outside the loop. We need to extract the phi itself 3803 // and not the last element (the phi update in the current iteration). This 3804 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3805 // when the scalar loop is not run at all. 3806 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3807 if (VF.isVector()) { 3808 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 3809 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 3810 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3811 Incoming, Idx, "vector.recur.extract.for.phi"); 3812 } else if (UF > 1) 3813 // When loop is unrolled without vectorizing, initialize 3814 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 3815 // of `Incoming`. This is analogous to the vectorized case above: extracting 3816 // the second last element when VF > 1. 3817 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 3818 3819 // Fix the initial value of the original recurrence in the scalar loop. 3820 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3821 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 3822 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3823 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 3824 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3825 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3826 Start->addIncoming(Incoming, BB); 3827 } 3828 3829 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3830 Phi->setName("scalar.recur"); 3831 3832 // Finally, fix users of the recurrence outside the loop. The users will need 3833 // either the last value of the scalar recurrence or the last value of the 3834 // vector recurrence we extracted in the middle block. 
Since the loop is in
3835 // LCSSA form, we just need to find all the phi nodes for the original scalar
3836 // recurrence in the exit block, and then add an edge for the middle block.
3837 // Note that LCSSA does not imply single entry when the original scalar loop
3838 // had multiple exiting edges (as we always run the last iteration in the
3839 // scalar epilogue); in that case, there is no edge from middle to exit, and
3840 // thus no phis which need to be updated.
3841 if (!Cost->requiresScalarEpilogue(VF))
3842 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3843 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3844 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3845 State.Plan->removeLiveOut(&LCSSAPhi);
3846 }
3847 }
3848
3849 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3850 VPTransformState &State) {
3851 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
3852 // Get its reduction variable descriptor.
3853 assert(Legal->isReductionVariable(OrigPhi) &&
3854 "Unable to find the reduction variable");
3855 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3856
3857 RecurKind RK = RdxDesc.getRecurrenceKind();
3858 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3859 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3860 State.setDebugLocFromInst(ReductionStartValue);
3861
3862 VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3863 // This is the vector-clone of the value that leaves the loop.
3864 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3865
3866 // Wrap flags are in general invalid after vectorization, clear them.
3867 clearReductionWrapFlags(PhiR, State);
3868
3869 // Before each round, move the insertion point right between
3870 // the PHIs and the values we are going to write.
3871 // This allows us to write both PHINodes and the extractelement
3872 // instructions.
3873 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3874
3875 State.setDebugLocFromInst(LoopExitInst);
3876
3877 Type *PhiTy = OrigPhi->getType();
3878
3879 VPBasicBlock *LatchVPBB =
3880 PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
3881 BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
3882 // If tail is folded by masking, the vector value to leave the loop should be
3883 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3884 // instead of the former. For an inloop reduction the reduction will already
3885 // be predicated, and does not need to be handled here.
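// E.g. (a sketch, for VF = 4): the value leaving the loop becomes
//   %sel = select <4 x i1> %mask, <4 x i32> %rdx.next, <4 x i32> %rdx.phi
// rather than %rdx.next itself, so masked-off lanes keep the previous value.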
3886 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 3887 for (unsigned Part = 0; Part < UF; ++Part) { 3888 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 3889 SelectInst *Sel = nullptr; 3890 for (User *U : VecLoopExitInst->users()) { 3891 if (isa<SelectInst>(U)) { 3892 assert(!Sel && "Reduction exit feeding two selects"); 3893 Sel = cast<SelectInst>(U); 3894 } else 3895 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3896 } 3897 assert(Sel && "Reduction exit feeds no select"); 3898 State.reset(LoopExitInstDef, Sel, Part); 3899 3900 if (isa<FPMathOperator>(Sel)) 3901 Sel->setFastMathFlags(RdxDesc.getFastMathFlags()); 3902 3903 // If the target can create a predicated operator for the reduction at no 3904 // extra cost in the loop (for example a predicated vadd), it can be 3905 // cheaper for the select to remain in the loop than be sunk out of it, 3906 // and so use the select value for the phi instead of the old 3907 // LoopExitValue. 3908 if (PreferPredicatedReductionSelect || 3909 TTI->preferPredicatedReductionSelect( 3910 RdxDesc.getOpcode(), PhiTy, 3911 TargetTransformInfo::ReductionFlags())) { 3912 auto *VecRdxPhi = 3913 cast<PHINode>(State.get(PhiR, Part)); 3914 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel); 3915 } 3916 } 3917 } 3918 3919 // If the vector reduction can be performed in a smaller type, we truncate 3920 // then extend the loop exit value to enable InstCombine to evaluate the 3921 // entire expression in the smaller type. 3922 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 3923 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 3924 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3925 Builder.SetInsertPoint(VectorLoopLatch->getTerminator()); 3926 VectorParts RdxParts(UF); 3927 for (unsigned Part = 0; Part < UF; ++Part) { 3928 RdxParts[Part] = State.get(LoopExitInstDef, Part); 3929 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3930 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3931 : Builder.CreateZExt(Trunc, VecTy); 3932 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 3933 if (U != Trunc) { 3934 U->replaceUsesOfWith(RdxParts[Part], Extnd); 3935 RdxParts[Part] = Extnd; 3936 } 3937 } 3938 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3939 for (unsigned Part = 0; Part < UF; ++Part) { 3940 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3941 State.reset(LoopExitInstDef, RdxParts[Part], Part); 3942 } 3943 } 3944 3945 // Reduce all of the unrolled parts into a single vector. 3946 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 3947 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 3948 3949 // The middle block terminator has already been assigned a DebugLoc here (the 3950 // OrigLoop's single latch terminator). We want the whole middle block to 3951 // appear to execute on this line because: (a) it is all compiler generated, 3952 // (b) these instructions are always executed after evaluating the latch 3953 // conditional branch, and (c) other passes may add new predecessors which 3954 // terminate on this line. This is the easiest way to ensure we don't 3955 // accidentally cause an extra step back into the loop while debugging. 3956 State.setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 3957 if (PhiR->isOrdered()) 3958 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 3959 else { 3960 // Floating-point operations should have some FMF to enable the reduction. 
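// E.g. (a sketch, for UF = 2): %bin.rdx = fadd fast <4 x float> %rdx.part1,
// %rdx.part0 combines the two unrolled parts before the final
// llvm.vector.reduce.fadd is emitted in the middle block.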
3961 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3962 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3963 for (unsigned Part = 1; Part < UF; ++Part) {
3964 Value *RdxPart = State.get(LoopExitInstDef, Part);
3965 if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
3966 ReducedPartRdx = Builder.CreateBinOp(
3967 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
3968 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
3969 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
3970 ReducedPartRdx, RdxPart);
3971 else
3972 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
3973 }
3974 }
3975
3976 // Create the reduction after the loop. Note that inloop reductions create the
3977 // target reduction in the loop using a Reduction recipe.
3978 if (VF.isVector() && !PhiR->isInLoop()) {
3979 ReducedPartRdx =
3980 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
3981 // If the reduction can be performed in a smaller type, we need to extend
3982 // the reduction to the wider type before we branch to the original loop.
3983 if (PhiTy != RdxDesc.getRecurrenceType())
3984 ReducedPartRdx = RdxDesc.isSigned()
3985 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
3986 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
3987 }
3988
3989 PHINode *ResumePhi =
3990 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
3991
3992 // Create a phi node that merges control-flow from the backedge-taken check
3993 // block and the middle block.
3994 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
3995 LoopScalarPreHeader->getTerminator());
3996
3997 // If we are fixing reductions in the epilogue loop then we should already
3998 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
3999 // we carry over the incoming values correctly.
4000 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4001 if (Incoming == LoopMiddleBlock)
4002 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4003 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4004 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4005 Incoming);
4006 else
4007 BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4008 }
4009
4010 // Set the resume value for this reduction.
4011 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4012
4013 // If there were stores of the reduction value to a uniform memory address
4014 // inside the loop, create the final store here.
4015 if (StoreInst *SI = RdxDesc.IntermediateStore) {
4016 StoreInst *NewSI =
4017 Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
4018 propagateMetadata(NewSI, SI);
4019
4020 // If the reduction value is used in other places,
4021 // then let the code below create PHIs for that.
4022 }
4023
4024 // Now, we need to fix the users of the reduction variable
4025 // inside and outside of the scalar remainder loop.
4026
4027 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4028 // in the exit blocks. See comment on analogous loop in
4029 // fixFirstOrderRecurrence for a more complete explanation of the logic.
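// (Sketch: an exit-block phi such as `%sum.lcssa = phi i32 [ %sum, %for.body ]`
// gains an incoming value `[ %rdx.final, %middle.block ]` here.)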
4030 if (!Cost->requiresScalarEpilogue(VF)) 4031 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4032 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) { 4033 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4034 State.Plan->removeLiveOut(&LCSSAPhi); 4035 } 4036 4037 // Fix the scalar loop reduction variable with the incoming reduction sum 4038 // from the vector body and from the backedge value. 4039 int IncomingEdgeBlockIdx = 4040 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4041 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4042 // Pick the other block. 4043 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4044 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4045 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4046 } 4047 4048 void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, 4049 VPTransformState &State) { 4050 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 4051 RecurKind RK = RdxDesc.getRecurrenceKind(); 4052 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4053 return; 4054 4055 SmallVector<VPValue *, 8> Worklist; 4056 SmallPtrSet<VPValue *, 8> Visited; 4057 Worklist.push_back(PhiR); 4058 Visited.insert(PhiR); 4059 4060 while (!Worklist.empty()) { 4061 VPValue *Cur = Worklist.pop_back_val(); 4062 for (unsigned Part = 0; Part < UF; ++Part) { 4063 Value *V = State.get(Cur, Part); 4064 if (!isa<OverflowingBinaryOperator>(V)) 4065 break; 4066 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4067 } 4068 4069 for (VPUser *U : Cur->users()) { 4070 auto *UserRecipe = dyn_cast<VPRecipeBase>(U); 4071 if (!UserRecipe) 4072 continue; 4073 for (VPValue *V : UserRecipe->definedValues()) 4074 if (Visited.insert(V).second) 4075 Worklist.push_back(V); 4076 } 4077 } 4078 } 4079 4080 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4081 // The basic block and loop containing the predicated instruction. 4082 auto *PredBB = PredInst->getParent(); 4083 auto *VectorLoop = LI->getLoopFor(PredBB); 4084 4085 // Initialize a worklist with the operands of the predicated instruction. 4086 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4087 4088 // Holds instructions that we need to analyze again. An instruction may be 4089 // reanalyzed if we don't yet know if we can sink it or not. 4090 SmallVector<Instruction *, 8> InstsToReanalyze; 4091 4092 // Returns true if a given use occurs in the predicated block. Phi nodes use 4093 // their operands in their corresponding predecessor blocks. 4094 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4095 auto *I = cast<Instruction>(U.getUser()); 4096 BasicBlock *BB = I->getParent(); 4097 if (auto *Phi = dyn_cast<PHINode>(I)) 4098 BB = Phi->getIncomingBlock( 4099 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4100 return BB == PredBB; 4101 }; 4102 4103 // Iteratively sink the scalarized operands of the predicated instruction 4104 // into the block we created for it. When an instruction is sunk, it's 4105 // operands are then added to the worklist. The algorithm ends after one pass 4106 // through the worklist doesn't sink a single instruction. 4107 bool Changed; 4108 do { 4109 // Add the instructions that need to be reanalyzed to the worklist, and 4110 // reset the changed indicator. 
4111 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4112 InstsToReanalyze.clear(); 4113 Changed = false; 4114 4115 while (!Worklist.empty()) { 4116 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4117 4118 // We can't sink an instruction if it is a phi node, is not in the loop, 4119 // or may have side effects. 4120 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4121 I->mayHaveSideEffects()) 4122 continue; 4123 4124 // If the instruction is already in PredBB, check if we can sink its 4125 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4126 // sinking the scalar instruction I, hence it appears in PredBB; but it 4127 // may have failed to sink I's operands (recursively), which we try 4128 // (again) here. 4129 if (I->getParent() == PredBB) { 4130 Worklist.insert(I->op_begin(), I->op_end()); 4131 continue; 4132 } 4133 4134 // It's legal to sink the instruction if all its uses occur in the 4135 // predicated block. Otherwise, there's nothing to do yet, and we may 4136 // need to reanalyze the instruction. 4137 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4138 InstsToReanalyze.push_back(I); 4139 continue; 4140 } 4141 4142 // Move the instruction to the beginning of the predicated block, and add 4143 // it's operands to the worklist. 4144 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4145 Worklist.insert(I->op_begin(), I->op_end()); 4146 4147 // The sinking may have enabled other instructions to be sunk, so we will 4148 // need to iterate. 4149 Changed = true; 4150 } 4151 } while (Changed); 4152 } 4153 4154 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, 4155 VPTransformState &State) { 4156 auto Iter = depth_first( 4157 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry())); 4158 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 4159 for (VPRecipeBase &P : VPBB->phis()) { 4160 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P); 4161 if (!VPPhi) 4162 continue; 4163 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4164 // Make sure the builder has a valid insert point. 4165 Builder.SetInsertPoint(NewPhi); 4166 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4167 VPValue *Inc = VPPhi->getIncomingValue(i); 4168 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4169 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4170 } 4171 } 4172 } 4173 } 4174 4175 bool InnerLoopVectorizer::useOrderedReductions( 4176 const RecurrenceDescriptor &RdxDesc) { 4177 return Cost->useOrderedReductions(RdxDesc); 4178 } 4179 4180 /// A helper function for checking whether an integer division-related 4181 /// instruction may divide by zero (in which case it must be predicated if 4182 /// executed conditionally in the scalar code). 4183 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4184 /// Non-zero divisors that are non compile-time constants will not be 4185 /// converted into multiplication, so we will still end up scalarizing 4186 /// the division, but can do so w/o predication. 
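/// For example (editor's illustration, not part of the original comment), in
///   if (c[i] != 0)
///     r += a[i] / c[i];
/// the division only executes when the guard holds, so once the branch is
/// flattened for vectorization the divide must remain predicated (or be
/// scalarized under the guard) unless the divisor is a known non-zero
/// constant.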
4187 static bool mayDivideByZero(Instruction &I) { 4188 assert((I.getOpcode() == Instruction::UDiv || 4189 I.getOpcode() == Instruction::SDiv || 4190 I.getOpcode() == Instruction::URem || 4191 I.getOpcode() == Instruction::SRem) && 4192 "Unexpected instruction"); 4193 Value *Divisor = I.getOperand(1); 4194 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4195 return !CInt || CInt->isZero(); 4196 } 4197 4198 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4199 VPUser &ArgOperands, 4200 VPTransformState &State) { 4201 assert(!isa<DbgInfoIntrinsic>(I) && 4202 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4203 State.setDebugLocFromInst(&I); 4204 4205 Module *M = I.getParent()->getParent()->getParent(); 4206 auto *CI = cast<CallInst>(&I); 4207 4208 SmallVector<Type *, 4> Tys; 4209 for (Value *ArgOperand : CI->args()) 4210 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4211 4212 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4213 4214 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4215 // version of the instruction. 4216 // Is it beneficial to perform intrinsic call compared to lib call? 4217 bool NeedToScalarize = false; 4218 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4219 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4220 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4221 assert((UseVectorIntrinsic || !NeedToScalarize) && 4222 "Instruction should be scalarized elsewhere."); 4223 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4224 "Either the intrinsic cost or vector call cost must be valid"); 4225 4226 for (unsigned Part = 0; Part < UF; ++Part) { 4227 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4228 SmallVector<Value *, 4> Args; 4229 for (auto &I : enumerate(ArgOperands.operands())) { 4230 // Some intrinsics have a scalar argument - don't replace it with a 4231 // vector. 4232 Value *Arg; 4233 if (!UseVectorIntrinsic || 4234 !isVectorIntrinsicWithScalarOpAtArg(ID, I.index())) 4235 Arg = State.get(I.value(), Part); 4236 else 4237 Arg = State.get(I.value(), VPIteration(0, 0)); 4238 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index())) 4239 TysForDecl.push_back(Arg->getType()); 4240 Args.push_back(Arg); 4241 } 4242 4243 Function *VectorF; 4244 if (UseVectorIntrinsic) { 4245 // Use vector version of the intrinsic. 4246 if (VF.isVector()) 4247 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4248 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4249 assert(VectorF && "Can't retrieve vector intrinsic."); 4250 } else { 4251 // Use vector version of the function call. 4252 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4253 #ifndef NDEBUG 4254 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4255 "Can't create vector function."); 4256 #endif 4257 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4258 } 4259 SmallVector<OperandBundleDef, 1> OpBundles; 4260 CI->getOperandBundlesAsDefs(OpBundles); 4261 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4262 4263 if (isa<FPMathOperator>(V)) 4264 V->copyFastMathFlags(CI); 4265 4266 State.set(Def, V, Part); 4267 State.addMetadata(V, &I); 4268 } 4269 } 4270 4271 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4272 // We should not collect Scalars more than once per VF. 
Right now, this 4273 // function is called from collectUniformsAndScalars(), which already does 4274 // this check. Collecting Scalars for VF=1 does not make any sense. 4275 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4276 "This function should not be visited twice for the same VF"); 4277 4278 // This avoids any chances of creating a REPLICATE recipe during planning 4279 // since that would result in generation of scalarized code during execution, 4280 // which is not supported for scalable vectors. 4281 if (VF.isScalable()) { 4282 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4283 return; 4284 } 4285 4286 SmallSetVector<Instruction *, 8> Worklist; 4287 4288 // These sets are used to seed the analysis with pointers used by memory 4289 // accesses that will remain scalar. 4290 SmallSetVector<Instruction *, 8> ScalarPtrs; 4291 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4292 auto *Latch = TheLoop->getLoopLatch(); 4293 4294 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4295 // The pointer operands of loads and stores will be scalar as long as the 4296 // memory access is not a gather or scatter operation. The value operand of a 4297 // store will remain scalar if the store is scalarized. 4298 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4299 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4300 assert(WideningDecision != CM_Unknown && 4301 "Widening decision should be ready at this moment"); 4302 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4303 if (Ptr == Store->getValueOperand()) 4304 return WideningDecision == CM_Scalarize; 4305 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4306 "Ptr is neither a value or pointer operand"); 4307 return WideningDecision != CM_GatherScatter; 4308 }; 4309 4310 // A helper that returns true if the given value is a bitcast or 4311 // getelementptr instruction contained in the loop. 4312 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4313 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4314 isa<GetElementPtrInst>(V)) && 4315 !TheLoop->isLoopInvariant(V); 4316 }; 4317 4318 // A helper that evaluates a memory access's use of a pointer. If the use will 4319 // be a scalar use and the pointer is only used by memory accesses, we place 4320 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4321 // PossibleNonScalarPtrs. 4322 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4323 // We only care about bitcast and getelementptr instructions contained in 4324 // the loop. 4325 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4326 return; 4327 4328 // If the pointer has already been identified as scalar (e.g., if it was 4329 // also identified as uniform), there's nothing to do. 4330 auto *I = cast<Instruction>(Ptr); 4331 if (Worklist.count(I)) 4332 return; 4333 4334 // If the use of the pointer will be a scalar use, and all users of the 4335 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4336 // place the pointer in PossibleNonScalarPtrs. 
4337 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4338 return isa<LoadInst>(U) || isa<StoreInst>(U); 4339 })) 4340 ScalarPtrs.insert(I); 4341 else 4342 PossibleNonScalarPtrs.insert(I); 4343 }; 4344 4345 // We seed the scalars analysis with two classes of instructions: (1) 4346 // instructions marked uniform-after-vectorization and (2) bitcast, 4347 // getelementptr and (pointer) phi instructions used by memory accesses 4348 // requiring a scalar use. 4349 // 4350 // (1) Add to the worklist all instructions that have been identified as 4351 // uniform-after-vectorization. 4352 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4353 4354 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4355 // memory accesses requiring a scalar use. The pointer operands of loads and 4356 // stores will be scalar as long as the memory access is not a gather or 4357 // scatter operation. The value operand of a store will remain scalar if the 4358 // store is scalarized. 4359 for (auto *BB : TheLoop->blocks()) 4360 for (auto &I : *BB) { 4361 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4362 evaluatePtrUse(Load, Load->getPointerOperand()); 4363 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4364 evaluatePtrUse(Store, Store->getPointerOperand()); 4365 evaluatePtrUse(Store, Store->getValueOperand()); 4366 } 4367 } 4368 for (auto *I : ScalarPtrs) 4369 if (!PossibleNonScalarPtrs.count(I)) { 4370 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4371 Worklist.insert(I); 4372 } 4373 4374 // Insert the forced scalars. 4375 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector 4376 // induction variable when the PHI user is scalarized. 4377 auto ForcedScalar = ForcedScalars.find(VF); 4378 if (ForcedScalar != ForcedScalars.end()) 4379 for (auto *I : ForcedScalar->second) 4380 Worklist.insert(I); 4381 4382 // Expand the worklist by looking through any bitcasts and getelementptr 4383 // instructions we've already identified as scalar. This is similar to the 4384 // expansion step in collectLoopUniforms(); however, here we're only 4385 // expanding to include additional bitcasts and getelementptr instructions. 4386 unsigned Idx = 0; 4387 while (Idx != Worklist.size()) { 4388 Instruction *Dst = Worklist[Idx++]; 4389 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4390 continue; 4391 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4392 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4393 auto *J = cast<Instruction>(U); 4394 return !TheLoop->contains(J) || Worklist.count(J) || 4395 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4396 isScalarUse(J, Src)); 4397 })) { 4398 Worklist.insert(Src); 4399 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4400 } 4401 } 4402 4403 // An induction variable will remain scalar if all users of the induction 4404 // variable and induction variable update remain scalar. 4405 for (auto &Induction : Legal->getInductionVars()) { 4406 auto *Ind = Induction.first; 4407 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4408 4409 // If tail-folding is applied, the primary induction variable will be used 4410 // to feed a vector compare. 4411 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4412 continue; 4413 4414 // Returns true if \p Indvar is a pointer induction that is used directly by 4415 // load/store instruction \p I.
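    // (Editor's sketch of the intended pattern, not from the original source:
    // for a pointer induction such as
    //   for (; p != end; ++p)
    //     sum += *p;
    // the load uses 'p' directly as its address, so a scalar copy of the
    // pointer induction suffices and no widened pointer is needed.)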
4416 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4417 Instruction *I) { 4418 return Induction.second.getKind() == 4419 InductionDescriptor::IK_PtrInduction && 4420 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4421 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4422 }; 4423 4424 // Determine if all users of the induction variable are scalar after 4425 // vectorization. 4426 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4427 auto *I = cast<Instruction>(U); 4428 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4429 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4430 }); 4431 if (!ScalarInd) 4432 continue; 4433 4434 // Determine if all users of the induction variable update instruction are 4435 // scalar after vectorization. 4436 auto ScalarIndUpdate = 4437 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4438 auto *I = cast<Instruction>(U); 4439 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4440 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4441 }); 4442 if (!ScalarIndUpdate) 4443 continue; 4444 4445 // The induction variable and its update instruction will remain scalar. 4446 Worklist.insert(Ind); 4447 Worklist.insert(IndUpdate); 4448 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4449 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4450 << "\n"); 4451 } 4452 4453 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4454 } 4455 4456 bool LoopVectorizationCostModel::isScalarWithPredication( 4457 Instruction *I, ElementCount VF) const { 4458 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4459 return false; 4460 switch(I->getOpcode()) { 4461 default: 4462 break; 4463 case Instruction::Load: 4464 case Instruction::Store: { 4465 if (!Legal->isMaskRequired(I)) 4466 return false; 4467 auto *Ptr = getLoadStorePointerOperand(I); 4468 auto *Ty = getLoadStoreType(I); 4469 Type *VTy = Ty; 4470 if (VF.isVector()) 4471 VTy = VectorType::get(Ty, VF); 4472 const Align Alignment = getLoadStoreAlignment(I); 4473 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4474 TTI.isLegalMaskedGather(VTy, Alignment)) 4475 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4476 TTI.isLegalMaskedScatter(VTy, Alignment)); 4477 } 4478 case Instruction::UDiv: 4479 case Instruction::SDiv: 4480 case Instruction::SRem: 4481 case Instruction::URem: 4482 return mayDivideByZero(*I); 4483 } 4484 return false; 4485 } 4486 4487 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4488 Instruction *I, ElementCount VF) { 4489 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4490 assert(getWideningDecision(I, VF) == CM_Unknown && 4491 "Decision should not be set yet."); 4492 auto *Group = getInterleavedAccessGroup(I); 4493 assert(Group && "Must have a group."); 4494 4495 // If the instruction's allocated size doesn't equal it's type size, it 4496 // requires padding and will be scalarized. 4497 auto &DL = I->getModule()->getDataLayout(); 4498 auto *ScalarTy = getLoadStoreType(I); 4499 if (hasIrregularType(ScalarTy, DL)) 4500 return false; 4501 4502 // If the group involves a non-integral pointer, we may not be able to 4503 // losslessly cast all values to a common type. 
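    // (Editor's illustration, assumed example not taken from the source: if the
    // group interleaves a member of non-integral pointer type with an integer
    // member, e.g. loads of both fields of
    //   struct S { int *P; long X; };
    // where 'int *' is non-integral in the data layout, the members cannot be
    // bitcast to a common element type, so the checks below reject the group.)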
4504 unsigned InterleaveFactor = Group->getFactor(); 4505 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); 4506 for (unsigned i = 0; i < InterleaveFactor; i++) { 4507 Instruction *Member = Group->getMember(i); 4508 if (!Member) 4509 continue; 4510 auto *MemberTy = getLoadStoreType(Member); 4511 bool MemberNI = DL.isNonIntegralPointerType(MemberTy); 4512 // Don't coerce non-integral pointers to integers or vice versa. 4513 if (MemberNI != ScalarNI) { 4514 // TODO: Consider adding special nullptr value case here 4515 return false; 4516 } else if (MemberNI && ScalarNI && 4517 ScalarTy->getPointerAddressSpace() != 4518 MemberTy->getPointerAddressSpace()) { 4519 return false; 4520 } 4521 } 4522 4523 // Check if masking is required. 4524 // A Group may need masking for one of two reasons: it resides in a block that 4525 // needs predication, or it was decided to use masking to deal with gaps 4526 // (either a gap at the end of a load-access that may result in a speculative 4527 // load, or any gaps in a store-access). 4528 bool PredicatedAccessRequiresMasking = 4529 blockNeedsPredicationForAnyReason(I->getParent()) && 4530 Legal->isMaskRequired(I); 4531 bool LoadAccessWithGapsRequiresEpilogMasking = 4532 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4533 !isScalarEpilogueAllowed(); 4534 bool StoreAccessWithGapsRequiresMasking = 4535 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4536 if (!PredicatedAccessRequiresMasking && 4537 !LoadAccessWithGapsRequiresEpilogMasking && 4538 !StoreAccessWithGapsRequiresMasking) 4539 return true; 4540 4541 // If masked interleaving is required, we expect that the user/target had 4542 // enabled it, because otherwise it either wouldn't have been created or 4543 // it should have been invalidated by the CostModel. 4544 assert(useMaskedInterleavedAccesses(TTI) && 4545 "Masked interleave-groups for predicated accesses are not enabled."); 4546 4547 if (Group->isReverse()) 4548 return false; 4549 4550 auto *Ty = getLoadStoreType(I); 4551 const Align Alignment = getLoadStoreAlignment(I); 4552 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4553 : TTI.isLegalMaskedStore(Ty, Alignment); 4554 } 4555 4556 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4557 Instruction *I, ElementCount VF) { 4558 // Get and ensure we have a valid memory instruction. 4559 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4560 4561 auto *Ptr = getLoadStorePointerOperand(I); 4562 auto *ScalarTy = getLoadStoreType(I); 4563 4564 // In order to be widened, the pointer should be consecutive, first of all. 4565 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4566 return false; 4567 4568 // If the instruction is a store located in a predicated block, it will be 4569 // scalarized. 4570 if (isScalarWithPredication(I, VF)) 4571 return false; 4572 4573 // If the instruction's allocated size doesn't equal it's type size, it 4574 // requires padding and will be scalarized. 4575 auto &DL = I->getModule()->getDataLayout(); 4576 if (hasIrregularType(ScalarTy, DL)) 4577 return false; 4578 4579 return true; 4580 } 4581 4582 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4583 // We should not collect Uniforms more than once per VF. Right now, 4584 // this function is called from collectUniformsAndScalars(), which 4585 // already does this check. Collecting Uniforms for VF=1 does not make any 4586 // sense. 
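  // (Editor's example, not part of the original comment: in
  //   for (unsigned i = 0; i < n; ++i)
  //     a[i] = b[i];
  // the getelementptrs feeding the consecutive load and store only have their
  // lane-0 value materialized, since a single wide access is built from the
  // first lane's address; such address computations are typical members of the
  // Uniforms set computed below.)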
4587 4588 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4589 "This function should not be visited twice for the same VF"); 4590 4591 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4592 // not analyze again. Uniforms.count(VF) will return 1. 4593 Uniforms[VF].clear(); 4594 4595 // We now know that the loop is vectorizable! 4596 // Collect instructions inside the loop that will remain uniform after 4597 // vectorization. 4598 4599 // Global values, params and instructions outside of current loop are out of 4600 // scope. 4601 auto isOutOfScope = [&](Value *V) -> bool { 4602 Instruction *I = dyn_cast<Instruction>(V); 4603 return (!I || !TheLoop->contains(I)); 4604 }; 4605 4606 // Worklist containing uniform instructions demanding lane 0. 4607 SetVector<Instruction *> Worklist; 4608 BasicBlock *Latch = TheLoop->getLoopLatch(); 4609 4610 // Add uniform instructions demanding lane 0 to the worklist. Instructions 4611 // that are scalar with predication must not be considered uniform after 4612 // vectorization, because that would create an erroneous replicating region 4613 // where only a single instance out of VF should be formed. 4614 // TODO: optimize such seldom cases if found important, see PR40816. 4615 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4616 if (isOutOfScope(I)) { 4617 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 4618 << *I << "\n"); 4619 return; 4620 } 4621 if (isScalarWithPredication(I, VF)) { 4622 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4623 << *I << "\n"); 4624 return; 4625 } 4626 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4627 Worklist.insert(I); 4628 }; 4629 4630 // Start with the conditional branch. If the branch condition is an 4631 // instruction contained in the loop that is only used by the branch, it is 4632 // uniform. 4633 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4634 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4635 addToWorklistIfAllowed(Cmp); 4636 4637 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 4638 InstWidening WideningDecision = getWideningDecision(I, VF); 4639 assert(WideningDecision != CM_Unknown && 4640 "Widening decision should be ready at this moment"); 4641 4642 // A uniform memory op is itself uniform. We exclude uniform stores 4643 // here as they demand the last lane, not the first one. 4644 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 4645 assert(WideningDecision == CM_Scalarize); 4646 return true; 4647 } 4648 4649 return (WideningDecision == CM_Widen || 4650 WideningDecision == CM_Widen_Reverse || 4651 WideningDecision == CM_Interleave); 4652 }; 4653 4654 4655 // Returns true if Ptr is the pointer operand of a memory access instruction 4656 // I, and I is known to not require scalarization. 4657 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4658 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4659 }; 4660 4661 // Holds a list of values which are known to have at least one uniform use. 4662 // Note that there may be other uses which aren't uniform. A "uniform use" 4663 // here is something which only demands lane 0 of the unrolled iterations; 4664 // it does not imply that all lanes produce the same value (e.g. 
this is not 4665 // the usual meaning of uniform) 4666 SetVector<Value *> HasUniformUse; 4667 4668 // Scan the loop for instructions which are either a) known to have only 4669 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 4670 for (auto *BB : TheLoop->blocks()) 4671 for (auto &I : *BB) { 4672 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 4673 switch (II->getIntrinsicID()) { 4674 case Intrinsic::sideeffect: 4675 case Intrinsic::experimental_noalias_scope_decl: 4676 case Intrinsic::assume: 4677 case Intrinsic::lifetime_start: 4678 case Intrinsic::lifetime_end: 4679 if (TheLoop->hasLoopInvariantOperands(&I)) 4680 addToWorklistIfAllowed(&I); 4681 break; 4682 default: 4683 break; 4684 } 4685 } 4686 4687 // ExtractValue instructions must be uniform, because the operands are 4688 // known to be loop-invariant. 4689 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 4690 assert(isOutOfScope(EVI->getAggregateOperand()) && 4691 "Expected aggregate value to be loop invariant"); 4692 addToWorklistIfAllowed(EVI); 4693 continue; 4694 } 4695 4696 // If there's no pointer operand, there's nothing to do. 4697 auto *Ptr = getLoadStorePointerOperand(&I); 4698 if (!Ptr) 4699 continue; 4700 4701 // A uniform memory op is itself uniform. We exclude uniform stores 4702 // here as they demand the last lane, not the first one. 4703 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 4704 addToWorklistIfAllowed(&I); 4705 4706 if (isUniformDecision(&I, VF)) { 4707 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 4708 HasUniformUse.insert(Ptr); 4709 } 4710 } 4711 4712 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 4713 // demanding) users. Since loops are assumed to be in LCSSA form, this 4714 // disallows uses outside the loop as well. 4715 for (auto *V : HasUniformUse) { 4716 if (isOutOfScope(V)) 4717 continue; 4718 auto *I = cast<Instruction>(V); 4719 auto UsersAreMemAccesses = 4720 llvm::all_of(I->users(), [&](User *U) -> bool { 4721 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 4722 }); 4723 if (UsersAreMemAccesses) 4724 addToWorklistIfAllowed(I); 4725 } 4726 4727 // Expand Worklist in topological order: whenever a new instruction 4728 // is added , its users should be already inside Worklist. It ensures 4729 // a uniform instruction will only be used by uniform instructions. 4730 unsigned idx = 0; 4731 while (idx != Worklist.size()) { 4732 Instruction *I = Worklist[idx++]; 4733 4734 for (auto OV : I->operand_values()) { 4735 // isOutOfScope operands cannot be uniform instructions. 4736 if (isOutOfScope(OV)) 4737 continue; 4738 // First order recurrence Phi's should typically be considered 4739 // non-uniform. 4740 auto *OP = dyn_cast<PHINode>(OV); 4741 if (OP && Legal->isFirstOrderRecurrence(OP)) 4742 continue; 4743 // If all the users of the operand are uniform, then add the 4744 // operand into the uniform worklist. 4745 auto *OI = cast<Instruction>(OV); 4746 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4747 auto *J = cast<Instruction>(U); 4748 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 4749 })) 4750 addToWorklistIfAllowed(OI); 4751 } 4752 } 4753 4754 // For an instruction to be added into Worklist above, all its users inside 4755 // the loop should also be in Worklist. However, this condition cannot be 4756 // true for phi nodes that form a cyclic dependence. We must process phi 4757 // nodes separately. 
An induction variable will remain uniform if all users 4758 // of the induction variable and induction variable update remain uniform. 4759 // The code below handles both pointer and non-pointer induction variables. 4760 for (auto &Induction : Legal->getInductionVars()) { 4761 auto *Ind = Induction.first; 4762 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4763 4764 // Determine if all users of the induction variable are uniform after 4765 // vectorization. 4766 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4767 auto *I = cast<Instruction>(U); 4768 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4769 isVectorizedMemAccessUse(I, Ind); 4770 }); 4771 if (!UniformInd) 4772 continue; 4773 4774 // Determine if all users of the induction variable update instruction are 4775 // uniform after vectorization. 4776 auto UniformIndUpdate = 4777 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4778 auto *I = cast<Instruction>(U); 4779 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4780 isVectorizedMemAccessUse(I, IndUpdate); 4781 }); 4782 if (!UniformIndUpdate) 4783 continue; 4784 4785 // The induction variable and its update instruction will remain uniform. 4786 addToWorklistIfAllowed(Ind); 4787 addToWorklistIfAllowed(IndUpdate); 4788 } 4789 4790 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4791 } 4792 4793 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4794 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4795 4796 if (Legal->getRuntimePointerChecking()->Need) { 4797 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4798 "runtime pointer checks needed. Enable vectorization of this " 4799 "loop with '#pragma clang loop vectorize(enable)' when " 4800 "compiling with -Os/-Oz", 4801 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4802 return true; 4803 } 4804 4805 if (!PSE.getPredicate().isAlwaysTrue()) { 4806 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4807 "runtime SCEV checks needed. Enable vectorization of this " 4808 "loop with '#pragma clang loop vectorize(enable)' when " 4809 "compiling with -Os/-Oz", 4810 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4811 return true; 4812 } 4813 4814 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4815 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4816 reportVectorizationFailure("Runtime stride check for small trip count", 4817 "runtime stride == 1 checks needed. Enable vectorization of " 4818 "this loop without such check by compiling with -Os/-Oz", 4819 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4820 return true; 4821 } 4822 4823 return false; 4824 } 4825 4826 ElementCount 4827 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 4828 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 4829 return ElementCount::getScalable(0); 4830 4831 if (Hints->isScalableVectorizationDisabled()) { 4832 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 4833 "ScalableVectorizationDisabled", ORE, TheLoop); 4834 return ElementCount::getScalable(0); 4835 } 4836 4837 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 4838 4839 auto MaxScalableVF = ElementCount::getScalable( 4840 std::numeric_limits<ElementCount::ScalarTy>::max()); 4841 4842 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 
4843 // FIXME: While for scalable vectors this is currently sufficient, this should 4844 // be replaced by a more detailed mechanism that filters out specific VFs, 4845 // instead of invalidating vectorization for a whole set of VFs based on the 4846 // MaxVF. 4847 4848 // Disable scalable vectorization if the loop contains unsupported reductions. 4849 if (!canVectorizeReductions(MaxScalableVF)) { 4850 reportVectorizationInfo( 4851 "Scalable vectorization not supported for the reduction " 4852 "operations found in this loop.", 4853 "ScalableVFUnfeasible", ORE, TheLoop); 4854 return ElementCount::getScalable(0); 4855 } 4856 4857 // Disable scalable vectorization if the loop contains any instructions 4858 // with element types not supported for scalable vectors. 4859 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 4860 return !Ty->isVoidTy() && 4861 !this->TTI.isElementTypeLegalForScalableVector(Ty); 4862 })) { 4863 reportVectorizationInfo("Scalable vectorization is not supported " 4864 "for all element types found in this loop.", 4865 "ScalableVFUnfeasible", ORE, TheLoop); 4866 return ElementCount::getScalable(0); 4867 } 4868 4869 if (Legal->isSafeForAnyVectorWidth()) 4870 return MaxScalableVF; 4871 4872 // Limit MaxScalableVF by the maximum safe dependence distance. 4873 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 4874 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 4875 MaxVScale = 4876 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 4877 MaxScalableVF = ElementCount::getScalable( 4878 MaxVScale ? (MaxSafeElements / MaxVScale.value()) : 0); 4879 if (!MaxScalableVF) 4880 reportVectorizationInfo( 4881 "Max legal vector width too small, scalable vectorization " 4882 "unfeasible.", 4883 "ScalableVFUnfeasible", ORE, TheLoop); 4884 4885 return MaxScalableVF; 4886 } 4887 4888 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 4889 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 4890 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4891 unsigned SmallestType, WidestType; 4892 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4893 4894 // Get the maximum safe dependence distance in bits computed by LAA. 4895 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4896 // the memory accesses that is most restrictive (involved in the smallest 4897 // dependence distance). 4898 unsigned MaxSafeElements = 4899 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 4900 4901 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 4902 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 4903 4904 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 4905 << ".\n"); 4906 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 4907 << ".\n"); 4908 4909 // First analyze the UserVF, fall back if the UserVF should be ignored. 4910 if (UserVF) { 4911 auto MaxSafeUserVF = 4912 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 4913 4914 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 4915 // If `VF=vscale x N` is safe, then so is `VF=N` 4916 if (UserVF.isScalable()) 4917 return FixedScalableVFPair( 4918 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 4919 else 4920 return UserVF; 4921 } 4922 4923 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 4924 4925 // Only clamp if the UserVF is not scalable. 
If the UserVF is scalable, it 4926 // is better to ignore the hint and let the compiler choose a suitable VF. 4927 if (!UserVF.isScalable()) { 4928 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4929 << " is unsafe, clamping to max safe VF=" 4930 << MaxSafeFixedVF << ".\n"); 4931 ORE->emit([&]() { 4932 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4933 TheLoop->getStartLoc(), 4934 TheLoop->getHeader()) 4935 << "User-specified vectorization factor " 4936 << ore::NV("UserVectorizationFactor", UserVF) 4937 << " is unsafe, clamping to maximum safe vectorization factor " 4938 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 4939 }); 4940 return MaxSafeFixedVF; 4941 } 4942 4943 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 4944 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4945 << " is ignored because scalable vectors are not " 4946 "available.\n"); 4947 ORE->emit([&]() { 4948 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4949 TheLoop->getStartLoc(), 4950 TheLoop->getHeader()) 4951 << "User-specified vectorization factor " 4952 << ore::NV("UserVectorizationFactor", UserVF) 4953 << " is ignored because the target does not support scalable " 4954 "vectors. The compiler will pick a more suitable value."; 4955 }); 4956 } else { 4957 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4958 << " is unsafe. Ignoring scalable UserVF.\n"); 4959 ORE->emit([&]() { 4960 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4961 TheLoop->getStartLoc(), 4962 TheLoop->getHeader()) 4963 << "User-specified vectorization factor " 4964 << ore::NV("UserVectorizationFactor", UserVF) 4965 << " is unsafe. Ignoring the hint to let the compiler pick a " 4966 "more suitable value."; 4967 }); 4968 } 4969 } 4970 4971 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 4972 << " / " << WidestType << " bits.\n"); 4973 4974 FixedScalableVFPair Result(ElementCount::getFixed(1), 4975 ElementCount::getScalable(0)); 4976 if (auto MaxVF = 4977 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 4978 MaxSafeFixedVF, FoldTailByMasking)) 4979 Result.FixedVF = MaxVF; 4980 4981 if (auto MaxVF = 4982 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 4983 MaxSafeScalableVF, FoldTailByMasking)) 4984 if (MaxVF.isScalable()) { 4985 Result.ScalableVF = MaxVF; 4986 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 4987 << "\n"); 4988 } 4989 4990 return Result; 4991 } 4992 4993 FixedScalableVFPair 4994 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 4995 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4996 // TODO: It may by useful to do since it's still likely to be dynamically 4997 // uniform if the target can skip. 4998 reportVectorizationFailure( 4999 "Not inserting runtime ptr check for divergent target", 5000 "runtime pointer checks needed. 
Not enabled for divergent target", 5001 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5002 return FixedScalableVFPair::getNone(); 5003 } 5004 5005 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5006 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5007 if (TC == 1) { 5008 reportVectorizationFailure("Single iteration (non) loop", 5009 "loop trip count is one, irrelevant for vectorization", 5010 "SingleIterationLoop", ORE, TheLoop); 5011 return FixedScalableVFPair::getNone(); 5012 } 5013 5014 switch (ScalarEpilogueStatus) { 5015 case CM_ScalarEpilogueAllowed: 5016 return computeFeasibleMaxVF(TC, UserVF, false); 5017 case CM_ScalarEpilogueNotAllowedUsePredicate: 5018 LLVM_FALLTHROUGH; 5019 case CM_ScalarEpilogueNotNeededUsePredicate: 5020 LLVM_DEBUG( 5021 dbgs() << "LV: vector predicate hint/switch found.\n" 5022 << "LV: Not allowing scalar epilogue, creating predicated " 5023 << "vector loop.\n"); 5024 break; 5025 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5026 // fallthrough as a special case of OptForSize 5027 case CM_ScalarEpilogueNotAllowedOptSize: 5028 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5029 LLVM_DEBUG( 5030 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5031 else 5032 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5033 << "count.\n"); 5034 5035 // Bail if runtime checks are required, which are not good when optimising 5036 // for size. 5037 if (runtimeChecksRequired()) 5038 return FixedScalableVFPair::getNone(); 5039 5040 break; 5041 } 5042 5043 // The only loops we can vectorize without a scalar epilogue are loops with 5044 // a bottom-test and a single exiting block. We'd have to handle the fact 5045 // that not every instruction executes on the last iteration. This will 5046 // require a lane mask which varies through the vector loop body. (TODO) 5047 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5048 // If there was a tail-folding hint/switch, but we can't fold the tail by 5049 // masking, fallback to a vectorization with a scalar epilogue. 5050 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5051 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5052 "scalar epilogue instead.\n"); 5053 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5054 return computeFeasibleMaxVF(TC, UserVF, false); 5055 } 5056 return FixedScalableVFPair::getNone(); 5057 } 5058 5059 // Now try the tail folding 5060 5061 // Invalidate interleave groups that require an epilogue if we can't mask 5062 // the interleave-group. 5063 if (!useMaskedInterleavedAccesses(TTI)) { 5064 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5065 "No decisions should have been taken at this point"); 5066 // Note: There is no need to invalidate any cost modeling decisions here, as 5067 // none were taken so far. 5068 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5069 } 5070 5071 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true); 5072 // Avoid tail folding if the trip count is known to be a multiple of any VF 5073 // we chose. 5074 // FIXME: The condition below pessimises the case for fixed-width vectors, 5075 // when scalable VFs are also candidates for vectorization.
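  // (Editor's worked example, not from the original source: with a constant
  // trip count of 64, MaxFixedVF == 8 and no user-specified interleave count,
  // MaxVFtimesIC is 8 and (BackedgeTakenCount + 1) urem 8 == 64 % 8 == 0, so
  // no tail remains and tail folding is skipped by the early return below.)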
5076 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5077 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5078 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5079 "MaxFixedVF must be a power of 2"); 5080 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5081 : MaxFixedVF.getFixedValue(); 5082 ScalarEvolution *SE = PSE.getSE(); 5083 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5084 const SCEV *ExitCount = SE->getAddExpr( 5085 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5086 const SCEV *Rem = SE->getURemExpr( 5087 SE->applyLoopGuards(ExitCount, TheLoop), 5088 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5089 if (Rem->isZero()) { 5090 // Accept MaxFixedVF if we do not have a tail. 5091 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5092 return MaxFactors; 5093 } 5094 } 5095 5096 // If we don't know the precise trip count, or if the trip count that we 5097 // found modulo the vectorization factor is not zero, try to fold the tail 5098 // by masking. 5099 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5100 if (Legal->prepareToFoldTailByMasking()) { 5101 FoldTailByMasking = true; 5102 return MaxFactors; 5103 } 5104 5105 // If there was a tail-folding hint/switch, but we can't fold the tail by 5106 // masking, fallback to a vectorization with a scalar epilogue. 5107 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5108 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5109 "scalar epilogue instead.\n"); 5110 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5111 return MaxFactors; 5112 } 5113 5114 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5115 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5116 return FixedScalableVFPair::getNone(); 5117 } 5118 5119 if (TC == 0) { 5120 reportVectorizationFailure( 5121 "Unable to calculate the loop count due to complex control flow", 5122 "unable to calculate the loop count due to complex control flow", 5123 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5124 return FixedScalableVFPair::getNone(); 5125 } 5126 5127 reportVectorizationFailure( 5128 "Cannot optimize for size and vectorize at the same time.", 5129 "cannot optimize for size and vectorize at the same time. " 5130 "Enable vectorization of this loop with '#pragma clang loop " 5131 "vectorize(enable)' when compiling with -Os/-Oz", 5132 "NoTailLoopWithOptForSize", ORE, TheLoop); 5133 return FixedScalableVFPair::getNone(); 5134 } 5135 5136 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5137 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5138 ElementCount MaxSafeVF, bool FoldTailByMasking) { 5139 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5140 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5141 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5142 : TargetTransformInfo::RGK_FixedWidthVector); 5143 5144 // Convenience function to return the minimum of two ElementCounts. 5145 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5146 assert((LHS.isScalable() == RHS.isScalable()) && 5147 "Scalable flags must match"); 5148 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5149 }; 5150 5151 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5152 // Note that both WidestRegister and WidestType may not be a powers of 2. 
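  // (Editor's worked example, not from the original source: assuming a 128-bit
  // vector register and a widest loop type of i32, PowerOf2Floor(128 / 32)
  // yields 4 lanes; for a hypothetical 96-bit register the floor would clamp
  // 96 / 32 == 3 down to the power of two 2.)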
5153 auto MaxVectorElementCount = ElementCount::get( 5154 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5155 ComputeScalableMaxVF); 5156 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5157 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5158 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5159 5160 if (!MaxVectorElementCount) { 5161 LLVM_DEBUG(dbgs() << "LV: The target has no " 5162 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5163 << " vector registers.\n"); 5164 return ElementCount::getFixed(1); 5165 } 5166 5167 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5168 if (ConstTripCount && 5169 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5170 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5171 // If loop trip count (TC) is known at compile time there is no point in 5172 // choosing VF greater than TC (as done in the loop below). Select maximum 5173 // power of two which doesn't exceed TC. 5174 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5175 // when the TC is less than or equal to the known number of lanes. 5176 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5177 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5178 "exceeding the constant trip count: " 5179 << ClampedConstTripCount << "\n"); 5180 return ElementCount::getFixed(ClampedConstTripCount); 5181 } 5182 5183 TargetTransformInfo::RegisterKind RegKind = 5184 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5185 : TargetTransformInfo::RGK_FixedWidthVector; 5186 ElementCount MaxVF = MaxVectorElementCount; 5187 if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && 5188 TTI.shouldMaximizeVectorBandwidth(RegKind))) { 5189 auto MaxVectorElementCountMaxBW = ElementCount::get( 5190 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5191 ComputeScalableMaxVF); 5192 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5193 5194 // Collect all viable vectorization factors larger than the default MaxVF 5195 // (i.e. MaxVectorElementCount). 5196 SmallVector<ElementCount, 8> VFs; 5197 for (ElementCount VS = MaxVectorElementCount * 2; 5198 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5199 VFs.push_back(VS); 5200 5201 // For each VF calculate its register usage. 5202 auto RUs = calculateRegisterUsage(VFs); 5203 5204 // Select the largest VF which doesn't require more registers than existing 5205 // ones. 5206 for (int i = RUs.size() - 1; i >= 0; --i) { 5207 bool Selected = true; 5208 for (auto &pair : RUs[i].MaxLocalUsers) { 5209 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5210 if (pair.second > TargetNumRegisters) 5211 Selected = false; 5212 } 5213 if (Selected) { 5214 MaxVF = VFs[i]; 5215 break; 5216 } 5217 } 5218 if (ElementCount MinVF = 5219 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5220 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5221 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5222 << ") with target's minimum: " << MinVF << '\n'); 5223 MaxVF = MinVF; 5224 } 5225 } 5226 5227 // Invalidate any widening decisions we might have made, in case the loop 5228 // requires prediction (decided later), but we have already made some 5229 // load/store widening decisions. 
5230 invalidateCostModelingDecisions(); 5231 } 5232 return MaxVF; 5233 } 5234 5235 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { 5236 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5237 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5238 auto Min = Attr.getVScaleRangeMin(); 5239 auto Max = Attr.getVScaleRangeMax(); 5240 if (Max && Min == Max) 5241 return Max; 5242 } 5243 5244 return TTI.getVScaleForTuning(); 5245 } 5246 5247 bool LoopVectorizationCostModel::isMoreProfitable( 5248 const VectorizationFactor &A, const VectorizationFactor &B) const { 5249 InstructionCost CostA = A.Cost; 5250 InstructionCost CostB = B.Cost; 5251 5252 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5253 5254 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5255 MaxTripCount) { 5256 // If we are folding the tail and the trip count is a known (possibly small) 5257 // constant, the trip count will be rounded up to an integer number of 5258 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5259 // which we compare directly. When not folding the tail, the total cost will 5260 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5261 // approximated with the per-lane cost below instead of using the tripcount 5262 // as here. 5263 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5264 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5265 return RTCostA < RTCostB; 5266 } 5267 5268 // Improve estimate for the vector width if it is scalable. 5269 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5270 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5271 if (Optional<unsigned> VScale = getVScaleForTuning()) { 5272 if (A.Width.isScalable()) 5273 EstimatedWidthA *= VScale.value(); 5274 if (B.Width.isScalable()) 5275 EstimatedWidthB *= VScale.value(); 5276 } 5277 5278 // Assume vscale may be larger than 1 (or the value being tuned for), 5279 // so that scalable vectorization is slightly favorable over fixed-width 5280 // vectorization. 5281 if (A.Width.isScalable() && !B.Width.isScalable()) 5282 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5283 5284 // To avoid the need for FP division: 5285 // (CostA / A.Width) < (CostB / B.Width) 5286 // <=> (CostA * B.Width) < (CostB * A.Width) 5287 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5288 } 5289 5290 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5291 const ElementCountSet &VFCandidates) { 5292 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5293 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5294 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5295 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5296 "Expected Scalar VF to be a candidate"); 5297 5298 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, 5299 ExpectedCost); 5300 VectorizationFactor ChosenFactor = ScalarCost; 5301 5302 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5303 if (ForceVectorization && VFCandidates.size() > 1) { 5304 // Ignore scalar width, because the user explicitly wants vectorization. 5305 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5306 // evaluation. 
5307 ChosenFactor.Cost = InstructionCost::getMax(); 5308 } 5309 5310 SmallVector<InstructionVFPair> InvalidCosts; 5311 for (const auto &i : VFCandidates) { 5312 // The cost for scalar VF=1 is already calculated, so ignore it. 5313 if (i.isScalar()) 5314 continue; 5315 5316 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5317 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); 5318 5319 #ifndef NDEBUG 5320 unsigned AssumedMinimumVscale = 1; 5321 if (Optional<unsigned> VScale = getVScaleForTuning()) 5322 AssumedMinimumVscale = *VScale; 5323 unsigned Width = 5324 Candidate.Width.isScalable() 5325 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5326 : Candidate.Width.getFixedValue(); 5327 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5328 << " costs: " << (Candidate.Cost / Width)); 5329 if (i.isScalable()) 5330 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5331 << AssumedMinimumVscale << ")"); 5332 LLVM_DEBUG(dbgs() << ".\n"); 5333 #endif 5334 5335 if (!C.second && !ForceVectorization) { 5336 LLVM_DEBUG( 5337 dbgs() << "LV: Not considering vector loop of width " << i 5338 << " because it will not generate any vector instructions.\n"); 5339 continue; 5340 } 5341 5342 // If profitable add it to ProfitableVF list. 5343 if (isMoreProfitable(Candidate, ScalarCost)) 5344 ProfitableVFs.push_back(Candidate); 5345 5346 if (isMoreProfitable(Candidate, ChosenFactor)) 5347 ChosenFactor = Candidate; 5348 } 5349 5350 // Emit a report of VFs with invalid costs in the loop. 5351 if (!InvalidCosts.empty()) { 5352 // Group the remarks per instruction, keeping the instruction order from 5353 // InvalidCosts. 5354 std::map<Instruction *, unsigned> Numbering; 5355 unsigned I = 0; 5356 for (auto &Pair : InvalidCosts) 5357 if (!Numbering.count(Pair.first)) 5358 Numbering[Pair.first] = I++; 5359 5360 // Sort the list, first on instruction(number) then on VF. 5361 llvm::sort(InvalidCosts, 5362 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5363 if (Numbering[A.first] != Numbering[B.first]) 5364 return Numbering[A.first] < Numbering[B.first]; 5365 ElementCountComparator ECC; 5366 return ECC(A.second, B.second); 5367 }); 5368 5369 // For a list of ordered instruction-vf pairs: 5370 // [(load, vf1), (load, vf2), (store, vf1)] 5371 // Group the instructions together to emit separate remarks for: 5372 // load (vf1, vf2) 5373 // store (vf1) 5374 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5375 auto Subset = ArrayRef<InstructionVFPair>(); 5376 do { 5377 if (Subset.empty()) 5378 Subset = Tail.take_front(1); 5379 5380 Instruction *I = Subset.front().first; 5381 5382 // If the next instruction is different, or if there are no other pairs, 5383 // emit a remark for the collated subset. e.g. 5384 // [(load, vf1), (load, vf2))] 5385 // to emit: 5386 // remark: invalid costs for 'load' at VF=(vf, vf2) 5387 if (Subset == Tail || Tail[Subset.size()].first != I) { 5388 std::string OutString; 5389 raw_string_ostream OS(OutString); 5390 assert(!Subset.empty() && "Unexpected empty range"); 5391 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5392 for (auto &Pair : Subset) 5393 OS << (Pair.second == Subset.front().second ? 
"" : ", ") 5394 << Pair.second; 5395 OS << "):"; 5396 if (auto *CI = dyn_cast<CallInst>(I)) 5397 OS << " call to " << CI->getCalledFunction()->getName(); 5398 else 5399 OS << " " << I->getOpcodeName(); 5400 OS.flush(); 5401 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5402 Tail = Tail.drop_front(Subset.size()); 5403 Subset = {}; 5404 } else 5405 // Grow the subset by one element 5406 Subset = Tail.take_front(Subset.size() + 1); 5407 } while (!Tail.empty()); 5408 } 5409 5410 if (!EnableCondStoresVectorization && NumPredStores) { 5411 reportVectorizationFailure("There are conditional stores.", 5412 "store that is conditionally executed prevents vectorization", 5413 "ConditionalStore", ORE, TheLoop); 5414 ChosenFactor = ScalarCost; 5415 } 5416 5417 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5418 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs() 5419 << "LV: Vectorization seems to be not beneficial, " 5420 << "but was forced by a user.\n"); 5421 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5422 return ChosenFactor; 5423 } 5424 5425 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5426 const Loop &L, ElementCount VF) const { 5427 // Cross iteration phis such as reductions need special handling and are 5428 // currently unsupported. 5429 if (any_of(L.getHeader()->phis(), 5430 [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); })) 5431 return false; 5432 5433 // Phis with uses outside of the loop require special handling and are 5434 // currently unsupported. 5435 for (auto &Entry : Legal->getInductionVars()) { 5436 // Look for uses of the value of the induction at the last iteration. 5437 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5438 for (User *U : PostInc->users()) 5439 if (!L.contains(cast<Instruction>(U))) 5440 return false; 5441 // Look for uses of penultimate value of the induction. 5442 for (User *U : Entry.first->users()) 5443 if (!L.contains(cast<Instruction>(U))) 5444 return false; 5445 } 5446 5447 // Induction variables that are widened require special handling that is 5448 // currently not supported. 5449 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5450 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5451 this->isProfitableToScalarize(Entry.first, VF)); 5452 })) 5453 return false; 5454 5455 // Epilogue vectorization code has not been auditted to ensure it handles 5456 // non-latch exits properly. It may be fine, but it needs auditted and 5457 // tested. 5458 if (L.getExitingBlock() != L.getLoopLatch()) 5459 return false; 5460 5461 return true; 5462 } 5463 5464 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5465 const ElementCount VF) const { 5466 // FIXME: We need a much better cost-model to take different parameters such 5467 // as register pressure, code size increase and cost of extra branches into 5468 // account. For now we apply a very crude heuristic and only consider loops 5469 // with vectorization factors larger than a certain value. 5470 // We also consider epilogue vectorization unprofitable for targets that don't 5471 // consider interleaving beneficial (eg. MVE). 5472 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5473 return false; 5474 // FIXME: We should consider changing the threshold for scalable 5475 // vectors to take VScaleForTuning into account. 
5476 if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF) 5477 return true; 5478 return false; 5479 } 5480 5481 VectorizationFactor 5482 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5483 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5484 VectorizationFactor Result = VectorizationFactor::Disabled(); 5485 if (!EnableEpilogueVectorization) { 5486 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5487 return Result; 5488 } 5489 5490 if (!isScalarEpilogueAllowed()) { 5491 LLVM_DEBUG( 5492 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5493 "allowed.\n";); 5494 return Result; 5495 } 5496 5497 // Not really a cost consideration, but check for unsupported cases here to 5498 // simplify the logic. 5499 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5500 LLVM_DEBUG( 5501 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5502 "not a supported candidate.\n";); 5503 return Result; 5504 } 5505 5506 if (EpilogueVectorizationForceVF > 1) { 5507 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5508 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5509 if (LVP.hasPlanWithVF(ForcedEC)) 5510 return {ForcedEC, 0, 0}; 5511 else { 5512 LLVM_DEBUG( 5513 dbgs() 5514 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5515 return Result; 5516 } 5517 } 5518 5519 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5520 TheLoop->getHeader()->getParent()->hasMinSize()) { 5521 LLVM_DEBUG( 5522 dbgs() 5523 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5524 return Result; 5525 } 5526 5527 if (!isEpilogueVectorizationProfitable(MainLoopVF)) { 5528 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5529 "this loop\n"); 5530 return Result; 5531 } 5532 5533 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 5534 // the main loop handles 8 lanes per iteration. We could still benefit from 5535 // vectorizing the epilogue loop with VF=4. 5536 ElementCount EstimatedRuntimeVF = MainLoopVF; 5537 if (MainLoopVF.isScalable()) { 5538 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5539 if (Optional<unsigned> VScale = getVScaleForTuning()) 5540 EstimatedRuntimeVF *= *VScale; 5541 } 5542 5543 for (auto &NextVF : ProfitableVFs) 5544 if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5545 ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || 5546 ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && 5547 (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && 5548 LVP.hasPlanWithVF(NextVF.Width)) 5549 Result = NextVF; 5550 5551 if (Result != VectorizationFactor::Disabled()) 5552 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5553 << Result.Width << "\n";); 5554 return Result; 5555 } 5556 5557 std::pair<unsigned, unsigned> 5558 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5559 unsigned MinWidth = -1U; 5560 unsigned MaxWidth = 8; 5561 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5562 // For in-loop reductions, no element types are added to ElementTypesInLoop 5563 // if there are no loads/stores in the loop. In this case, check through the 5564 // reduction variables to determine the maximum width. 5565 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5566 // Reset MaxWidth so that we can find the smallest type used by recurrences 5567 // in the loop. 
5568 MaxWidth = -1U; 5569 for (auto &PhiDescriptorPair : Legal->getReductionVars()) { 5570 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5571 // When finding the min width used by the recurrence we need to account 5572 // for casts on the input operands of the recurrence. 5573 MaxWidth = std::min<unsigned>( 5574 MaxWidth, std::min<unsigned>( 5575 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5576 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5577 } 5578 } else { 5579 for (Type *T : ElementTypesInLoop) { 5580 MinWidth = std::min<unsigned>( 5581 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5582 MaxWidth = std::max<unsigned>( 5583 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5584 } 5585 } 5586 return {MinWidth, MaxWidth}; 5587 } 5588 5589 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5590 ElementTypesInLoop.clear(); 5591 // For each block. 5592 for (BasicBlock *BB : TheLoop->blocks()) { 5593 // For each instruction in the loop. 5594 for (Instruction &I : BB->instructionsWithoutDebug()) { 5595 Type *T = I.getType(); 5596 5597 // Skip ignored values. 5598 if (ValuesToIgnore.count(&I)) 5599 continue; 5600 5601 // Only examine Loads, Stores and PHINodes. 5602 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5603 continue; 5604 5605 // Examine PHI nodes that are reduction variables. Update the type to 5606 // account for the recurrence type. 5607 if (auto *PN = dyn_cast<PHINode>(&I)) { 5608 if (!Legal->isReductionVariable(PN)) 5609 continue; 5610 const RecurrenceDescriptor &RdxDesc = 5611 Legal->getReductionVars().find(PN)->second; 5612 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5613 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5614 RdxDesc.getRecurrenceType(), 5615 TargetTransformInfo::ReductionFlags())) 5616 continue; 5617 T = RdxDesc.getRecurrenceType(); 5618 } 5619 5620 // Examine the stored values. 5621 if (auto *ST = dyn_cast<StoreInst>(&I)) 5622 T = ST->getValueOperand()->getType(); 5623 5624 assert(T->isSized() && 5625 "Expected the load/store/recurrence type to be sized"); 5626 5627 ElementTypesInLoop.insert(T); 5628 } 5629 } 5630 } 5631 5632 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5633 unsigned LoopCost) { 5634 // -- The interleave heuristics -- 5635 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5636 // There are many micro-architectural considerations that we can't predict 5637 // at this level. For example, frontend pressure (on decode or fetch) due to 5638 // code size, or the number and capabilities of the execution ports. 5639 // 5640 // We use the following heuristics to select the interleave count: 5641 // 1. If the code has reductions, then we interleave to break the cross 5642 // iteration dependency. 5643 // 2. If the loop is really small, then we interleave to reduce the loop 5644 // overhead. 5645 // 3. We don't interleave if we think that we will spill registers to memory 5646 // due to the increased register pressure. 5647 5648 if (!isScalarEpilogueAllowed()) 5649 return 1; 5650 5651 // We used the distance for the interleave count. 5652 if (Legal->getMaxSafeDepDistBytes() != -1U) 5653 return 1; 5654 5655 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5656 const bool HasReductions = !Legal->getReductionVars().empty(); 5657 // Do not interleave loops with a relatively small known or estimated trip 5658 // count. 
But we will interleave when InterleaveSmallLoopScalarReduction is 5659 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5660 // because with the above conditions interleaving can expose ILP and break 5661 // cross iteration dependences for reductions. 5662 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5663 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5664 return 1; 5665 5666 // If we did not calculate the cost for VF (because the user selected the VF) 5667 // then we calculate the cost of VF here. 5668 if (LoopCost == 0) { 5669 InstructionCost C = expectedCost(VF).first; 5670 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 5671 LoopCost = *C.getValue(); 5672 5673 // Loop body is free and there is no need for interleaving. 5674 if (LoopCost == 0) 5675 return 1; 5676 } 5677 5678 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5679 // We divide by these constants so assume that we have at least one 5680 // instruction that uses at least one register. 5681 for (auto& pair : R.MaxLocalUsers) { 5682 pair.second = std::max(pair.second, 1U); 5683 } 5684 5685 // We calculate the interleave count using the following formula. 5686 // Subtract the number of loop invariants from the number of available 5687 // registers. These registers are used by all of the interleaved instances. 5688 // Next, divide the remaining registers by the number of registers that is 5689 // required by the loop, in order to estimate how many parallel instances 5690 // fit without causing spills. All of this is rounded down if necessary to be 5691 // a power of two. We want power of two interleave count to simplify any 5692 // addressing operations or alignment considerations. 5693 // We also want power of two interleave counts to ensure that the induction 5694 // variable of the vector loop wraps to zero, when tail is folded by masking; 5695 // this currently happens when OptForSize, in which case IC is set to 1 above. 5696 unsigned IC = UINT_MAX; 5697 5698 for (auto& pair : R.MaxLocalUsers) { 5699 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5700 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5701 << " registers of " 5702 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5703 if (VF.isScalar()) { 5704 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5705 TargetNumRegisters = ForceTargetNumScalarRegs; 5706 } else { 5707 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5708 TargetNumRegisters = ForceTargetNumVectorRegs; 5709 } 5710 unsigned MaxLocalUsers = pair.second; 5711 unsigned LoopInvariantRegs = 0; 5712 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5713 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5714 5715 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5716 // Don't count the induction variable as interleaved. 5717 if (EnableIndVarRegisterHeur) { 5718 TmpIC = 5719 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5720 std::max(1U, (MaxLocalUsers - 1))); 5721 } 5722 5723 IC = std::min(IC, TmpIC); 5724 } 5725 5726 // Clamp the interleave ranges to reasonable counts. 5727 unsigned MaxInterleaveCount = 5728 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 5729 5730 // Check if the user has overridden the max. 
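// Illustration of the register-pressure formula above (hypothetical numbers,
// not taken from any particular target): with 32 vector registers, 2
// loop-invariant values and a maximum of 5 simultaneously live values in the
// loop, the induction-variable-aware variant computes
//   PowerOf2Floor((32 - 2 - 1) / max(1, 5 - 1)) = PowerOf2Floor(7) = 4,
// i.e. roughly four interleaved copies are expected to fit without spilling.
// That candidate IC is still subject to the user overrides and trip-count
// clamping applied next.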
5731 if (VF.isScalar()) { 5732 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5733 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5734 } else { 5735 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5736 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5737 } 5738 5739 // If trip count is known or estimated compile time constant, limit the 5740 // interleave count to be less than the trip count divided by VF, provided it 5741 // is at least 1. 5742 // 5743 // For scalable vectors we can't know if interleaving is beneficial. It may 5744 // not be beneficial for small loops if none of the lanes in the second vector 5745 // iterations is enabled. However, for larger loops, there is likely to be a 5746 // similar benefit as for fixed-width vectors. For now, we choose to leave 5747 // the InterleaveCount as if vscale is '1', although if some information about 5748 // the vector is known (e.g. min vector size), we can make a better decision. 5749 if (BestKnownTC) { 5750 MaxInterleaveCount = 5751 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5752 // Make sure MaxInterleaveCount is greater than 0. 5753 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 5754 } 5755 5756 assert(MaxInterleaveCount > 0 && 5757 "Maximum interleave count must be greater than 0"); 5758 5759 // Clamp the calculated IC to be between the 1 and the max interleave count 5760 // that the target and trip count allows. 5761 if (IC > MaxInterleaveCount) 5762 IC = MaxInterleaveCount; 5763 else 5764 // Make sure IC is greater than 0. 5765 IC = std::max(1u, IC); 5766 5767 assert(IC > 0 && "Interleave count must be greater than 0."); 5768 5769 // Interleave if we vectorized this loop and there is a reduction that could 5770 // benefit from interleaving. 5771 if (VF.isVector() && HasReductions) { 5772 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5773 return IC; 5774 } 5775 5776 // For any scalar loop that either requires runtime checks or predication we 5777 // are better off leaving this to the unroller. Note that if we've already 5778 // vectorized the loop we will have done the runtime check and so interleaving 5779 // won't require further checks. 5780 bool ScalarInterleavingRequiresPredication = 5781 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5782 return Legal->blockNeedsPredication(BB); 5783 })); 5784 bool ScalarInterleavingRequiresRuntimePointerCheck = 5785 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5786 5787 // We want to interleave small loops in order to reduce the loop overhead and 5788 // potentially expose ILP opportunities. 5789 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5790 << "LV: IC is " << IC << '\n' 5791 << "LV: VF is " << VF << '\n'); 5792 const bool AggressivelyInterleaveReductions = 5793 TTI.enableAggressiveInterleaving(HasReductions); 5794 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5795 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5796 // We assume that the cost overhead is 1 and we use the cost model 5797 // to estimate the cost of the loop and interleave until the cost of the 5798 // loop overhead is about 5% of the cost of the loop. 5799 unsigned SmallIC = 5800 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5801 5802 // Interleave until store/load ports (estimated by max interleave count) are 5803 // saturated. 
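// Illustration (hypothetical costs): supposing SmallLoopCost is 20 and the
// loop body cost is 4, SmallIC above becomes min(IC, PowerOf2Floor(20 / 4))
// = min(IC, 4). If the loop has 2 stores, 1 load and IC is 8, then below
// StoresIC = 8 / 2 = 4 and LoadsIC = 8 / 1 = 8, so with
// EnableLoadStoreRuntimeInterleave the chosen count may be raised to 8 to
// keep the memory ports busy.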
5804 unsigned NumStores = Legal->getNumStores(); 5805 unsigned NumLoads = Legal->getNumLoads(); 5806 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5807 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5808 5809 // There is little point in interleaving for reductions containing selects 5810 // and compares when VF=1 since it may just create more overhead than it's 5811 // worth for loops with small trip counts. This is because we still have to 5812 // do the final reduction after the loop. 5813 bool HasSelectCmpReductions = 5814 HasReductions && 5815 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5816 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5817 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 5818 RdxDesc.getRecurrenceKind()); 5819 }); 5820 if (HasSelectCmpReductions) { 5821 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 5822 return 1; 5823 } 5824 5825 // If we have a scalar reduction (vector reductions are already dealt with 5826 // by this point), we can increase the critical path length if the loop 5827 // we're interleaving is inside another loop. For tree-wise reductions 5828 // set the limit to 2, and for ordered reductions it's best to disable 5829 // interleaving entirely. 5830 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5831 bool HasOrderedReductions = 5832 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5833 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5834 return RdxDesc.isOrdered(); 5835 }); 5836 if (HasOrderedReductions) { 5837 LLVM_DEBUG( 5838 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 5839 return 1; 5840 } 5841 5842 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5843 SmallIC = std::min(SmallIC, F); 5844 StoresIC = std::min(StoresIC, F); 5845 LoadsIC = std::min(LoadsIC, F); 5846 } 5847 5848 if (EnableLoadStoreRuntimeInterleave && 5849 std::max(StoresIC, LoadsIC) > SmallIC) { 5850 LLVM_DEBUG( 5851 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5852 return std::max(StoresIC, LoadsIC); 5853 } 5854 5855 // If there are scalar reductions and TTI has enabled aggressive 5856 // interleaving for reductions, we will interleave to expose ILP. 5857 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 5858 AggressivelyInterleaveReductions) { 5859 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5860 // Interleave no less than SmallIC but not as aggressive as the normal IC 5861 // to satisfy the rare situation when resources are too limited. 5862 return std::max(IC / 2, SmallIC); 5863 } else { 5864 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5865 return SmallIC; 5866 } 5867 } 5868 5869 // Interleave if this is a large loop (small loops are already dealt with by 5870 // this point) that could benefit from interleaving. 5871 if (AggressivelyInterleaveReductions) { 5872 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5873 return IC; 5874 } 5875 5876 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5877 return 1; 5878 } 5879 5880 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5881 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 5882 // This function calculates the register usage by measuring the highest number 5883 // of values that are alive at a single location. Obviously, this is a very 5884 // rough estimation. We scan the loop in a topological order in order and 5885 // assign a number to each instruction. 
We use RPO to ensure that defs are 5886 // met before their users. We assume that each instruction that has in-loop 5887 // users starts an interval. We record every time that an in-loop value is 5888 // used, so we have a list of the first and last occurrences of each 5889 // instruction. Next, we transpose this data structure into a multi map that 5890 // holds the list of intervals that *end* at a specific location. This multi 5891 // map allows us to perform a linear search. We scan the instructions linearly 5892 // and record each time that a new interval starts, by placing it in a set. 5893 // If we find this value in the multi-map then we remove it from the set. 5894 // The max register usage is the maximum size of the set. 5895 // We also search for instructions that are defined outside the loop, but are 5896 // used inside the loop. We need this number separately from the max-interval 5897 // usage number because when we unroll, loop-invariant values do not take 5898 // more register. 5899 LoopBlocksDFS DFS(TheLoop); 5900 DFS.perform(LI); 5901 5902 RegisterUsage RU; 5903 5904 // Each 'key' in the map opens a new interval. The values 5905 // of the map are the index of the 'last seen' usage of the 5906 // instruction that is the key. 5907 using IntervalMap = DenseMap<Instruction *, unsigned>; 5908 5909 // Maps instruction to its index. 5910 SmallVector<Instruction *, 64> IdxToInstr; 5911 // Marks the end of each interval. 5912 IntervalMap EndPoint; 5913 // Saves the list of instruction indices that are used in the loop. 5914 SmallPtrSet<Instruction *, 8> Ends; 5915 // Saves the list of values that are used in the loop but are 5916 // defined outside the loop, such as arguments and constants. 5917 SmallPtrSet<Value *, 8> LoopInvariants; 5918 5919 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5920 for (Instruction &I : BB->instructionsWithoutDebug()) { 5921 IdxToInstr.push_back(&I); 5922 5923 // Save the end location of each USE. 5924 for (Value *U : I.operands()) { 5925 auto *Instr = dyn_cast<Instruction>(U); 5926 5927 // Ignore non-instruction values such as arguments, constants, etc. 5928 if (!Instr) 5929 continue; 5930 5931 // If this instruction is outside the loop then record it and continue. 5932 if (!TheLoop->contains(Instr)) { 5933 LoopInvariants.insert(Instr); 5934 continue; 5935 } 5936 5937 // Overwrite previous end points. 5938 EndPoint[Instr] = IdxToInstr.size(); 5939 Ends.insert(Instr); 5940 } 5941 } 5942 } 5943 5944 // Saves the list of intervals that end with the index in 'key'. 5945 using InstrList = SmallVector<Instruction *, 2>; 5946 DenseMap<unsigned, InstrList> TransposeEnds; 5947 5948 // Transpose the EndPoints to a list of values that end at each index. 
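// For example (hypothetical values): if EndPoint maps A -> 3, B -> 3 and
// C -> 5, the transposed structure built below is
//   TransposeEnds = { 3 -> [A, B], 5 -> [C] },
// so when the linear scan reaches index 3 it can close the intervals of A
// and B with a single lookup instead of searching all of EndPoint.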
5949 for (auto &Interval : EndPoint) 5950 TransposeEnds[Interval.second].push_back(Interval.first); 5951 5952 SmallPtrSet<Instruction *, 8> OpenIntervals; 5953 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5954 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5955 5956 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5957 5958 const auto &TTICapture = TTI; 5959 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 5960 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 5961 return 0; 5962 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 5963 }; 5964 5965 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5966 Instruction *I = IdxToInstr[i]; 5967 5968 // Remove all of the instructions that end at this location. 5969 InstrList &List = TransposeEnds[i]; 5970 for (Instruction *ToRemove : List) 5971 OpenIntervals.erase(ToRemove); 5972 5973 // Ignore instructions that are never used within the loop. 5974 if (!Ends.count(I)) 5975 continue; 5976 5977 // Skip ignored values. 5978 if (ValuesToIgnore.count(I)) 5979 continue; 5980 5981 // For each VF find the maximum usage of registers. 5982 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5983 // Count the number of live intervals. 5984 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5985 5986 if (VFs[j].isScalar()) { 5987 for (auto Inst : OpenIntervals) { 5988 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5989 if (RegUsage.find(ClassID) == RegUsage.end()) 5990 RegUsage[ClassID] = 1; 5991 else 5992 RegUsage[ClassID] += 1; 5993 } 5994 } else { 5995 collectUniformsAndScalars(VFs[j]); 5996 for (auto Inst : OpenIntervals) { 5997 // Skip ignored values for VF > 1. 5998 if (VecValuesToIgnore.count(Inst)) 5999 continue; 6000 if (isScalarAfterVectorization(Inst, VFs[j])) { 6001 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6002 if (RegUsage.find(ClassID) == RegUsage.end()) 6003 RegUsage[ClassID] = 1; 6004 else 6005 RegUsage[ClassID] += 1; 6006 } else { 6007 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6008 if (RegUsage.find(ClassID) == RegUsage.end()) 6009 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6010 else 6011 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6012 } 6013 } 6014 } 6015 6016 for (auto& pair : RegUsage) { 6017 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6018 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6019 else 6020 MaxUsages[j][pair.first] = pair.second; 6021 } 6022 } 6023 6024 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6025 << OpenIntervals.size() << '\n'); 6026 6027 // Add the current instruction to the list of open intervals. 6028 OpenIntervals.insert(I); 6029 } 6030 6031 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6032 SmallMapVector<unsigned, unsigned, 4> Invariant; 6033 6034 for (auto Inst : LoopInvariants) { 6035 unsigned Usage = 6036 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6037 unsigned ClassID = 6038 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6039 if (Invariant.find(ClassID) == Invariant.end()) 6040 Invariant[ClassID] = Usage; 6041 else 6042 Invariant[ClassID] += Usage; 6043 } 6044 6045 LLVM_DEBUG({ 6046 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6047 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6048 << " item\n"; 6049 for (const auto &pair : MaxUsages[i]) { 6050 dbgs() << "LV(REG): RegisterClass: " 6051 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6052 << " registers\n"; 6053 } 6054 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6055 << " item\n"; 6056 for (const auto &pair : Invariant) { 6057 dbgs() << "LV(REG): RegisterClass: " 6058 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6059 << " registers\n"; 6060 } 6061 }); 6062 6063 RU.LoopInvariantRegs = Invariant; 6064 RU.MaxLocalUsers = MaxUsages[i]; 6065 RUs[i] = RU; 6066 } 6067 6068 return RUs; 6069 } 6070 6071 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 6072 ElementCount VF) { 6073 // TODO: Cost model for emulated masked load/store is completely 6074 // broken. This hack guides the cost model to use an artificially 6075 // high enough value to practically disable vectorization with such 6076 // operations, except where previously deployed legality hack allowed 6077 // using very low cost values. This is to avoid regressions coming simply 6078 // from moving "masked load/store" check from legality to cost model. 6079 // Masked Load/Gather emulation was previously never allowed. 6080 // Limited number of Masked Store/Scatter emulation was allowed. 6081 assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction"); 6082 return isa<LoadInst>(I) || 6083 (isa<StoreInst>(I) && 6084 NumPredStores > NumberOfStoresToPredicate); 6085 } 6086 6087 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6088 // If we aren't vectorizing the loop, or if we've already collected the 6089 // instructions to scalarize, there's nothing to do. Collection may already 6090 // have occurred if we have a user-selected VF and are now computing the 6091 // expected cost for interleaving. 6092 if (VF.isScalar() || VF.isZero() || 6093 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6094 return; 6095 6096 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6097 // not profitable to scalarize any instructions, the presence of VF in the 6098 // map will indicate that we've analyzed it already. 6099 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6100 6101 PredicatedBBsAfterVectorization[VF].clear(); 6102 6103 // Find all the instructions that are scalar with predication in the loop and 6104 // determine if it would be better to not if-convert the blocks they are in. 6105 // If so, we also record the instructions to scalarize. 6106 for (BasicBlock *BB : TheLoop->blocks()) { 6107 if (!blockNeedsPredicationForAnyReason(BB)) 6108 continue; 6109 for (Instruction &I : *BB) 6110 if (isScalarWithPredication(&I, VF)) { 6111 ScalarCostsTy ScalarCosts; 6112 // Do not apply discount if scalable, because that would lead to 6113 // invalid scalarization costs. 6114 // Do not apply discount logic if hacked cost is needed 6115 // for emulated masked memrefs. 
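// Illustration (hypothetical costs): if a predicated chain would cost 8 when
// vectorized but only 6 when scalarized (after scaling by the block
// probability), computePredInstDiscount returns 2; the discount is
// non-negative, so the chain's scalar costs are recorded in ScalarCostsVF
// and the instructions are scalarized rather than if-converted.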
6116 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && 6117 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6118 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6119 // Remember that BB will remain after vectorization. 6120 PredicatedBBsAfterVectorization[VF].insert(BB); 6121 } 6122 } 6123 } 6124 6125 int LoopVectorizationCostModel::computePredInstDiscount( 6126 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6127 assert(!isUniformAfterVectorization(PredInst, VF) && 6128 "Instruction marked uniform-after-vectorization will be predicated"); 6129 6130 // Initialize the discount to zero, meaning that the scalar version and the 6131 // vector version cost the same. 6132 InstructionCost Discount = 0; 6133 6134 // Holds instructions to analyze. The instructions we visit are mapped in 6135 // ScalarCosts. Those instructions are the ones that would be scalarized if 6136 // we find that the scalar version costs less. 6137 SmallVector<Instruction *, 8> Worklist; 6138 6139 // Returns true if the given instruction can be scalarized. 6140 auto canBeScalarized = [&](Instruction *I) -> bool { 6141 // We only attempt to scalarize instructions forming a single-use chain 6142 // from the original predicated block that would otherwise be vectorized. 6143 // Although not strictly necessary, we give up on instructions we know will 6144 // already be scalar to avoid traversing chains that are unlikely to be 6145 // beneficial. 6146 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6147 isScalarAfterVectorization(I, VF)) 6148 return false; 6149 6150 // If the instruction is scalar with predication, it will be analyzed 6151 // separately. We ignore it within the context of PredInst. 6152 if (isScalarWithPredication(I, VF)) 6153 return false; 6154 6155 // If any of the instruction's operands are uniform after vectorization, 6156 // the instruction cannot be scalarized. This prevents, for example, a 6157 // masked load from being scalarized. 6158 // 6159 // We assume we will only emit a value for lane zero of an instruction 6160 // marked uniform after vectorization, rather than VF identical values. 6161 // Thus, if we scalarize an instruction that uses a uniform, we would 6162 // create uses of values corresponding to the lanes we aren't emitting code 6163 // for. This behavior can be changed by allowing getScalarValue to clone 6164 // the lane zero values for uniforms rather than asserting. 6165 for (Use &U : I->operands()) 6166 if (auto *J = dyn_cast<Instruction>(U.get())) 6167 if (isUniformAfterVectorization(J, VF)) 6168 return false; 6169 6170 // Otherwise, we can scalarize the instruction. 6171 return true; 6172 }; 6173 6174 // Compute the expected cost discount from scalarizing the entire expression 6175 // feeding the predicated instruction. We currently only consider expressions 6176 // that are single-use instruction chains. 6177 Worklist.push_back(PredInst); 6178 while (!Worklist.empty()) { 6179 Instruction *I = Worklist.pop_back_val(); 6180 6181 // If we've already analyzed the instruction, there's nothing to do. 6182 if (ScalarCosts.find(I) != ScalarCosts.end()) 6183 continue; 6184 6185 // Compute the cost of the vector instruction. Note that this cost already 6186 // includes the scalarization overhead of the predicated instruction. 6187 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6188 6189 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6190 // the instruction as if it wasn't if-converted and instead remained in the 6191 // predicated block. We will scale this cost by block probability after 6192 // computing the scalarization overhead. 6193 InstructionCost ScalarCost = 6194 VF.getFixedValue() * 6195 getInstructionCost(I, ElementCount::getFixed(1)).first; 6196 6197 // Compute the scalarization overhead of needed insertelement instructions 6198 // and phi nodes. 6199 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6200 ScalarCost += TTI.getScalarizationOverhead( 6201 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6202 APInt::getAllOnes(VF.getFixedValue()), true, false); 6203 ScalarCost += 6204 VF.getFixedValue() * 6205 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6206 } 6207 6208 // Compute the scalarization overhead of needed extractelement 6209 // instructions. For each of the instruction's operands, if the operand can 6210 // be scalarized, add it to the worklist; otherwise, account for the 6211 // overhead. 6212 for (Use &U : I->operands()) 6213 if (auto *J = dyn_cast<Instruction>(U.get())) { 6214 assert(VectorType::isValidElementType(J->getType()) && 6215 "Instruction has non-scalar type"); 6216 if (canBeScalarized(J)) 6217 Worklist.push_back(J); 6218 else if (needsExtract(J, VF)) { 6219 ScalarCost += TTI.getScalarizationOverhead( 6220 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6221 APInt::getAllOnes(VF.getFixedValue()), false, true); 6222 } 6223 } 6224 6225 // Scale the total scalar cost by block probability. 6226 ScalarCost /= getReciprocalPredBlockProb(); 6227 6228 // Compute the discount. A non-negative discount means the vector version 6229 // of the instruction costs more, and scalarizing would be beneficial. 6230 Discount += VectorCost - ScalarCost; 6231 ScalarCosts[I] = ScalarCost; 6232 } 6233 6234 return *Discount.getValue(); 6235 } 6236 6237 LoopVectorizationCostModel::VectorizationCostTy 6238 LoopVectorizationCostModel::expectedCost( 6239 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6240 VectorizationCostTy Cost; 6241 6242 // For each block. 6243 for (BasicBlock *BB : TheLoop->blocks()) { 6244 VectorizationCostTy BlockCost; 6245 6246 // For each instruction in the old loop. 6247 for (Instruction &I : BB->instructionsWithoutDebug()) { 6248 // Skip ignored values. 6249 if (ValuesToIgnore.count(&I) || 6250 (VF.isVector() && VecValuesToIgnore.count(&I))) 6251 continue; 6252 6253 VectorizationCostTy C = getInstructionCost(&I, VF); 6254 6255 // Check if we should override the cost. 6256 if (C.first.isValid() && 6257 ForceTargetInstructionCost.getNumOccurrences() > 0) 6258 C.first = InstructionCost(ForceTargetInstructionCost); 6259 6260 // Keep a list of instructions with invalid costs. 6261 if (Invalid && !C.first.isValid()) 6262 Invalid->emplace_back(&I, VF); 6263 6264 BlockCost.first += C.first; 6265 BlockCost.second |= C.second; 6266 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6267 << " for VF " << VF << " For instruction: " << I 6268 << '\n'); 6269 } 6270 6271 // If we are vectorizing a predicated block, it will have been 6272 // if-converted. This means that the block's instructions (aside from 6273 // stores and instructions that may divide by zero) will now be 6274 // unconditionally executed. For the scalar case, we may not always execute 6275 // the predicated block, if it is an if-else block. Thus, scale the block's 6276 // cost by the probability of executing it. 
blockNeedsPredication from 6277 // Legal is used so as to not include all blocks in tail folded loops. 6278 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6279 BlockCost.first /= getReciprocalPredBlockProb(); 6280 6281 Cost.first += BlockCost.first; 6282 Cost.second |= BlockCost.second; 6283 } 6284 6285 return Cost; 6286 } 6287 6288 /// Gets Address Access SCEV after verifying that the access pattern 6289 /// is loop invariant except the induction variable dependence. 6290 /// 6291 /// This SCEV can be sent to the Target in order to estimate the address 6292 /// calculation cost. 6293 static const SCEV *getAddressAccessSCEV( 6294 Value *Ptr, 6295 LoopVectorizationLegality *Legal, 6296 PredicatedScalarEvolution &PSE, 6297 const Loop *TheLoop) { 6298 6299 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6300 if (!Gep) 6301 return nullptr; 6302 6303 // We are looking for a gep with all loop invariant indices except for one 6304 // which should be an induction variable. 6305 auto SE = PSE.getSE(); 6306 unsigned NumOperands = Gep->getNumOperands(); 6307 for (unsigned i = 1; i < NumOperands; ++i) { 6308 Value *Opd = Gep->getOperand(i); 6309 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6310 !Legal->isInductionVariable(Opd)) 6311 return nullptr; 6312 } 6313 6314 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6315 return PSE.getSCEV(Ptr); 6316 } 6317 6318 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6319 return Legal->hasStride(I->getOperand(0)) || 6320 Legal->hasStride(I->getOperand(1)); 6321 } 6322 6323 InstructionCost 6324 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6325 ElementCount VF) { 6326 assert(VF.isVector() && 6327 "Scalarization cost of instruction implies vectorization."); 6328 if (VF.isScalable()) 6329 return InstructionCost::getInvalid(); 6330 6331 Type *ValTy = getLoadStoreType(I); 6332 auto SE = PSE.getSE(); 6333 6334 unsigned AS = getLoadStoreAddressSpace(I); 6335 Value *Ptr = getLoadStorePointerOperand(I); 6336 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6337 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6338 // that it is being called from this specific place. 6339 6340 // Figure out whether the access is strided and get the stride value 6341 // if it's known in compile time 6342 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6343 6344 // Get the cost of the scalar memory instruction and address computation. 6345 InstructionCost Cost = 6346 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6347 6348 // Don't pass *I here, since it is scalar but will actually be part of a 6349 // vectorized loop where the user of it is a vectorized instruction. 6350 const Align Alignment = getLoadStoreAlignment(I); 6351 Cost += VF.getKnownMinValue() * 6352 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6353 AS, TTI::TCK_RecipThroughput); 6354 6355 // Get the overhead of the extractelement and insertelement instructions 6356 // we might create due to scalarization. 6357 Cost += getScalarizationOverhead(I, VF); 6358 6359 // If we have a predicated load/store, it will need extra i1 extracts and 6360 // conditional branches, but may not be executed for each vector lane. Scale 6361 // the cost by the probability of executing the predicated block. 
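// Illustration: assuming getReciprocalPredBlockProb() models a 50% block
// execution probability (i.e. returns 2), a scalarization cost of 40
// computed above would be halved to 20 here, before the i1 extract and
// branch overhead below is added back unconditionally.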
6362 if (isPredicatedInst(I, VF)) { 6363 Cost /= getReciprocalPredBlockProb(); 6364 6365 // Add the cost of an i1 extract and a branch 6366 auto *Vec_i1Ty = 6367 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6368 Cost += TTI.getScalarizationOverhead( 6369 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6370 /*Insert=*/false, /*Extract=*/true); 6371 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6372 6373 if (useEmulatedMaskMemRefHack(I, VF)) 6374 // Artificially setting to a high enough value to practically disable 6375 // vectorization with such operations. 6376 Cost = 3000000; 6377 } 6378 6379 return Cost; 6380 } 6381 6382 InstructionCost 6383 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6384 ElementCount VF) { 6385 Type *ValTy = getLoadStoreType(I); 6386 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6387 Value *Ptr = getLoadStorePointerOperand(I); 6388 unsigned AS = getLoadStoreAddressSpace(I); 6389 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6390 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6391 6392 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6393 "Stride should be 1 or -1 for consecutive memory access"); 6394 const Align Alignment = getLoadStoreAlignment(I); 6395 InstructionCost Cost = 0; 6396 if (Legal->isMaskRequired(I)) 6397 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6398 CostKind); 6399 else 6400 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6401 CostKind, I); 6402 6403 bool Reverse = ConsecutiveStride < 0; 6404 if (Reverse) 6405 Cost += 6406 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6407 return Cost; 6408 } 6409 6410 InstructionCost 6411 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6412 ElementCount VF) { 6413 assert(Legal->isUniformMemOp(*I)); 6414 6415 Type *ValTy = getLoadStoreType(I); 6416 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6417 const Align Alignment = getLoadStoreAlignment(I); 6418 unsigned AS = getLoadStoreAddressSpace(I); 6419 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6420 if (isa<LoadInst>(I)) { 6421 return TTI.getAddressComputationCost(ValTy) + 6422 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6423 CostKind) + 6424 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6425 } 6426 StoreInst *SI = cast<StoreInst>(I); 6427 6428 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6429 return TTI.getAddressComputationCost(ValTy) + 6430 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6431 CostKind) + 6432 (isLoopInvariantStoreValue 6433 ? 
0 6434 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6435 VF.getKnownMinValue() - 1)); 6436 } 6437 6438 InstructionCost 6439 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6440 ElementCount VF) { 6441 Type *ValTy = getLoadStoreType(I); 6442 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6443 const Align Alignment = getLoadStoreAlignment(I); 6444 const Value *Ptr = getLoadStorePointerOperand(I); 6445 6446 return TTI.getAddressComputationCost(VectorTy) + 6447 TTI.getGatherScatterOpCost( 6448 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6449 TargetTransformInfo::TCK_RecipThroughput, I); 6450 } 6451 6452 InstructionCost 6453 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6454 ElementCount VF) { 6455 // TODO: Once we have support for interleaving with scalable vectors 6456 // we can calculate the cost properly here. 6457 if (VF.isScalable()) 6458 return InstructionCost::getInvalid(); 6459 6460 Type *ValTy = getLoadStoreType(I); 6461 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6462 unsigned AS = getLoadStoreAddressSpace(I); 6463 6464 auto Group = getInterleavedAccessGroup(I); 6465 assert(Group && "Fail to get an interleaved access group."); 6466 6467 unsigned InterleaveFactor = Group->getFactor(); 6468 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6469 6470 // Holds the indices of existing members in the interleaved group. 6471 SmallVector<unsigned, 4> Indices; 6472 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6473 if (Group->getMember(IF)) 6474 Indices.push_back(IF); 6475 6476 // Calculate the cost of the whole interleaved group. 6477 bool UseMaskForGaps = 6478 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6479 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6480 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6481 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6482 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6483 6484 if (Group->isReverse()) { 6485 // TODO: Add support for reversed masked interleaved access. 6486 assert(!Legal->isMaskRequired(I) && 6487 "Reverse masked interleaved access not supported."); 6488 Cost += 6489 Group->getNumMembers() * 6490 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6491 } 6492 return Cost; 6493 } 6494 6495 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( 6496 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6497 using namespace llvm::PatternMatch; 6498 // Early exit for no inloop reductions 6499 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6500 return None; 6501 auto *VectorTy = cast<VectorType>(Ty); 6502 6503 // We are looking for a pattern of, and finding the minimal acceptable cost: 6504 // reduce(mul(ext(A), ext(B))) or 6505 // reduce(mul(A, B)) or 6506 // reduce(ext(A)) or 6507 // reduce(A). 6508 // The basic idea is that we walk down the tree to do that, finding the root 6509 // reduction instruction in InLoopReductionImmediateChains. From there we find 6510 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6511 // of the components. If the reduction cost is lower then we return it for the 6512 // reduction instruction and 0 for the other instructions in the pattern. If 6513 // it is not we return an invalid cost specifying the orignal cost method 6514 // should be used. 
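// For example, an in-loop integer add reduction of the shape (illustrative
// IR, not from any particular test):
//   %a.ext = sext i8 %a to i32
//   %b.ext = sext i8 %b to i32
//   %mul   = mul i32 %a.ext, %b.ext
//   %acc   = add i32 %phi, %mul
// is treated as reduce(mul(ext(A), ext(B))). If the target reports a cheaper
// cost for the fused extended-add reduction than for the separate ext/mul/add
// components, that cost is returned for the root of the chain and 0 for the
// other instructions in the pattern, as described above.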
6515 Instruction *RetI = I; 6516 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 6517 if (!RetI->hasOneUser()) 6518 return None; 6519 RetI = RetI->user_back(); 6520 } 6521 if (match(RetI, m_Mul(m_Value(), m_Value())) && 6522 RetI->user_back()->getOpcode() == Instruction::Add) { 6523 if (!RetI->hasOneUser()) 6524 return None; 6525 RetI = RetI->user_back(); 6526 } 6527 6528 // Test if the found instruction is a reduction, and if not return an invalid 6529 // cost specifying the parent to use the original cost modelling. 6530 if (!InLoopReductionImmediateChains.count(RetI)) 6531 return None; 6532 6533 // Find the reduction this chain is a part of and calculate the basic cost of 6534 // the reduction on its own. 6535 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6536 Instruction *ReductionPhi = LastChain; 6537 while (!isa<PHINode>(ReductionPhi)) 6538 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6539 6540 const RecurrenceDescriptor &RdxDesc = 6541 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6542 6543 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6544 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6545 6546 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6547 // normal fmul instruction to the cost of the fadd reduction. 6548 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6549 BaseCost += 6550 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6551 6552 // If we're using ordered reductions then we can just return the base cost 6553 // here, since getArithmeticReductionCost calculates the full ordered 6554 // reduction cost when FP reassociation is not allowed. 6555 if (useOrderedReductions(RdxDesc)) 6556 return BaseCost; 6557 6558 // Get the operand that was not the reduction chain and match it to one of the 6559 // patterns, returning the better cost if it is found. 6560 Instruction *RedOp = RetI->getOperand(1) == LastChain 6561 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6562 : dyn_cast<Instruction>(RetI->getOperand(1)); 6563 6564 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6565 6566 Instruction *Op0, *Op1; 6567 if (RedOp && 6568 match(RedOp, 6569 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6570 match(Op0, m_ZExtOrSExt(m_Value())) && 6571 Op0->getOpcode() == Op1->getOpcode() && 6572 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6573 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6574 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6575 6576 // Matched reduce(ext(mul(ext(A), ext(B))) 6577 // Note that the extend opcodes need to all match, or if A==B they will have 6578 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6579 // which is equally fine. 
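// Cost check illustration (hypothetical TTI numbers): if each inner extend
// costs 1, the mul costs 1, the outer extend (Ext2Cost) costs 1 and the plain
// add reduction (BaseCost) costs 4, the fused cost below is only preferred
// when the extended MLA reduction cost is less than 2 * 1 + 1 + 1 + 4 = 8.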
6580 bool IsUnsigned = isa<ZExtInst>(Op0); 6581 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6582 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6583 6584 InstructionCost ExtCost = 6585 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6586 TTI::CastContextHint::None, CostKind, Op0); 6587 InstructionCost MulCost = 6588 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6589 InstructionCost Ext2Cost = 6590 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6591 TTI::CastContextHint::None, CostKind, RedOp); 6592 6593 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6594 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6595 CostKind); 6596 6597 if (RedCost.isValid() && 6598 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6599 return I == RetI ? RedCost : 0; 6600 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6601 !TheLoop->isLoopInvariant(RedOp)) { 6602 // Matched reduce(ext(A)) 6603 bool IsUnsigned = isa<ZExtInst>(RedOp); 6604 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6605 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6606 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6607 CostKind); 6608 6609 InstructionCost ExtCost = 6610 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6611 TTI::CastContextHint::None, CostKind, RedOp); 6612 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6613 return I == RetI ? RedCost : 0; 6614 } else if (RedOp && 6615 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6616 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6617 Op0->getOpcode() == Op1->getOpcode() && 6618 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6619 bool IsUnsigned = isa<ZExtInst>(Op0); 6620 Type *Op0Ty = Op0->getOperand(0)->getType(); 6621 Type *Op1Ty = Op1->getOperand(0)->getType(); 6622 Type *LargestOpTy = 6623 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6624 : Op0Ty; 6625 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6626 6627 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 6628 // different sizes. We take the largest type as the ext to reduce, and add 6629 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6630 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6631 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6632 TTI::CastContextHint::None, CostKind, Op0); 6633 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6634 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6635 TTI::CastContextHint::None, CostKind, Op1); 6636 InstructionCost MulCost = 6637 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6638 6639 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6640 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6641 CostKind); 6642 InstructionCost ExtraExtCost = 0; 6643 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6644 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6645 ExtraExtCost = TTI.getCastInstrCost( 6646 ExtraExtOp->getOpcode(), ExtType, 6647 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6648 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6649 } 6650 6651 if (RedCost.isValid() && 6652 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6653 return I == RetI ? 
RedCost : 0; 6654 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6655 // Matched reduce(mul()) 6656 InstructionCost MulCost = 6657 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6658 6659 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6660 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 6661 CostKind); 6662 6663 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6664 return I == RetI ? RedCost : 0; 6665 } 6666 } 6667 6668 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 6669 } 6670 6671 InstructionCost 6672 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6673 ElementCount VF) { 6674 // Calculate scalar cost only. Vectorization cost should be ready at this 6675 // moment. 6676 if (VF.isScalar()) { 6677 Type *ValTy = getLoadStoreType(I); 6678 const Align Alignment = getLoadStoreAlignment(I); 6679 unsigned AS = getLoadStoreAddressSpace(I); 6680 6681 return TTI.getAddressComputationCost(ValTy) + 6682 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6683 TTI::TCK_RecipThroughput, I); 6684 } 6685 return getWideningCost(I, VF); 6686 } 6687 6688 LoopVectorizationCostModel::VectorizationCostTy 6689 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6690 ElementCount VF) { 6691 // If we know that this instruction will remain uniform, check the cost of 6692 // the scalar version. 6693 if (isUniformAfterVectorization(I, VF)) 6694 VF = ElementCount::getFixed(1); 6695 6696 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6697 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6698 6699 // Forced scalars do not have any scalarization overhead. 6700 auto ForcedScalar = ForcedScalars.find(VF); 6701 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6702 auto InstSet = ForcedScalar->second; 6703 if (InstSet.count(I)) 6704 return VectorizationCostTy( 6705 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6706 VF.getKnownMinValue()), 6707 false); 6708 } 6709 6710 Type *VectorTy; 6711 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6712 6713 bool TypeNotScalarized = false; 6714 if (VF.isVector() && VectorTy->isVectorTy()) { 6715 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) { 6716 if (VF.isScalable()) 6717 // <vscale x 1 x iN> is assumed to be profitable over iN because 6718 // scalable registers are a distinct register class from scalar ones. 6719 // If we ever find a target which wants to lower scalable vectors 6720 // back to scalars, we'll need to update this code to explicitly 6721 // ask TTI about the register class uses for each part. 6722 TypeNotScalarized = NumParts <= VF.getKnownMinValue(); 6723 else 6724 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 6725 } else 6726 C = InstructionCost::getInvalid(); 6727 } 6728 return VectorizationCostTy(C, TypeNotScalarized); 6729 } 6730 6731 InstructionCost 6732 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6733 ElementCount VF) const { 6734 6735 // There is no mechanism yet to create a scalable scalarization loop, 6736 // so this is currently Invalid. 
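// For fixed-width VFs the overhead is modelled with per-element insert and
// extract queries; e.g. (illustration) for VF = 4 and an instruction
// producing an i32 that is used by vectorized code, the code below asks for
// TTI.getScalarizationOverhead(<4 x i32>, APInt::getAllOnes(4),
// /*Insert=*/true, /*Extract=*/false) for the result, plus the extraction
// overhead of any operands that themselves need to be extracted.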
6737 if (VF.isScalable()) 6738 return InstructionCost::getInvalid(); 6739 6740 if (VF.isScalar()) 6741 return 0; 6742 6743 InstructionCost Cost = 0; 6744 Type *RetTy = ToVectorTy(I->getType(), VF); 6745 if (!RetTy->isVoidTy() && 6746 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6747 Cost += TTI.getScalarizationOverhead( 6748 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 6749 false); 6750 6751 // Some targets keep addresses scalar. 6752 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6753 return Cost; 6754 6755 // Some targets support efficient element stores. 6756 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6757 return Cost; 6758 6759 // Collect operands to consider. 6760 CallInst *CI = dyn_cast<CallInst>(I); 6761 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 6762 6763 // Skip operands that do not require extraction/scalarization and do not incur 6764 // any overhead. 6765 SmallVector<Type *> Tys; 6766 for (auto *V : filterExtractingOperands(Ops, VF)) 6767 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 6768 return Cost + TTI.getOperandsScalarizationOverhead( 6769 filterExtractingOperands(Ops, VF), Tys); 6770 } 6771 6772 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6773 if (VF.isScalar()) 6774 return; 6775 NumPredStores = 0; 6776 for (BasicBlock *BB : TheLoop->blocks()) { 6777 // For each instruction in the old loop. 6778 for (Instruction &I : *BB) { 6779 Value *Ptr = getLoadStorePointerOperand(&I); 6780 if (!Ptr) 6781 continue; 6782 6783 // TODO: We should generate better code and update the cost model for 6784 // predicated uniform stores. Today they are treated as any other 6785 // predicated store (see added test cases in 6786 // invariant-store-vectorization.ll). 6787 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 6788 NumPredStores++; 6789 6790 if (Legal->isUniformMemOp(I)) { 6791 // TODO: Avoid replicating loads and stores instead of 6792 // relying on instcombine to remove them. 6793 // Load: Scalar load + broadcast 6794 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6795 InstructionCost Cost; 6796 if (isa<StoreInst>(&I) && VF.isScalable() && 6797 isLegalGatherOrScatter(&I, VF)) { 6798 Cost = getGatherScatterCost(&I, VF); 6799 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 6800 } else { 6801 Cost = getUniformMemOpCost(&I, VF); 6802 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6803 } 6804 continue; 6805 } 6806 6807 // We assume that widening is the best solution when possible. 6808 if (memoryInstructionCanBeWidened(&I, VF)) { 6809 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6810 int ConsecutiveStride = Legal->isConsecutivePtr( 6811 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 6812 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6813 "Expected consecutive stride."); 6814 InstWidening Decision = 6815 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6816 setWideningDecision(&I, VF, Decision, Cost); 6817 continue; 6818 } 6819 6820 // Choose between Interleaving, Gather/Scatter or Scalarization. 6821 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 6822 unsigned NumAccesses = 1; 6823 if (isAccessInterleaved(&I)) { 6824 auto Group = getInterleavedAccessGroup(&I); 6825 assert(Group && "Fail to get an interleaved access group."); 6826 6827 // Make one decision for the whole group. 
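// Illustration (hypothetical costs): for a group with 2 members where the
// interleaved cost is 6, the gather/scatter cost is 8 per access (16 for the
// group) and the scalarization cost is 10 per access (20 for the group), the
// comparison below picks CM_Interleave with cost 6; the decision is recorded
// for every member, but the cost itself is only attributed to a single
// member of the group.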
6828         if (getWideningDecision(&I, VF) != CM_Unknown)
6829           continue;
6830
6831         NumAccesses = Group->getNumMembers();
6832         if (interleavedAccessCanBeWidened(&I, VF))
6833           InterleaveCost = getInterleaveGroupCost(&I, VF);
6834       }
6835
6836       InstructionCost GatherScatterCost =
6837           isLegalGatherOrScatter(&I, VF)
6838               ? getGatherScatterCost(&I, VF) * NumAccesses
6839               : InstructionCost::getInvalid();
6840
6841       InstructionCost ScalarizationCost =
6842           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6843
6844       // Choose the better solution for the current VF, write down this
6845       // decision and use it during vectorization.
6846       InstructionCost Cost;
6847       InstWidening Decision;
6848       if (InterleaveCost <= GatherScatterCost &&
6849           InterleaveCost < ScalarizationCost) {
6850         Decision = CM_Interleave;
6851         Cost = InterleaveCost;
6852       } else if (GatherScatterCost < ScalarizationCost) {
6853         Decision = CM_GatherScatter;
6854         Cost = GatherScatterCost;
6855       } else {
6856         Decision = CM_Scalarize;
6857         Cost = ScalarizationCost;
6858       }
6859       // If the instruction belongs to an interleave group, the whole group
6860       // receives the same decision. The whole group receives the cost, but
6861       // the cost will actually be assigned to one instruction.
6862       if (auto Group = getInterleavedAccessGroup(&I))
6863         setWideningDecision(Group, VF, Decision, Cost);
6864       else
6865         setWideningDecision(&I, VF, Decision, Cost);
6866     }
6867   }
6868
6869   // Make sure that any load of address and any other address computation
6870   // remains scalar unless there is gather/scatter support. This avoids
6871   // inevitable extracts into address registers, and also has the benefit of
6872   // activating LSR more, since that pass can't optimize vectorized
6873   // addresses.
6874   if (TTI.prefersVectorizedAddressing())
6875     return;
6876
6877   // Start with all scalar pointer uses.
6878   SmallPtrSet<Instruction *, 8> AddrDefs;
6879   for (BasicBlock *BB : TheLoop->blocks())
6880     for (Instruction &I : *BB) {
6881       Instruction *PtrDef =
6882           dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6883       if (PtrDef && TheLoop->contains(PtrDef) &&
6884           getWideningDecision(&I, VF) != CM_GatherScatter)
6885         AddrDefs.insert(PtrDef);
6886     }
6887
6888   // Add all instructions used to generate the addresses.
6889   SmallVector<Instruction *, 4> Worklist;
6890   append_range(Worklist, AddrDefs);
6891   while (!Worklist.empty()) {
6892     Instruction *I = Worklist.pop_back_val();
6893     for (auto &Op : I->operands())
6894       if (auto *InstOp = dyn_cast<Instruction>(Op))
6895         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6896             AddrDefs.insert(InstOp).second)
6897           Worklist.push_back(InstOp);
6898   }
6899
6900   for (auto *I : AddrDefs) {
6901     if (isa<LoadInst>(I)) {
6902       // Setting the desired widening decision should ideally be handled by
6903       // the cost functions, but since this involves the task of finding out
6904       // if the loaded register is involved in an address computation, it is
6905       // instead changed here when we know this is the case.
6906       InstWidening Decision = getWideningDecision(I, VF);
6907       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6908         // Scalarize a widened load of address.
6909         setWideningDecision(
6910             I, VF, CM_Scalarize,
6911             (VF.getKnownMinValue() *
6912              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6913       else if (auto Group = getInterleavedAccessGroup(I)) {
6914         // Scalarize an interleave group of address loads.
6915         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6916           if (Instruction *Member = Group->getMember(I))
6917             setWideningDecision(
6918                 Member, VF, CM_Scalarize,
6919                 (VF.getKnownMinValue() *
6920                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6921         }
6922       }
6923     } else
6924       // Make sure I gets scalarized and receives a cost estimate without
6925       // scalarization overhead.
6926       ForcedScalars[VF].insert(I);
6927   }
6928 }
6929
6930 InstructionCost
6931 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6932                                                Type *&VectorTy) {
6933   Type *RetTy = I->getType();
6934   if (canTruncateToMinimalBitwidth(I, VF))
6935     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6936   auto SE = PSE.getSE();
6937   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6938
6939   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6940                                                 ElementCount VF) -> bool {
6941     if (VF.isScalar())
6942       return true;
6943
6944     auto Scalarized = InstsToScalarize.find(VF);
6945     assert(Scalarized != InstsToScalarize.end() &&
6946            "VF not yet analyzed for scalarization profitability");
6947     return !Scalarized->second.count(I) &&
6948            llvm::all_of(I->users(), [&](User *U) {
6949              auto *UI = cast<Instruction>(U);
6950              return !Scalarized->second.count(UI);
6951            });
6952   };
6953   (void) hasSingleCopyAfterVectorization;
6954
6955   if (isScalarAfterVectorization(I, VF)) {
6956     // With the exception of GEPs and PHIs, after scalarization there should
6957     // only be one copy of the instruction generated in the loop. This is
6958     // because the VF is either 1, or any instructions that need scalarizing
6959     // have already been dealt with by the time we get here. As a result,
6960     // we don't have to multiply the instruction cost by VF.
6961     assert(I->getOpcode() == Instruction::GetElementPtr ||
6962            I->getOpcode() == Instruction::PHI ||
6963            (I->getOpcode() == Instruction::BitCast &&
6964             I->getType()->isPointerTy()) ||
6965            hasSingleCopyAfterVectorization(I, VF));
6966     VectorTy = RetTy;
6967   } else
6968     VectorTy = ToVectorTy(RetTy, VF);
6969
6970   // TODO: We need to estimate the cost of intrinsic calls.
6971   switch (I->getOpcode()) {
6972   case Instruction::GetElementPtr:
6973     // We mark this instruction as zero-cost because the cost of GEPs in
6974     // vectorized code depends on whether the corresponding memory instruction
6975     // is scalarized or not. Therefore, we handle GEPs with the memory
6976     // instruction cost.
6977     return 0;
6978   case Instruction::Br: {
6979     // In cases of scalarized and predicated instructions, there will be VF
6980     // predicated blocks in the vectorized loop. Each branch around these
6981     // blocks also requires an extract of its vector compare i1 element.
6982     bool ScalarPredicatedBB = false;
6983     BranchInst *BI = cast<BranchInst>(I);
6984     if (VF.isVector() && BI->isConditional() &&
6985         (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6986          PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
6987       ScalarPredicatedBB = true;
6988
6989     if (ScalarPredicatedBB) {
6990       // Not possible to scalarize scalable vector with predicated instructions.
6991       if (VF.isScalable())
6992         return InstructionCost::getInvalid();
6993       // Return cost for branches around scalarized and predicated blocks.
6994 auto *Vec_i1Ty = 6995 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6996 return ( 6997 TTI.getScalarizationOverhead( 6998 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + 6999 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7000 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7001 // The back-edge branch will remain, as will all scalar branches. 7002 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7003 else 7004 // This branch will be eliminated by if-conversion. 7005 return 0; 7006 // Note: We currently assume zero cost for an unconditional branch inside 7007 // a predicated block since it will become a fall-through, although we 7008 // may decide in the future to call TTI for all branches. 7009 } 7010 case Instruction::PHI: { 7011 auto *Phi = cast<PHINode>(I); 7012 7013 // First-order recurrences are replaced by vector shuffles inside the loop. 7014 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7015 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7016 return TTI.getShuffleCost( 7017 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7018 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7019 7020 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7021 // converted into select instructions. We require N - 1 selects per phi 7022 // node, where N is the number of incoming values. 7023 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7024 return (Phi->getNumIncomingValues() - 1) * 7025 TTI.getCmpSelInstrCost( 7026 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7027 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7028 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7029 7030 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7031 } 7032 case Instruction::UDiv: 7033 case Instruction::SDiv: 7034 case Instruction::URem: 7035 case Instruction::SRem: 7036 // If we have a predicated instruction, it may not be executed for each 7037 // vector lane. Get the scalarization cost and scale this amount by the 7038 // probability of executing the predicated block. If the instruction is not 7039 // predicated, we fall through to the next case. 7040 if (VF.isVector() && isScalarWithPredication(I, VF)) { 7041 InstructionCost Cost = 0; 7042 7043 // These instructions have a non-void type, so account for the phi nodes 7044 // that we will create. This cost is likely to be zero. The phi node 7045 // cost, if any, should be scaled by the block probability because it 7046 // models a copy at the end of each predicated block. 7047 Cost += VF.getKnownMinValue() * 7048 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7049 7050 // The cost of the non-predicated instruction. 7051 Cost += VF.getKnownMinValue() * 7052 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7053 7054 // The cost of insertelement and extractelement instructions needed for 7055 // scalarization. 7056 Cost += getScalarizationOverhead(I, VF); 7057 7058 // Scale the cost by the probability of executing the predicated blocks. 7059 // This assumes the predicated block for each vector lane is equally 7060 // likely. 
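      // (Worked example, assuming getReciprocalPredBlockProb() models a 50%
      //  block probability, i.e. returns 2: with VF = 4, a PHI cost of 1, an
      //  arithmetic cost of 1 and a scalarization overhead of 8, the unscaled
      //  cost is 4 * 1 + 4 * 1 + 8 = 16, which the division below halves to 8.)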
7061 return Cost / getReciprocalPredBlockProb(); 7062 } 7063 LLVM_FALLTHROUGH; 7064 case Instruction::Add: 7065 case Instruction::FAdd: 7066 case Instruction::Sub: 7067 case Instruction::FSub: 7068 case Instruction::Mul: 7069 case Instruction::FMul: 7070 case Instruction::FDiv: 7071 case Instruction::FRem: 7072 case Instruction::Shl: 7073 case Instruction::LShr: 7074 case Instruction::AShr: 7075 case Instruction::And: 7076 case Instruction::Or: 7077 case Instruction::Xor: { 7078 // Since we will replace the stride by 1 the multiplication should go away. 7079 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7080 return 0; 7081 7082 // Detect reduction patterns 7083 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7084 return *RedCost; 7085 7086 // Certain instructions can be cheaper to vectorize if they have a constant 7087 // second vector operand. One example of this are shifts on x86. 7088 Value *Op2 = I->getOperand(1); 7089 TargetTransformInfo::OperandValueProperties Op2VP; 7090 TargetTransformInfo::OperandValueKind Op2VK = 7091 TTI.getOperandInfo(Op2, Op2VP); 7092 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7093 Op2VK = TargetTransformInfo::OK_UniformValue; 7094 7095 SmallVector<const Value *, 4> Operands(I->operand_values()); 7096 return TTI.getArithmeticInstrCost( 7097 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7098 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7099 } 7100 case Instruction::FNeg: { 7101 return TTI.getArithmeticInstrCost( 7102 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7103 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7104 TargetTransformInfo::OP_None, I->getOperand(0), I); 7105 } 7106 case Instruction::Select: { 7107 SelectInst *SI = cast<SelectInst>(I); 7108 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7109 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7110 7111 const Value *Op0, *Op1; 7112 using namespace llvm::PatternMatch; 7113 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7114 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7115 // select x, y, false --> x & y 7116 // select x, true, y --> x | y 7117 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7118 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7119 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7120 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7121 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7122 Op1->getType()->getScalarSizeInBits() == 1); 7123 7124 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7125 return TTI.getArithmeticInstrCost( 7126 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7127 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7128 } 7129 7130 Type *CondTy = SI->getCondition()->getType(); 7131 if (!ScalarCond) 7132 CondTy = VectorType::get(CondTy, VF); 7133 7134 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7135 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7136 Pred = Cmp->getPredicate(); 7137 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7138 CostKind, I); 7139 } 7140 case Instruction::ICmp: 7141 case Instruction::FCmp: { 7142 Type *ValTy = I->getOperand(0)->getType(); 7143 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7144 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7145 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7146 VectorTy = ToVectorTy(ValTy, VF); 7147 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7148 cast<CmpInst>(I)->getPredicate(), CostKind, 7149 I); 7150 } 7151 case Instruction::Store: 7152 case Instruction::Load: { 7153 ElementCount Width = VF; 7154 if (Width.isVector()) { 7155 InstWidening Decision = getWideningDecision(I, Width); 7156 assert(Decision != CM_Unknown && 7157 "CM decision should be taken at this point"); 7158 if (Decision == CM_Scalarize) { 7159 if (VF.isScalable() && isa<StoreInst>(I)) 7160 // We can't scalarize a scalable vector store (even a uniform one 7161 // currently), return an invalid cost so as to prevent vectorization. 7162 return InstructionCost::getInvalid(); 7163 Width = ElementCount::getFixed(1); 7164 } 7165 } 7166 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7167 return getMemoryInstructionCost(I, VF); 7168 } 7169 case Instruction::BitCast: 7170 if (I->getType()->isPointerTy()) 7171 return 0; 7172 LLVM_FALLTHROUGH; 7173 case Instruction::ZExt: 7174 case Instruction::SExt: 7175 case Instruction::FPToUI: 7176 case Instruction::FPToSI: 7177 case Instruction::FPExt: 7178 case Instruction::PtrToInt: 7179 case Instruction::IntToPtr: 7180 case Instruction::SIToFP: 7181 case Instruction::UIToFP: 7182 case Instruction::Trunc: 7183 case Instruction::FPTrunc: { 7184 // Computes the CastContextHint from a Load/Store instruction. 7185 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7186 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7187 "Expected a load or a store!"); 7188 7189 if (VF.isScalar() || !TheLoop->contains(I)) 7190 return TTI::CastContextHint::Normal; 7191 7192 switch (getWideningDecision(I, VF)) { 7193 case LoopVectorizationCostModel::CM_GatherScatter: 7194 return TTI::CastContextHint::GatherScatter; 7195 case LoopVectorizationCostModel::CM_Interleave: 7196 return TTI::CastContextHint::Interleave; 7197 case LoopVectorizationCostModel::CM_Scalarize: 7198 case LoopVectorizationCostModel::CM_Widen: 7199 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7200 : TTI::CastContextHint::Normal; 7201 case LoopVectorizationCostModel::CM_Widen_Reverse: 7202 return TTI::CastContextHint::Reversed; 7203 case LoopVectorizationCostModel::CM_Unknown: 7204 llvm_unreachable("Instr did not go through cost modelling?"); 7205 } 7206 7207 llvm_unreachable("Unhandled case!"); 7208 }; 7209 7210 unsigned Opcode = I->getOpcode(); 7211 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7212 // For Trunc, the context is the only user, which must be a StoreInst. 
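    // E.g. (illustrative):
    //   %t = trunc i32 %x to i16
    //   store i16 %t, i16* %p
    // takes its cast-context hint from the store's widening decision, while a
    // zext/sext/fpext fed directly by a load takes it from the load's decision.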
7213 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7214 if (I->hasOneUse()) 7215 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7216 CCH = ComputeCCH(Store); 7217 } 7218 // For Z/Sext, the context is the operand, which must be a LoadInst. 7219 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7220 Opcode == Instruction::FPExt) { 7221 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7222 CCH = ComputeCCH(Load); 7223 } 7224 7225 // We optimize the truncation of induction variables having constant 7226 // integer steps. The cost of these truncations is the same as the scalar 7227 // operation. 7228 if (isOptimizableIVTruncate(I, VF)) { 7229 auto *Trunc = cast<TruncInst>(I); 7230 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7231 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7232 } 7233 7234 // Detect reduction patterns 7235 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7236 return *RedCost; 7237 7238 Type *SrcScalarTy = I->getOperand(0)->getType(); 7239 Type *SrcVecTy = 7240 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7241 if (canTruncateToMinimalBitwidth(I, VF)) { 7242 // This cast is going to be shrunk. This may remove the cast or it might 7243 // turn it into slightly different cast. For example, if MinBW == 16, 7244 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7245 // 7246 // Calculate the modified src and dest types. 7247 Type *MinVecTy = VectorTy; 7248 if (Opcode == Instruction::Trunc) { 7249 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7250 VectorTy = 7251 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7252 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7253 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7254 VectorTy = 7255 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7256 } 7257 } 7258 7259 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7260 } 7261 case Instruction::Call: { 7262 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7263 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7264 return *RedCost; 7265 bool NeedToScalarize; 7266 CallInst *CI = cast<CallInst>(I); 7267 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7268 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7269 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7270 return std::min(CallCost, IntrinsicCost); 7271 } 7272 return CallCost; 7273 } 7274 case Instruction::ExtractValue: 7275 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7276 case Instruction::Alloca: 7277 // We cannot easily widen alloca to a scalable alloca, as 7278 // the result would need to be a vector of pointers. 7279 if (VF.isScalable()) 7280 return InstructionCost::getInvalid(); 7281 LLVM_FALLTHROUGH; 7282 default: 7283 // This opcode is unknown. Assume that it is the same as 'mul'. 7284 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7285 } // end of switch. 
7286 }
7287
7288 char LoopVectorize::ID = 0;
7289
7290 static const char lv_name[] = "Loop Vectorization";
7291
7292 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7293 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7294 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7295 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7296 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7297 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7298 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7299 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7300 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7301 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7302 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7303 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7304 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7305 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7306 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7307 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7308
7309 namespace llvm {
7310
7311 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7312
7313 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7314                               bool VectorizeOnlyWhenForced) {
7315   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7316 }
7317
7318 } // end namespace llvm
7319
7320 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7321   // Check if the pointer operand of a load or store instruction is
7322   // consecutive.
7323   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7324     return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
7325   return false;
7326 }
7327
7328 void LoopVectorizationCostModel::collectValuesToIgnore() {
7329   // Ignore ephemeral values.
7330   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7331
7332   // Find all stores to invariant variables. Since they are going to sink
7333   // outside the loop, we do not need to calculate their cost.
7334   for (BasicBlock *BB : TheLoop->blocks())
7335     for (Instruction &I : *BB) {
7336       StoreInst *SI;
7337       if ((SI = dyn_cast<StoreInst>(&I)) &&
7338           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7339         ValuesToIgnore.insert(&I);
7340     }
7341
7342   // Ignore type-promoting instructions we identified during reduction
7343   // detection.
7344   for (auto &Reduction : Legal->getReductionVars()) {
7345     const RecurrenceDescriptor &RedDes = Reduction.second;
7346     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7347     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7348   }
7349   // Ignore type-casting instructions we identified during induction
7350   // detection.
7351   for (auto &Induction : Legal->getInductionVars()) {
7352     const InductionDescriptor &IndDes = Induction.second;
7353     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7354     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7355   }
7356 }
7357
7358 void LoopVectorizationCostModel::collectInLoopReductions() {
7359   for (auto &Reduction : Legal->getReductionVars()) {
7360     PHINode *Phi = Reduction.first;
7361     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7362
7363     // We don't collect reductions that are type promoted (yet).
7364     if (RdxDesc.getRecurrenceType() != Phi->getType())
7365       continue;
7366
7367     // If the target would prefer this reduction to happen "in-loop", then we
7368     // want to record it as such.
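    // (For illustration: an in-loop integer add reduction keeps a scalar
    //  accumulator and emits e.g. an llvm.vector.reduce.add inside the loop
    //  body on every iteration, instead of carrying a vector accumulator that
    //  is only reduced to a scalar after the loop.)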
7369 unsigned Opcode = RdxDesc.getOpcode(); 7370 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7371 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7372 TargetTransformInfo::ReductionFlags())) 7373 continue; 7374 7375 // Check that we can correctly put the reductions into the loop, by 7376 // finding the chain of operations that leads from the phi to the loop 7377 // exit value. 7378 SmallVector<Instruction *, 4> ReductionOperations = 7379 RdxDesc.getReductionOpChain(Phi, TheLoop); 7380 bool InLoop = !ReductionOperations.empty(); 7381 if (InLoop) { 7382 InLoopReductionChains[Phi] = ReductionOperations; 7383 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7384 Instruction *LastChain = Phi; 7385 for (auto *I : ReductionOperations) { 7386 InLoopReductionImmediateChains[I] = LastChain; 7387 LastChain = I; 7388 } 7389 } 7390 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7391 << " reduction for phi: " << *Phi << "\n"); 7392 } 7393 } 7394 7395 // TODO: we could return a pair of values that specify the max VF and 7396 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7397 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7398 // doesn't have a cost model that can choose which plan to execute if 7399 // more than one is generated. 7400 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7401 LoopVectorizationCostModel &CM) { 7402 unsigned WidestType; 7403 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7404 return WidestVectorRegBits / WidestType; 7405 } 7406 7407 VectorizationFactor 7408 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7409 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7410 ElementCount VF = UserVF; 7411 // Outer loop handling: They may require CFG and instruction level 7412 // transformations before even evaluating whether vectorization is profitable. 7413 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7414 // the vectorization pipeline. 7415 if (!OrigLoop->isInnermost()) { 7416 // If the user doesn't provide a vectorization factor, determine a 7417 // reasonable one. 7418 if (UserVF.isZero()) { 7419 VF = ElementCount::getFixed(determineVPlanVF( 7420 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7421 .getFixedSize(), 7422 CM)); 7423 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7424 7425 // Make sure we have a VF > 1 for stress testing. 7426 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7427 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7428 << "overriding computed VF.\n"); 7429 VF = ElementCount::getFixed(4); 7430 } 7431 } 7432 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7433 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7434 "VF needs to be a power of two"); 7435 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7436 << "VF " << VF << " to build VPlans.\n"); 7437 buildVPlans(VF, VF); 7438 7439 // For VPlan build stress testing, we bail out after VPlan construction. 7440 if (VPlanBuildStressTest) 7441 return VectorizationFactor::Disabled(); 7442 7443 return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; 7444 } 7445 7446 LLVM_DEBUG( 7447 dbgs() << "LV: Not vectorizing. 
Inner loops aren't supported in the "
7448                "VPlan-native path.\n");
7449   return VectorizationFactor::Disabled();
7450 }
7451
7452 Optional<VectorizationFactor>
7453 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7454   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7455   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7456   if (!MaxFactors) // Cases that should not be vectorized or interleaved.
7457     return None;
7458
7459   // Invalidate interleave groups if all blocks of the loop will be predicated.
7460   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7461       !useMaskedInterleavedAccesses(*TTI)) {
7462     LLVM_DEBUG(
7463         dbgs()
7464         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7465            "which requires masked-interleaved support.\n");
7466     if (CM.InterleaveInfo.invalidateGroups())
7467       // Invalidating interleave groups also requires invalidating all decisions
7468       // based on them, which includes widening decisions and uniform and scalar
7469       // values.
7470       CM.invalidateCostModelingDecisions();
7471   }
7472
7473   ElementCount MaxUserVF =
7474       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7475   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7476   if (!UserVF.isZero() && UserVFIsLegal) {
7477     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7478            "VF needs to be a power of two");
7479     // Collect the instructions (and their associated costs) that will be more
7480     // profitable to scalarize.
7481     if (CM.selectUserVectorizationFactor(UserVF)) {
7482       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7483       CM.collectInLoopReductions();
7484       buildVPlansWithVPRecipes(UserVF, UserVF);
7485       LLVM_DEBUG(printPlans(dbgs()));
7486       return {{UserVF, 0, 0}};
7487     } else
7488       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7489                               "InvalidCost", ORE, OrigLoop);
7490   }
7491
7492   // Populate the set of Vectorization Factor Candidates.
7493   ElementCountSet VFCandidates;
7494   for (auto VF = ElementCount::getFixed(1);
7495        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7496     VFCandidates.insert(VF);
7497   for (auto VF = ElementCount::getScalable(1);
7498        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7499     VFCandidates.insert(VF);
7500
7501   for (const auto &VF : VFCandidates) {
7502     // Collect Uniform and Scalar instructions after vectorization with VF.
7503     CM.collectUniformsAndScalars(VF);
7504
7505     // Collect the instructions (and their associated costs) that will be more
7506     // profitable to scalarize.
7507     if (VF.isVector())
7508       CM.collectInstsToScalarize(VF);
7509   }
7510
7511   CM.collectInLoopReductions();
7512   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7513   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7514
7515   LLVM_DEBUG(printPlans(dbgs()));
7516   if (!MaxFactors.hasVector())
7517     return VectorizationFactor::Disabled();
7518
7519   // Select the optimal vectorization factor.
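  // (For example: with MaxFactors.FixedVF == 8 and MaxFactors.ScalableVF ==
  //  vscale x 4, the candidate set built above is {1, 2, 4, 8, vscale x 1,
  //  vscale x 2, vscale x 4}; the cost model picks the most profitable one.)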
7520 VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates); 7521 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); 7522 return VF; 7523 } 7524 7525 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7526 assert(count_if(VPlans, 7527 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7528 1 && 7529 "Best VF has not a single VPlan."); 7530 7531 for (const VPlanPtr &Plan : VPlans) { 7532 if (Plan->hasVF(VF)) 7533 return *Plan.get(); 7534 } 7535 llvm_unreachable("No plan found!"); 7536 } 7537 7538 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7539 SmallVector<Metadata *, 4> MDs; 7540 // Reserve first location for self reference to the LoopID metadata node. 7541 MDs.push_back(nullptr); 7542 bool IsUnrollMetadata = false; 7543 MDNode *LoopID = L->getLoopID(); 7544 if (LoopID) { 7545 // First find existing loop unrolling disable metadata. 7546 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7547 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7548 if (MD) { 7549 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7550 IsUnrollMetadata = 7551 S && S->getString().startswith("llvm.loop.unroll.disable"); 7552 } 7553 MDs.push_back(LoopID->getOperand(i)); 7554 } 7555 } 7556 7557 if (!IsUnrollMetadata) { 7558 // Add runtime unroll disable metadata. 7559 LLVMContext &Context = L->getHeader()->getContext(); 7560 SmallVector<Metadata *, 1> DisableOperands; 7561 DisableOperands.push_back( 7562 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7563 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7564 MDs.push_back(DisableNode); 7565 MDNode *NewLoopID = MDNode::get(Context, MDs); 7566 // Set operand 0 to refer to the loop id itself. 7567 NewLoopID->replaceOperandWith(0, NewLoopID); 7568 L->setLoopID(NewLoopID); 7569 } 7570 } 7571 7572 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7573 VPlan &BestVPlan, 7574 InnerLoopVectorizer &ILV, 7575 DominatorTree *DT, 7576 bool IsEpilogueVectorization) { 7577 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7578 << '\n'); 7579 7580 // Perform the actual loop transformation. 7581 7582 // 1. Set up the skeleton for vectorization, including vector pre-header and 7583 // middle block. The vector loop is created during VPlan execution. 7584 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7585 Value *CanonicalIVStartValue; 7586 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7587 ILV.createVectorizedLoopSkeleton(); 7588 7589 // Only use noalias metadata when using memory checks guaranteeing no overlap 7590 // across all iterations. 7591 const LoopAccessInfo *LAI = ILV.Legal->getLAI(); 7592 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && 7593 !LAI->getRuntimePointerChecking()->getDiffChecks()) { 7594 7595 // We currently don't use LoopVersioning for the actual loop cloning but we 7596 // still use it to add the noalias metadata. 7597 // TODO: Find a better way to re-use LoopVersioning functionality to add 7598 // metadata. 
7599 State.LVer = std::make_unique<LoopVersioning>( 7600 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, 7601 PSE.getSE()); 7602 State.LVer->prepareNoAliasMetadata(); 7603 } 7604 7605 ILV.collectPoisonGeneratingRecipes(State); 7606 7607 ILV.printDebugTracesAtStart(); 7608 7609 //===------------------------------------------------===// 7610 // 7611 // Notice: any optimization or new instruction that go 7612 // into the code below should also be implemented in 7613 // the cost-model. 7614 // 7615 //===------------------------------------------------===// 7616 7617 // 2. Copy and widen instructions from the old loop into the new loop. 7618 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), 7619 ILV.getOrCreateVectorTripCount(nullptr), 7620 CanonicalIVStartValue, State, 7621 IsEpilogueVectorization); 7622 7623 BestVPlan.execute(&State); 7624 7625 // Keep all loop hints from the original loop on the vector loop (we'll 7626 // replace the vectorizer-specific hints below). 7627 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7628 7629 Optional<MDNode *> VectorizedLoopID = 7630 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7631 LLVMLoopVectorizeFollowupVectorized}); 7632 7633 VPBasicBlock *HeaderVPBB = 7634 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); 7635 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); 7636 if (VectorizedLoopID) 7637 L->setLoopID(VectorizedLoopID.value()); 7638 else { 7639 // Keep all loop hints from the original loop on the vector loop (we'll 7640 // replace the vectorizer-specific hints below). 7641 if (MDNode *LID = OrigLoop->getLoopID()) 7642 L->setLoopID(LID); 7643 7644 LoopVectorizeHints Hints(L, true, *ORE); 7645 Hints.setAlreadyVectorized(); 7646 } 7647 // Disable runtime unrolling when vectorizing the epilogue loop. 7648 if (CanonicalIVStartValue) 7649 AddRuntimeUnrollDisableMetaData(L); 7650 7651 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7652 // predication, updating analyses. 7653 ILV.fixVectorizedLoop(State, BestVPlan); 7654 7655 ILV.printDebugTracesAtEnd(); 7656 } 7657 7658 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7659 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7660 for (const auto &Plan : VPlans) 7661 if (PrintVPlansInDotFormat) 7662 Plan->printDOT(O); 7663 else 7664 Plan->print(O); 7665 } 7666 #endif 7667 7668 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7669 7670 //===--------------------------------------------------------------------===// 7671 // EpilogueVectorizerMainLoop 7672 //===--------------------------------------------------------------------===// 7673 7674 /// This function is partially responsible for generating the control flow 7675 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7676 std::pair<BasicBlock *, Value *> 7677 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7678 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7679 7680 // Workaround! Compute the trip count of the original loop and cache it 7681 // before we start modifying the CFG. This code has a systemic problem 7682 // wherein it tries to run analysis over partially constructed IR; this is 7683 // wrong, and not simply for SCEV. The trip count of the original loop 7684 // simply happens to be prone to hitting this in practice. In theory, we 7685 // can hit the same issue for any SCEV, or ValueTracking query done during 7686 // mutation. See PR49900. 
7687 getOrCreateTripCount(OrigLoop->getLoopPreheader()); 7688 createVectorLoopSkeleton(""); 7689 7690 // Generate the code to check the minimum iteration count of the vector 7691 // epilogue (see below). 7692 EPI.EpilogueIterationCountCheck = 7693 emitIterationCountCheck(LoopScalarPreHeader, true); 7694 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7695 7696 // Generate the code to check any assumptions that we've made for SCEV 7697 // expressions. 7698 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7699 7700 // Generate the code that checks at runtime if arrays overlap. We put the 7701 // checks into a separate block to make the more common case of few elements 7702 // faster. 7703 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7704 7705 // Generate the iteration count check for the main loop, *after* the check 7706 // for the epilogue loop, so that the path-length is shorter for the case 7707 // that goes directly through the vector epilogue. The longer-path length for 7708 // the main loop is compensated for, by the gain from vectorizing the larger 7709 // trip count. Note: the branch will get updated later on when we vectorize 7710 // the epilogue. 7711 EPI.MainLoopIterationCountCheck = 7712 emitIterationCountCheck(LoopScalarPreHeader, false); 7713 7714 // Generate the induction variable. 7715 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7716 7717 // Skip induction resume value creation here because they will be created in 7718 // the second pass. If we created them here, they wouldn't be used anyway, 7719 // because the vplan in the second pass still contains the inductions from the 7720 // original loop. 7721 7722 return {completeLoopSkeleton(OrigLoopID), nullptr}; 7723 } 7724 7725 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7726 LLVM_DEBUG({ 7727 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7728 << "Main Loop VF:" << EPI.MainLoopVF 7729 << ", Main Loop UF:" << EPI.MainLoopUF 7730 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7731 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7732 }); 7733 } 7734 7735 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7736 DEBUG_WITH_TYPE(VerboseDebug, { 7737 dbgs() << "intermediate fn:\n" 7738 << *OrigLoop->getHeader()->getParent() << "\n"; 7739 }); 7740 } 7741 7742 BasicBlock * 7743 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7744 bool ForEpilogue) { 7745 assert(Bypass && "Expected valid bypass basic block."); 7746 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7747 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7748 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 7749 // Reuse existing vector loop preheader for TC checks. 7750 // Note that new preheader block is generated for vector loop. 7751 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7752 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7753 7754 // Generate code to check if the loop's trip count is less than VF * UF of the 7755 // main vector loop. 7756 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 7757 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7758 7759 Value *CheckMinIters = Builder.CreateICmp( 7760 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7761 "min.iters.check"); 7762 7763 if (!ForEpilogue) 7764 TCCheckBlock->setName("vector.main.loop.iter.check"); 7765 7766 // Create new preheader for vector loop. 
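  // (Sketch of the resulting control flow for the !ForEpilogue case:
  //
  //    vector.main.loop.iter.check --(trip count too small)--> Bypass
  //                 |
  //                 v
  //             vector.ph --> main vector loop
  //
  //  The conditional branch is installed by the ReplaceInstWithInst below.)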
7767 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7768 DT, LI, nullptr, "vector.ph"); 7769 7770 if (ForEpilogue) { 7771 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7772 DT->getNode(Bypass)->getIDom()) && 7773 "TC check is expected to dominate Bypass"); 7774 7775 // Update dominator for Bypass & LoopExit. 7776 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7777 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7778 // For loops with multiple exits, there's no edge from the middle block 7779 // to exit blocks (as the epilogue must run) and thus no need to update 7780 // the immediate dominator of the exit blocks. 7781 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7782 7783 LoopBypassBlocks.push_back(TCCheckBlock); 7784 7785 // Save the trip count so we don't have to regenerate it in the 7786 // vec.epilog.iter.check. This is safe to do because the trip count 7787 // generated here dominates the vector epilog iter check. 7788 EPI.TripCount = Count; 7789 } 7790 7791 ReplaceInstWithInst( 7792 TCCheckBlock->getTerminator(), 7793 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7794 7795 return TCCheckBlock; 7796 } 7797 7798 //===--------------------------------------------------------------------===// 7799 // EpilogueVectorizerEpilogueLoop 7800 //===--------------------------------------------------------------------===// 7801 7802 /// This function is partially responsible for generating the control flow 7803 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7804 std::pair<BasicBlock *, Value *> 7805 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7806 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7807 createVectorLoopSkeleton("vec.epilog."); 7808 7809 // Now, compare the remaining count and if there aren't enough iterations to 7810 // execute the vectorized epilogue skip to the scalar part. 7811 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7812 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7813 LoopVectorPreHeader = 7814 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7815 LI, nullptr, "vec.epilog.ph"); 7816 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, 7817 VecEpilogueIterationCountCheck); 7818 7819 // Adjust the control flow taking the state info from the main loop 7820 // vectorization into account. 
7821 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7822 "expected this to be saved from the previous pass."); 7823 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7824 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7825 7826 DT->changeImmediateDominator(LoopVectorPreHeader, 7827 EPI.MainLoopIterationCountCheck); 7828 7829 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7830 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7831 7832 if (EPI.SCEVSafetyCheck) 7833 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7834 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7835 if (EPI.MemSafetyCheck) 7836 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7837 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7838 7839 DT->changeImmediateDominator( 7840 VecEpilogueIterationCountCheck, 7841 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7842 7843 DT->changeImmediateDominator(LoopScalarPreHeader, 7844 EPI.EpilogueIterationCountCheck); 7845 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7846 // If there is an epilogue which must run, there's no edge from the 7847 // middle block to exit blocks and thus no need to update the immediate 7848 // dominator of the exit blocks. 7849 DT->changeImmediateDominator(LoopExitBlock, 7850 EPI.EpilogueIterationCountCheck); 7851 7852 // Keep track of bypass blocks, as they feed start values to the induction 7853 // phis in the scalar loop preheader. 7854 if (EPI.SCEVSafetyCheck) 7855 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7856 if (EPI.MemSafetyCheck) 7857 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7858 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7859 7860 // The vec.epilog.iter.check block may contain Phi nodes from reductions which 7861 // merge control-flow from the latch block and the middle block. Update the 7862 // incoming values here and move the Phi into the preheader. 7863 SmallVector<PHINode *, 4> PhisInBlock; 7864 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) 7865 PhisInBlock.push_back(&Phi); 7866 7867 for (PHINode *Phi : PhisInBlock) { 7868 Phi->replaceIncomingBlockWith( 7869 VecEpilogueIterationCountCheck->getSinglePredecessor(), 7870 VecEpilogueIterationCountCheck); 7871 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); 7872 if (EPI.SCEVSafetyCheck) 7873 Phi->removeIncomingValue(EPI.SCEVSafetyCheck); 7874 if (EPI.MemSafetyCheck) 7875 Phi->removeIncomingValue(EPI.MemSafetyCheck); 7876 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); 7877 } 7878 7879 // Generate a resume induction for the vector epilogue and put it in the 7880 // vector epilogue preheader 7881 Type *IdxTy = Legal->getWidestInductionType(); 7882 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 7883 LoopVectorPreHeader->getFirstNonPHI()); 7884 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 7885 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 7886 EPI.MainLoopIterationCountCheck); 7887 7888 // Generate induction resume values. These variables save the new starting 7889 // indexes for the scalar loop. They are used to test if there are any tail 7890 // iterations left once the vector loop has completed. 
7891   // Note that when the vectorized epilogue is skipped due to the iteration
7892   // count check, then the resume value for the induction variable comes from
7893   // the trip count of the main vector loop, hence passing the AdditionalBypass
7894   // argument.
7895   createInductionResumeValues({VecEpilogueIterationCountCheck,
7896                                EPI.VectorTripCount} /* AdditionalBypass */);
7897
7898   return {completeLoopSkeleton(OrigLoopID), EPResumeVal};
7899 }
7900
7901 BasicBlock *
7902 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7903     BasicBlock *Bypass, BasicBlock *Insert) {
7904
7905   assert(EPI.TripCount &&
7906          "Expected trip count to have been saved in the first pass.");
7907   assert(
7908       (!isa<Instruction>(EPI.TripCount) ||
7909        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7910       "saved trip count does not dominate insertion point.");
7911   Value *TC = EPI.TripCount;
7912   IRBuilder<> Builder(Insert->getTerminator());
7913   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7914
7915   // Generate code to check if the loop's trip count is less than VF * UF of
7916   // the vector epilogue loop.
7917   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
7918       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7919
7920   Value *CheckMinIters =
7921       Builder.CreateICmp(P, Count,
7922                          createStepForVF(Builder, Count->getType(),
7923                                          EPI.EpilogueVF, EPI.EpilogueUF),
7924                          "min.epilog.iters.check");
7925
7926   ReplaceInstWithInst(
7927       Insert->getTerminator(),
7928       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7929
7930   LoopBypassBlocks.push_back(Insert);
7931   return Insert;
7932 }
7933
7934 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7935   LLVM_DEBUG({
7936     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7937            << "Epilogue Loop VF:" << EPI.EpilogueVF
7938            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7939   });
7940 }
7941
7942 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7943   DEBUG_WITH_TYPE(VerboseDebug, {
7944     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7945   });
7946 }
7947
7948 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7949     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7950   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7951   bool PredicateAtRangeStart = Predicate(Range.Start);
7952
7953   for (ElementCount TmpVF = Range.Start * 2;
7954        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
7955     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7956       Range.End = TmpVF;
7957       break;
7958     }
7959
7960   return PredicateAtRangeStart;
7961 }
7962
7963 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7964 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7965 /// of VF's starting at a given VF and extending it as much as possible. Each
7966 /// vectorization decision can potentially shorten this sub-range during
7967 /// buildVPlan().
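///
/// For example (illustrative): with MinVF = 1 and MaxVF = 8, the first
/// sub-range starts at VF = 1; if some decision changes at VF = 4,
/// getDecisionAndClampRange clamps the range so the first VPlan covers
/// {1, 2}, and the next iteration builds a second VPlan covering {4, 8}.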
7968 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 7969 ElementCount MaxVF) { 7970 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 7971 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 7972 VFRange SubRange = {VF, MaxVFPlusOne}; 7973 VPlans.push_back(buildVPlan(SubRange)); 7974 VF = SubRange.End; 7975 } 7976 } 7977 7978 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 7979 VPlanPtr &Plan) { 7980 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7981 7982 // Look for cached value. 7983 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 7984 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 7985 if (ECEntryIt != EdgeMaskCache.end()) 7986 return ECEntryIt->second; 7987 7988 VPValue *SrcMask = createBlockInMask(Src, Plan); 7989 7990 // The terminator has to be a branch inst! 7991 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 7992 assert(BI && "Unexpected terminator found"); 7993 7994 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 7995 return EdgeMaskCache[Edge] = SrcMask; 7996 7997 // If source is an exiting block, we know the exit edge is dynamically dead 7998 // in the vector loop, and thus we don't need to restrict the mask. Avoid 7999 // adding uses of an otherwise potentially dead instruction. 8000 if (OrigLoop->isLoopExiting(Src)) 8001 return EdgeMaskCache[Edge] = SrcMask; 8002 8003 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8004 assert(EdgeMask && "No Edge Mask found for condition"); 8005 8006 if (BI->getSuccessor(0) != Dst) 8007 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8008 8009 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8010 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8011 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8012 // The select version does not introduce new UB if SrcMask is false and 8013 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8014 VPValue *False = Plan->getOrAddVPValue( 8015 ConstantInt::getFalse(BI->getCondition()->getType())); 8016 EdgeMask = 8017 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8018 } 8019 8020 return EdgeMaskCache[Edge] = EdgeMask; 8021 } 8022 8023 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8024 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8025 8026 // Look for cached value. 8027 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8028 if (BCEntryIt != BlockMaskCache.end()) 8029 return BCEntryIt->second; 8030 8031 // All-one mask is modelled as no-mask following the convention for masked 8032 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8033 VPValue *BlockMask = nullptr; 8034 8035 if (OrigLoop->getHeader() == BB) { 8036 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8037 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8038 8039 assert(CM.foldTailByMasking() && "must fold the tail"); 8040 8041 // If we're using the active lane mask for control flow, then we get the 8042 // mask from the active lane mask PHI that is cached in the VPlan. 8043 PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask(); 8044 if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow) 8045 return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi(); 8046 8047 // Introduce the early-exit compare IV <= BTC to form header block mask. 
8048     // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8049     // constructing the desired canonical IV in the header block as its first
8050     // non-phi instructions.
8051
8052     VPBasicBlock *HeaderVPBB =
8053         Plan->getVectorLoopRegion()->getEntryBasicBlock();
8054     auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8055     auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
8056     HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
8057
8058     VPBuilder::InsertPointGuard Guard(Builder);
8059     Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8060     if (EmitGetActiveLaneMask != PredicationStyle::None) {
8061       VPValue *TC = Plan->getOrCreateTripCount();
8062       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
8063                                        nullptr, "active.lane.mask");
8064     } else {
8065       VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8066       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8067     }
8068     return BlockMaskCache[BB] = BlockMask;
8069   }
8070
8071   // This is the block mask. We OR all incoming edges.
8072   for (auto *Predecessor : predecessors(BB)) {
8073     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8074     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8075       return BlockMaskCache[BB] = EdgeMask;
8076
8077     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8078       BlockMask = EdgeMask;
8079       continue;
8080     }
8081
8082     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8083   }
8084
8085   return BlockMaskCache[BB] = BlockMask;
8086 }
8087
8088 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8089                                                 ArrayRef<VPValue *> Operands,
8090                                                 VFRange &Range,
8091                                                 VPlanPtr &Plan) {
8092   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8093          "Must be called with either a load or store");
8094
8095   auto willWiden = [&](ElementCount VF) -> bool {
8096     LoopVectorizationCostModel::InstWidening Decision =
8097         CM.getWideningDecision(I, VF);
8098     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8099            "CM decision should be taken at this point.");
8100     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8101       return true;
8102     if (CM.isScalarAfterVectorization(I, VF) ||
8103         CM.isProfitableToScalarize(I, VF))
8104       return false;
8105     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8106   };
8107
8108   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8109     return nullptr;
8110
8111   VPValue *Mask = nullptr;
8112   if (Legal->isMaskRequired(I))
8113     Mask = createBlockInMask(I->getParent(), Plan);
8114
8115   // Determine if the pointer operand of the access is either consecutive or
8116   // reverse consecutive.
8117   LoopVectorizationCostModel::InstWidening Decision =
8118       CM.getWideningDecision(I, Range.Start);
8119   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8120   bool Consecutive =
8121       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8122
8123   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8124     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8125                                               Consecutive, Reverse);
8126
8127   StoreInst *Store = cast<StoreInst>(I);
8128   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8129                                             Mask, Consecutive, Reverse);
8130 }
8131
8132 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8133 /// insert a recipe to expand the step for the induction recipe.
8134 static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes( 8135 PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, 8136 const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM, 8137 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) { 8138 // Returns true if an instruction \p I should be scalarized instead of 8139 // vectorized for the chosen vectorization factor. 8140 auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { 8141 return CM.isScalarAfterVectorization(I, VF) || 8142 CM.isProfitableToScalarize(I, VF); 8143 }; 8144 8145 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( 8146 [&](ElementCount VF) { 8147 return ShouldScalarizeInstruction(PhiOrTrunc, VF); 8148 }, 8149 Range); 8150 assert(IndDesc.getStartValue() == 8151 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8152 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && 8153 "step must be loop invariant"); 8154 8155 VPValue *Step = 8156 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); 8157 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8158 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI, 8159 !NeedsScalarIVOnly); 8160 } 8161 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8162 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, 8163 !NeedsScalarIVOnly); 8164 } 8165 8166 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( 8167 PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) { 8168 8169 // Check if this is an integer or fp induction. If so, build the recipe that 8170 // produces its scalar and vector values. 8171 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8172 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan, 8173 *PSE.getSE(), *OrigLoop, Range); 8174 8175 // Check if this is pointer induction. If so, build the recipe for it. 8176 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) 8177 return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II, 8178 *PSE.getSE()); 8179 return nullptr; 8180 } 8181 8182 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8183 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) { 8184 // Optimize the special case where the source is a constant integer 8185 // induction variable. Notice that we can only optimize the 'trunc' case 8186 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8187 // (c) other casts depend on pointer size. 8188 8189 // Determine whether \p K is a truncation based on an induction variable that 8190 // can be optimized. 
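  // (E.g., illustrative: for an i64 induction %iv with a use
  //    %iv.trunc = trunc i64 %iv to i32
  //  it is usually cheaper to widen the induction directly at i32 than to
  //  widen %iv to <VF x i64> and truncate every element.)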
8191 auto isOptimizableIVTruncate = 8192 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8193 return [=](ElementCount VF) -> bool { 8194 return CM.isOptimizableIVTruncate(K, VF); 8195 }; 8196 }; 8197 8198 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8199 isOptimizableIVTruncate(I), Range)) { 8200 8201 auto *Phi = cast<PHINode>(I->getOperand(0)); 8202 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8203 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8204 return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan, 8205 *PSE.getSE(), *OrigLoop, Range); 8206 } 8207 return nullptr; 8208 } 8209 8210 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8211 ArrayRef<VPValue *> Operands, 8212 VPlanPtr &Plan) { 8213 // If all incoming values are equal, the incoming VPValue can be used directly 8214 // instead of creating a new VPBlendRecipe. 8215 VPValue *FirstIncoming = Operands[0]; 8216 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8217 return FirstIncoming == Inc; 8218 })) { 8219 return Operands[0]; 8220 } 8221 8222 unsigned NumIncoming = Phi->getNumIncomingValues(); 8223 // For in-loop reductions, we do not need to create an additional select. 8224 VPValue *InLoopVal = nullptr; 8225 for (unsigned In = 0; In < NumIncoming; In++) { 8226 PHINode *PhiOp = 8227 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue()); 8228 if (PhiOp && CM.isInLoopReduction(PhiOp)) { 8229 assert(!InLoopVal && "Found more than one in-loop reduction!"); 8230 InLoopVal = Operands[In]; 8231 } 8232 } 8233 8234 assert((!InLoopVal || NumIncoming == 2) && 8235 "Found an in-loop reduction for PHI with unexpected number of " 8236 "incoming values"); 8237 if (InLoopVal) 8238 return Operands[Operands[0] == InLoopVal ? 1 : 0]; 8239 8240 // We know that all PHIs in non-header blocks are converted into selects, so 8241 // we don't have to worry about the insertion order and we can just use the 8242 // builder. At this point we generate the predication tree. There may be 8243 // duplications since this is a simple recursive scan, but future 8244 // optimizations will clean it up. 
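  // The blend recipe created below takes its operands as (value, edge mask)
  // pairs in incoming-value order, i.e. (V0, M0, V1, M1, ...); the mask is
  // only omitted when there is a single incoming value and its edge mask is
  // null.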
8245   SmallVector<VPValue *, 2> OperandsWithMask;
8246
8247   for (unsigned In = 0; In < NumIncoming; In++) {
8248     VPValue *EdgeMask =
8249         createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8250     assert((EdgeMask || NumIncoming == 1) &&
8251            "Multiple predecessors with one having a full mask");
8252     OperandsWithMask.push_back(Operands[In]);
8253     if (EdgeMask)
8254       OperandsWithMask.push_back(EdgeMask);
8255   }
8256   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8257 }
8258
8259 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8260                                                    ArrayRef<VPValue *> Operands,
8261                                                    VFRange &Range) const {
8262
8263   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8264       [this, CI](ElementCount VF) {
8265         return CM.isScalarWithPredication(CI, VF);
8266       },
8267       Range);
8268
8269   if (IsPredicated)
8270     return nullptr;
8271
8272   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8273   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8274              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8275              ID == Intrinsic::pseudoprobe ||
8276              ID == Intrinsic::experimental_noalias_scope_decl))
8277     return nullptr;
8278
8279   auto willWiden = [&](ElementCount VF) -> bool {
8280     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8281     // The call may be scalarized depending on the VF. NeedToScalarize, set by
8282     // getVectorCallCost, tells us whether a vectorized library call would
8283     // have to be scalarized for this VF. Widen if a vector intrinsic exists
8284     // and is no more expensive than the library call, or if no scalarization is needed.
8285     bool NeedToScalarize = false;
8286     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8287     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8288     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8289     return UseVectorIntrinsic || !NeedToScalarize;
8290   };
8291
8292   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8293     return nullptr;
8294
8295   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8296   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8297 }
8298
8299 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8300   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8301          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8302   // The instruction should be widened, unless it is scalar after
8303   // vectorization, scalarization is profitable, or it is predicated.
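  // Note: getDecisionAndClampRange evaluates the predicate at Range.Start and
  // clamps Range.End so that the same decision holds for every VF remaining in
  // the range.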
8304 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8305 return CM.isScalarAfterVectorization(I, VF) || 8306 CM.isProfitableToScalarize(I, VF) || 8307 CM.isScalarWithPredication(I, VF); 8308 }; 8309 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8310 Range); 8311 } 8312 8313 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8314 ArrayRef<VPValue *> Operands) const { 8315 auto IsVectorizableOpcode = [](unsigned Opcode) { 8316 switch (Opcode) { 8317 case Instruction::Add: 8318 case Instruction::And: 8319 case Instruction::AShr: 8320 case Instruction::BitCast: 8321 case Instruction::FAdd: 8322 case Instruction::FCmp: 8323 case Instruction::FDiv: 8324 case Instruction::FMul: 8325 case Instruction::FNeg: 8326 case Instruction::FPExt: 8327 case Instruction::FPToSI: 8328 case Instruction::FPToUI: 8329 case Instruction::FPTrunc: 8330 case Instruction::FRem: 8331 case Instruction::FSub: 8332 case Instruction::ICmp: 8333 case Instruction::IntToPtr: 8334 case Instruction::LShr: 8335 case Instruction::Mul: 8336 case Instruction::Or: 8337 case Instruction::PtrToInt: 8338 case Instruction::SDiv: 8339 case Instruction::Select: 8340 case Instruction::SExt: 8341 case Instruction::Shl: 8342 case Instruction::SIToFP: 8343 case Instruction::SRem: 8344 case Instruction::Sub: 8345 case Instruction::Trunc: 8346 case Instruction::UDiv: 8347 case Instruction::UIToFP: 8348 case Instruction::URem: 8349 case Instruction::Xor: 8350 case Instruction::ZExt: 8351 case Instruction::Freeze: 8352 return true; 8353 } 8354 return false; 8355 }; 8356 8357 if (!IsVectorizableOpcode(I->getOpcode())) 8358 return nullptr; 8359 8360 // Success: widen this instruction. 8361 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8362 } 8363 8364 void VPRecipeBuilder::fixHeaderPhis() { 8365 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8366 for (VPHeaderPHIRecipe *R : PhisToFix) { 8367 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8368 VPRecipeBase *IncR = 8369 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8370 R->addOperand(IncR->getVPSingleValue()); 8371 } 8372 } 8373 8374 VPBasicBlock *VPRecipeBuilder::handleReplication( 8375 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8376 VPlanPtr &Plan) { 8377 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8378 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8379 Range); 8380 8381 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8382 [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, 8383 Range); 8384 8385 // Even if the instruction is not marked as uniform, there are certain 8386 // intrinsic calls that can be effectively treated as such, so we check for 8387 // them here. Conservatively, we only do this for scalable vectors, since 8388 // for fixed-width VFs we can always fall back on full scalarization. 8389 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8390 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8391 case Intrinsic::assume: 8392 case Intrinsic::lifetime_start: 8393 case Intrinsic::lifetime_end: 8394 // For scalable vectors if one of the operands is variant then we still 8395 // want to mark as uniform, which will generate one instruction for just 8396 // the first lane of the vector. We can't scalarize the call in the same 8397 // way as for fixed-width vectors because we don't know how many lanes 8398 // there are. 
8399       //
8400       // The reasons for doing it this way for scalable vectors are:
8401       // 1. For the assume intrinsic, generating the instruction for the first
8402       //    lane is still better than not generating any at all. For
8403       //    example, the input may be a splat across all lanes.
8404       // 2. For the lifetime start/end intrinsics the pointer operand only
8405       //    does anything useful when the input comes from a stack object,
8406       //    which suggests it should always be uniform. For non-stack objects
8407       //    the effect is to poison the object, which still allows us to
8408       //    remove the call.
8409       IsUniform = true;
8410       break;
8411     default:
8412       break;
8413     }
8414   }
8415
8416   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8417                                        IsUniform, IsPredicated);
8418   setRecipe(I, Recipe);
8419   Plan->addVPValue(I, Recipe);
8420
8421   // Find if I uses a predicated instruction. If so, it will use its scalar
8422   // value. Avoid hoisting the insert-element which packs the scalar value into
8423   // a vector value, as that happens iff all users use the vector value.
8424   for (VPValue *Op : Recipe->operands()) {
8425     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8426     if (!PredR)
8427       continue;
8428     auto *RepR =
8429         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8430     assert(RepR->isPredicated() &&
8431            "expected Replicate recipe to be predicated");
8432     RepR->setAlsoPack(false);
8433   }
8434
8435   // Finalize the recipe for Instr, handling the non-predicated case first.
8436   if (!IsPredicated) {
8437     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8438     VPBB->appendRecipe(Recipe);
8439     return VPBB;
8440   }
8441   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8442
8443   VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8444   assert(SingleSucc && "VPBB must have a single successor when handling "
8445                        "predicated replication.");
8446   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8447   // Record predicated instructions for above packing optimizations.
8448   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8449   VPBlockUtils::insertBlockAfter(Region, VPBB);
8450   auto *RegSucc = new VPBasicBlock();
8451   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8452   VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8453   return RegSucc;
8454 }
8455
8456 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(
8457     Instruction *Instr, VPReplicateRecipe *PredRecipe, VPlanPtr &Plan) {
8458   // Instructions marked for predication are replicated and placed under an
8459   // if-then construct to prevent side-effects.
8460
8461   // Generate recipes to compute the block mask for this region.
8462   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8463
8464   // Build the triangular if-then region.
8465   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8466   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8467   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8468   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8469   auto *PHIRecipe = Instr->getType()->isVoidTy()
8470                         ?
nullptr 8471 : new VPPredInstPHIRecipe(PredRecipe); 8472 if (PHIRecipe) { 8473 Plan->removeVPValueFor(Instr); 8474 Plan->addVPValue(Instr, PHIRecipe); 8475 } 8476 auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8477 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8478 VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true); 8479 8480 // Note: first set Entry as region entry and then connect successors starting 8481 // from it in order, to propagate the "parent" of each VPBasicBlock. 8482 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry); 8483 VPBlockUtils::connectBlocks(Pred, Exiting); 8484 8485 return Region; 8486 } 8487 8488 VPRecipeOrVPValueTy 8489 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8490 ArrayRef<VPValue *> Operands, 8491 VFRange &Range, VPlanPtr &Plan) { 8492 // First, check for specific widening recipes that deal with inductions, Phi 8493 // nodes, calls and memory operations. 8494 VPRecipeBase *Recipe; 8495 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8496 if (Phi->getParent() != OrigLoop->getHeader()) 8497 return tryToBlend(Phi, Operands, Plan); 8498 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) 8499 return toVPRecipeResult(Recipe); 8500 8501 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8502 assert((Legal->isReductionVariable(Phi) || 8503 Legal->isFirstOrderRecurrence(Phi)) && 8504 "can only widen reductions and first-order recurrences here"); 8505 VPValue *StartV = Operands[0]; 8506 if (Legal->isReductionVariable(Phi)) { 8507 const RecurrenceDescriptor &RdxDesc = 8508 Legal->getReductionVars().find(Phi)->second; 8509 assert(RdxDesc.getRecurrenceStartValue() == 8510 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8511 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8512 CM.isInLoopReduction(Phi), 8513 CM.useOrderedReductions(RdxDesc)); 8514 } else { 8515 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8516 } 8517 8518 // Record the incoming value from the backedge, so we can add the incoming 8519 // value from the backedge after all recipes have been created. 8520 recordRecipeOf(cast<Instruction>( 8521 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8522 PhisToFix.push_back(PhiRecipe); 8523 return toVPRecipeResult(PhiRecipe); 8524 } 8525 8526 if (isa<TruncInst>(Instr) && 8527 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8528 Range, *Plan))) 8529 return toVPRecipeResult(Recipe); 8530 8531 // All widen recipes below deal only with VF > 1. 
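  // If the range was clamped to the scalar VF only, no widening recipe is
  // created here; the caller falls back to handleReplication() instead.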
8532   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8533           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8534     return nullptr;
8535
8536   if (auto *CI = dyn_cast<CallInst>(Instr))
8537     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8538
8539   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8540     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8541
8542   if (!shouldWiden(Instr, Range))
8543     return nullptr;
8544
8545   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8546     return toVPRecipeResult(new VPWidenGEPRecipe(
8547         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8548
8549   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8550     bool InvariantCond =
8551         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8552     return toVPRecipeResult(new VPWidenSelectRecipe(
8553         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8554   }
8555
8556   return toVPRecipeResult(tryToWiden(Instr, Operands));
8557 }
8558
8559 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8560                                                         ElementCount MaxVF) {
8561   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8562
8563   // Add assume instructions we need to drop to DeadInstructions, to prevent
8564   // them from being added to the VPlan.
8565   // TODO: We only need to drop assumes in blocks that get flattened. If the
8566   // control flow is preserved, we should keep them.
8567   SmallPtrSet<Instruction *, 4> DeadInstructions;
8568   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8569   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8570
8571   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8572   // Dead instructions do not need sinking. Remove them from SinkAfter.
8573   for (Instruction *I : DeadInstructions)
8574     SinkAfter.erase(I);
8575
8576   // Cannot sink instructions after dead instructions (there won't be any
8577   // recipes for them). Instead, find the first non-dead previous instruction.
8578   for (auto &P : Legal->getSinkAfter()) {
8579     Instruction *SinkTarget = P.second;
8580     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8581     (void)FirstInst;
8582     while (DeadInstructions.contains(SinkTarget)) {
8583       assert(
8584           SinkTarget != FirstInst &&
8585           "Must find a live instruction (at least the one feeding the "
8586           "first-order recurrence PHI) before reaching beginning of the block");
8587       SinkTarget = SinkTarget->getPrevNode();
8588       assert(SinkTarget != P.first &&
8589              "sink source equals target, no sinking required");
8590     }
8591     P.second = SinkTarget;
8592   }
8593
8594   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8595   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8596     VFRange SubRange = {VF, MaxVFPlusOne};
8597     VPlans.push_back(
8598         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8599     VF = SubRange.End;
8600   }
8601 }
8602
8603 // Add the necessary canonical IV and branch recipes required to control the
8604 // loop.
8605 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8606                                   bool HasNUW,
8607                                   bool UseLaneMaskForLoopControlFlow) {
8608   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8609   auto *StartV = Plan.getOrAddVPValue(StartIdx);
8610
8611   // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
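  // Conceptually this models the scalar loop control of the vector loop, e.g.
  // (illustrative IR):
  //   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]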
8612 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8613 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8614 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8615 Header->insert(CanonicalIVPHI, Header->begin()); 8616 8617 // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar 8618 // IV by VF * UF. 8619 auto *CanonicalIVIncrement = 8620 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW 8621 : VPInstruction::CanonicalIVIncrement, 8622 {CanonicalIVPHI}, DL, "index.next"); 8623 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8624 8625 VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); 8626 EB->appendRecipe(CanonicalIVIncrement); 8627 8628 if (UseLaneMaskForLoopControlFlow) { 8629 // Create the active lane mask instruction in the vplan preheader. 8630 VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock(); 8631 8632 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since 8633 // we have to take unrolling into account. Each part needs to start at 8634 // Part * VF 8635 auto *CanonicalIVIncrementParts = 8636 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW 8637 : VPInstruction::CanonicalIVIncrementForPart, 8638 {StartV}, DL, "index.part.next"); 8639 Preheader->appendRecipe(CanonicalIVIncrementParts); 8640 8641 // Create the ActiveLaneMask instruction using the correct start values. 8642 VPValue *TC = Plan.getOrCreateTripCount(); 8643 auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask, 8644 {CanonicalIVIncrementParts, TC}, DL, 8645 "active.lane.mask.entry"); 8646 Preheader->appendRecipe(EntryALM); 8647 8648 // Now create the ActiveLaneMaskPhi recipe in the main loop using the 8649 // preheader ActiveLaneMask instruction. 8650 auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); 8651 Header->insert(LaneMaskPhi, Header->getFirstNonPhi()); 8652 8653 // Create the active lane mask for the next iteration of the loop. 8654 CanonicalIVIncrementParts = 8655 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW 8656 : VPInstruction::CanonicalIVIncrementForPart, 8657 {CanonicalIVIncrement}, DL); 8658 EB->appendRecipe(CanonicalIVIncrementParts); 8659 8660 auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, 8661 {CanonicalIVIncrementParts, TC}, DL, 8662 "active.lane.mask.next"); 8663 EB->appendRecipe(ALM); 8664 LaneMaskPhi->addOperand(ALM); 8665 8666 // We have to invert the mask here because a true condition means jumping 8667 // to the exit block. 8668 auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL); 8669 EB->appendRecipe(NotMask); 8670 8671 VPInstruction *BranchBack = 8672 new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL); 8673 EB->appendRecipe(BranchBack); 8674 } else { 8675 // Add the BranchOnCount VPInstruction to the latch. 8676 VPInstruction *BranchBack = new VPInstruction( 8677 VPInstruction::BranchOnCount, 8678 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8679 EB->appendRecipe(BranchBack); 8680 } 8681 } 8682 8683 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the 8684 // original exit block. 8685 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, 8686 VPBasicBlock *MiddleVPBB, Loop *OrigLoop, 8687 VPlan &Plan) { 8688 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); 8689 BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); 8690 // Only handle single-exit loops with unique exit blocks for now. 
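  // For example (illustrative), an LCSSA phi in the exit block such as
  //   %res.lcssa = phi i32 [ %res, %exiting ]
  // becomes a VPLiveOut fed by the VPValue modeling %res.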
8691 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) 8692 return; 8693 8694 // Introduce VPUsers modeling the exit values. 8695 for (PHINode &ExitPhi : ExitBB->phis()) { 8696 Value *IncomingValue = 8697 ExitPhi.getIncomingValueForBlock(ExitingBB); 8698 VPValue *V = Plan.getOrAddVPValue(IncomingValue, true); 8699 Plan.addLiveOut(&ExitPhi, V); 8700 } 8701 } 8702 8703 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8704 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8705 const MapVector<Instruction *, Instruction *> &SinkAfter) { 8706 8707 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8708 8709 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8710 8711 // --------------------------------------------------------------------------- 8712 // Pre-construction: record ingredients whose recipes we'll need to further 8713 // process after constructing the initial VPlan. 8714 // --------------------------------------------------------------------------- 8715 8716 // Mark instructions we'll need to sink later and their targets as 8717 // ingredients whose recipe we'll need to record. 8718 for (auto &Entry : SinkAfter) { 8719 RecipeBuilder.recordRecipeOf(Entry.first); 8720 RecipeBuilder.recordRecipeOf(Entry.second); 8721 } 8722 for (auto &Reduction : CM.getInLoopReductionChains()) { 8723 PHINode *Phi = Reduction.first; 8724 RecurKind Kind = 8725 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); 8726 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8727 8728 RecipeBuilder.recordRecipeOf(Phi); 8729 for (auto &R : ReductionOperations) { 8730 RecipeBuilder.recordRecipeOf(R); 8731 // For min/max reductions, where we have a pair of icmp/select, we also 8732 // need to record the ICmp recipe, so it can be removed later. 8733 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 8734 "Only min/max recurrences allowed for inloop reductions"); 8735 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 8736 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8737 } 8738 } 8739 8740 // For each interleave group which is relevant for this (possibly trimmed) 8741 // Range, add it to the set of groups to be later applied to the VPlan and add 8742 // placeholders for its members' Recipes which we'll be replacing with a 8743 // single VPInterleaveRecipe. 8744 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8745 auto applyIG = [IG, this](ElementCount VF) -> bool { 8746 return (VF.isVector() && // Query is illegal for VF == 1 8747 CM.getWideningDecision(IG->getInsertPos(), VF) == 8748 LoopVectorizationCostModel::CM_Interleave); 8749 }; 8750 if (!getDecisionAndClampRange(applyIG, Range)) 8751 continue; 8752 InterleaveGroups.insert(IG); 8753 for (unsigned i = 0; i < IG->getFactor(); i++) 8754 if (Instruction *Member = IG->getMember(i)) 8755 RecipeBuilder.recordRecipeOf(Member); 8756 }; 8757 8758 // --------------------------------------------------------------------------- 8759 // Build initial VPlan: Scan the body of the loop in a topological order to 8760 // visit each basic block after having visited its predecessor basic blocks. 8761 // --------------------------------------------------------------------------- 8762 8763 // Create initial VPlan skeleton, starting with a block for the pre-header, 8764 // followed by a region for the vector loop, followed by the middle block. 
The 8765 // skeleton vector loop region contains a header and latch block. 8766 VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); 8767 auto Plan = std::make_unique<VPlan>(Preheader); 8768 8769 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); 8770 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 8771 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 8772 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); 8773 VPBlockUtils::insertBlockAfter(TopRegion, Preheader); 8774 VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); 8775 VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); 8776 8777 Instruction *DLInst = 8778 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 8779 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), 8780 DLInst ? DLInst->getDebugLoc() : DebugLoc(), 8781 !CM.foldTailByMasking(), 8782 CM.useActiveLaneMaskForControlFlow()); 8783 8784 // Scan the body of the loop in a topological order to visit each basic block 8785 // after having visited its predecessor basic blocks. 8786 LoopBlocksDFS DFS(OrigLoop); 8787 DFS.perform(LI); 8788 8789 VPBasicBlock *VPBB = HeaderVPBB; 8790 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 8791 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8792 // Relevant instructions from basic block BB will be grouped into VPRecipe 8793 // ingredients and fill a new VPBasicBlock. 8794 unsigned VPBBsForBB = 0; 8795 if (VPBB != HeaderVPBB) 8796 VPBB->setName(BB->getName()); 8797 Builder.setInsertPoint(VPBB); 8798 8799 // Introduce each ingredient into VPlan. 8800 // TODO: Model and preserve debug intrinsics in VPlan. 8801 for (Instruction &I : BB->instructionsWithoutDebug()) { 8802 Instruction *Instr = &I; 8803 8804 // First filter out irrelevant instructions, to ensure no recipes are 8805 // built for them. 8806 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 8807 continue; 8808 8809 SmallVector<VPValue *, 4> Operands; 8810 auto *Phi = dyn_cast<PHINode>(Instr); 8811 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 8812 Operands.push_back(Plan->getOrAddVPValue( 8813 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 8814 } else { 8815 auto OpRange = Plan->mapToVPValues(Instr->operands()); 8816 Operands = {OpRange.begin(), OpRange.end()}; 8817 } 8818 8819 // Invariant stores inside loop will be deleted and a single store 8820 // with the final reduction value will be added to the exit block 8821 StoreInst *SI; 8822 if ((SI = dyn_cast<StoreInst>(&I)) && 8823 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 8824 continue; 8825 8826 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 8827 Instr, Operands, Range, Plan)) { 8828 // If Instr can be simplified to an existing VPValue, use it. 8829 if (RecipeOrValue.is<VPValue *>()) { 8830 auto *VPV = RecipeOrValue.get<VPValue *>(); 8831 Plan->addVPValue(Instr, VPV); 8832 // If the re-used value is a recipe, register the recipe for the 8833 // instruction, in case the recipe for Instr needs to be recorded. 8834 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 8835 RecipeBuilder.setRecipe(Instr, R); 8836 continue; 8837 } 8838 // Otherwise, add the new recipe. 
8839 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 8840 for (auto *Def : Recipe->definedValues()) { 8841 auto *UV = Def->getUnderlyingValue(); 8842 Plan->addVPValue(UV, Def); 8843 } 8844 8845 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 8846 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 8847 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 8848 // of the header block. That can happen for truncates of induction 8849 // variables. Those recipes are moved to the phi section of the header 8850 // block after applying SinkAfter, which relies on the original 8851 // position of the trunc. 8852 assert(isa<TruncInst>(Instr)); 8853 InductionsToMove.push_back( 8854 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 8855 } 8856 RecipeBuilder.setRecipe(Instr, Recipe); 8857 VPBB->appendRecipe(Recipe); 8858 continue; 8859 } 8860 8861 // Otherwise, if all widening options failed, Instruction is to be 8862 // replicated. This may create a successor for VPBB. 8863 VPBasicBlock *NextVPBB = 8864 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 8865 if (NextVPBB != VPBB) { 8866 VPBB = NextVPBB; 8867 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 8868 : ""); 8869 } 8870 } 8871 8872 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 8873 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 8874 } 8875 8876 HeaderVPBB->setName("vector.body"); 8877 8878 // Fold the last, empty block into its predecessor. 8879 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 8880 assert(VPBB && "expected to fold last (empty) block"); 8881 // After here, VPBB should not be used. 8882 VPBB = nullptr; 8883 8884 addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); 8885 8886 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 8887 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 8888 "entry block must be set to a VPRegionBlock having a non-empty entry " 8889 "VPBasicBlock"); 8890 RecipeBuilder.fixHeaderPhis(); 8891 8892 // --------------------------------------------------------------------------- 8893 // Transform initial VPlan: Apply previously taken decisions, in order, to 8894 // bring the VPlan to its final state. 8895 // --------------------------------------------------------------------------- 8896 8897 // Apply Sink-After legal constraints. 8898 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 8899 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 8900 if (Region && Region->isReplicator()) { 8901 assert(Region->getNumSuccessors() == 1 && 8902 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 8903 assert(R->getParent()->size() == 1 && 8904 "A recipe in an original replicator region must be the only " 8905 "recipe in its block"); 8906 return Region; 8907 } 8908 return nullptr; 8909 }; 8910 for (auto &Entry : SinkAfter) { 8911 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 8912 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 8913 8914 auto *TargetRegion = GetReplicateRegion(Target); 8915 auto *SinkRegion = GetReplicateRegion(Sink); 8916 if (!SinkRegion) { 8917 // If the sink source is not a replicate region, sink the recipe directly. 8918 if (TargetRegion) { 8919 // The target is in a replication region, make sure to move Sink to 8920 // the block after it, not into the replication region itself. 
8921 VPBasicBlock *NextBlock = 8922 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 8923 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8924 } else 8925 Sink->moveAfter(Target); 8926 continue; 8927 } 8928 8929 // The sink source is in a replicate region. Unhook the region from the CFG. 8930 auto *SinkPred = SinkRegion->getSinglePredecessor(); 8931 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 8932 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 8933 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 8934 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 8935 8936 if (TargetRegion) { 8937 // The target recipe is also in a replicate region, move the sink region 8938 // after the target region. 8939 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 8940 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 8941 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 8942 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 8943 } else { 8944 // The sink source is in a replicate region, we need to move the whole 8945 // replicate region, which should only contain a single recipe in the 8946 // main block. 8947 auto *SplitBlock = 8948 Target->getParent()->splitAt(std::next(Target->getIterator())); 8949 8950 auto *SplitPred = SplitBlock->getSinglePredecessor(); 8951 8952 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 8953 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 8954 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 8955 } 8956 } 8957 8958 VPlanTransforms::removeRedundantCanonicalIVs(*Plan); 8959 VPlanTransforms::removeRedundantInductionCasts(*Plan); 8960 8961 // Now that sink-after is done, move induction recipes for optimized truncates 8962 // to the phi section of the header block. 8963 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 8964 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 8965 8966 // Adjust the recipes for any inloop reductions. 8967 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan, 8968 RecipeBuilder, Range.Start); 8969 8970 // Introduce a recipe to combine the incoming and previous values of a 8971 // first-order recurrence. 8972 for (VPRecipeBase &R : 8973 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 8974 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 8975 if (!RecurPhi) 8976 continue; 8977 8978 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 8979 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 8980 auto *Region = GetReplicateRegion(PrevRecipe); 8981 if (Region) 8982 InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor()); 8983 if (!InsertBlock) { 8984 InsertBlock = new VPBasicBlock(Region->getName() + ".succ"); 8985 VPBlockUtils::insertBlockAfter(InsertBlock, Region); 8986 } 8987 if (Region || PrevRecipe->isPhi()) 8988 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 8989 else 8990 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 8991 8992 auto *RecurSplice = cast<VPInstruction>( 8993 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 8994 {RecurPhi, RecurPhi->getBackedgeValue()})); 8995 8996 RecurPhi->replaceAllUsesWith(RecurSplice); 8997 // Set the first operand of RecurSplice to RecurPhi again, after replacing 8998 // all users. 
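  // (The replaceAllUsesWith above also replaced the use of RecurPhi inside
  // RecurSplice itself, which would otherwise leave a self-reference.)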
8999 RecurSplice->setOperand(0, RecurPhi); 9000 } 9001 9002 // Interleave memory: for each Interleave Group we marked earlier as relevant 9003 // for this VPlan, replace the Recipes widening its memory instructions with a 9004 // single VPInterleaveRecipe at its insertion point. 9005 for (auto IG : InterleaveGroups) { 9006 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9007 RecipeBuilder.getRecipe(IG->getInsertPos())); 9008 SmallVector<VPValue *, 4> StoredValues; 9009 for (unsigned i = 0; i < IG->getFactor(); ++i) 9010 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9011 auto *StoreR = 9012 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9013 StoredValues.push_back(StoreR->getStoredValue()); 9014 } 9015 9016 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9017 Recipe->getMask()); 9018 VPIG->insertBefore(Recipe); 9019 unsigned J = 0; 9020 for (unsigned i = 0; i < IG->getFactor(); ++i) 9021 if (Instruction *Member = IG->getMember(i)) { 9022 if (!Member->getType()->isVoidTy()) { 9023 VPValue *OriginalV = Plan->getVPValue(Member); 9024 Plan->removeVPValueFor(Member); 9025 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9026 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9027 J++; 9028 } 9029 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9030 } 9031 } 9032 9033 std::string PlanName; 9034 raw_string_ostream RSO(PlanName); 9035 ElementCount VF = Range.Start; 9036 Plan->addVF(VF); 9037 RSO << "Initial VPlan for VF={" << VF; 9038 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9039 Plan->addVF(VF); 9040 RSO << "," << VF; 9041 } 9042 RSO << "},UF>=1"; 9043 RSO.flush(); 9044 Plan->setName(PlanName); 9045 9046 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9047 // in ways that accessing values using original IR values is incorrect. 9048 Plan->disableValue2VPValue(); 9049 9050 VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); 9051 VPlanTransforms::sinkScalarOperands(*Plan); 9052 VPlanTransforms::removeDeadRecipes(*Plan); 9053 VPlanTransforms::mergeReplicateRegions(*Plan); 9054 VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); 9055 9056 // Fold Exit block into its predecessor if possible. 9057 // TODO: Fold block earlier once all VPlan transforms properly maintain a 9058 // VPBasicBlock as exit. 9059 VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting()); 9060 9061 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 9062 return Plan; 9063 } 9064 9065 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9066 // Outer loop handling: They may require CFG and instruction level 9067 // transformations before even evaluating whether vectorization is profitable. 9068 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9069 // the vectorization pipeline. 
9070   assert(!OrigLoop->isInnermost());
9071   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9072
9073   // Create new empty VPlan
9074   auto Plan = std::make_unique<VPlan>();
9075
9076   // Build hierarchical CFG
9077   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9078   HCFGBuilder.buildHierarchicalCFG();
9079
9080   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9081        VF *= 2)
9082     Plan->addVF(VF);
9083
9084   SmallPtrSet<Instruction *, 1> DeadInstructions;
9085   VPlanTransforms::VPInstructionsToVPRecipes(
9086       OrigLoop, Plan,
9087       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9088       DeadInstructions, *PSE.getSE());
9089
9090   // Remove the existing terminator of the exiting block of the top-most region.
9091   // A BranchOnCount will be added instead when adding the canonical IV recipes.
9092   auto *Term =
9093       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9094   Term->eraseFromParent();
9095
9096   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9097                         true, CM.useActiveLaneMaskForControlFlow());
9098   return Plan;
9099 }
9100
9101 // Adjust the recipes for reductions. For in-loop reductions the chain of
9102 // instructions leading from the loop exit instr to the phi needs to be
9103 // converted to reductions, with one operand being vector and the other being
9104 // the scalar reduction chain. For other reductions, a select is introduced
9105 // between the phi and live-out recipes when folding the tail.
9106 void LoopVectorizationPlanner::adjustRecipesForReductions(
9107     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9108     ElementCount MinVF) {
9109   for (auto &Reduction : CM.getInLoopReductionChains()) {
9110     PHINode *Phi = Reduction.first;
9111     const RecurrenceDescriptor &RdxDesc =
9112         Legal->getReductionVars().find(Phi)->second;
9113     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9114
9115     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9116       continue;
9117
9118     // ReductionOperations are ordered top-down from the phi's use to the
9119     // LoopExitValue. We keep track of the previous item (the Chain) to tell
9120     // which of the two operands will remain scalar and which will be reduced.
9121     // For minmax the chain will be the select instructions.
9122     Instruction *Chain = Phi;
9123     for (Instruction *R : ReductionOperations) {
9124       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9125       RecurKind Kind = RdxDesc.getRecurrenceKind();
9126
9127       VPValue *ChainOp = Plan->getVPValue(Chain);
9128       unsigned FirstOpId;
9129       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9130              "Only min/max recurrences allowed for inloop reductions");
9131       // Recognize a call to the llvm.fmuladd intrinsic.
9132       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9133       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9134              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9135       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9136         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9137                "Expected to replace a VPWidenSelectSC");
9138         FirstOpId = 1;
9139       } else {
9140         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9141                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9142                "Expected to replace a VPWidenSC");
9143         FirstOpId = 0;
9144       }
9145       unsigned VecOpId =
9146           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9147       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9148
9149       auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
9150                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9151                          : nullptr;
9152
9153       if (IsFMulAdd) {
9154         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9155         // need to create an fmul recipe to use as the vector operand for the
9156         // fadd reduction.
9157         VPInstruction *FMulRecipe = new VPInstruction(
9158             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9159         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9160         WidenRecipe->getParent()->insert(FMulRecipe,
9161                                          WidenRecipe->getIterator());
9162         VecOp = FMulRecipe;
9163       }
9164       VPReductionRecipe *RedRecipe =
9165           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9166       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9167       Plan->removeVPValueFor(R);
9168       Plan->addVPValue(R, RedRecipe);
9169       // Append the recipe to the end of the VPBasicBlock because we need to
9170       // ensure that it comes after all of its inputs, including CondOp.
9171       WidenRecipe->getParent()->appendRecipe(RedRecipe);
9172       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9173       WidenRecipe->eraseFromParent();
9174
9175       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9176         VPRecipeBase *CompareRecipe =
9177             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9178         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9179                "Expected to replace a VPWidenSC");
9180         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9181                "Expected no remaining users");
9182         CompareRecipe->eraseFromParent();
9183       }
9184       Chain = R;
9185     }
9186   }
9187
9188   // If the tail is folded by masking, introduce selects between the phi
9189   // and the live-out instruction of each reduction, at the beginning of the
9190   // dedicated latch block.
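  // Such a select keeps, for each lane, the previous (phi) value whenever the
  // header mask is false, so masked-off tail lanes do not corrupt the
  // reduction (illustrative summary).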
9191 if (CM.foldTailByMasking()) { 9192 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9193 for (VPRecipeBase &R : 9194 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9195 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9196 if (!PhiR || PhiR->isInLoop()) 9197 continue; 9198 VPValue *Cond = 9199 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9200 VPValue *Red = PhiR->getBackedgeValue(); 9201 assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB && 9202 "reduction recipe must be defined before latch"); 9203 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9204 } 9205 } 9206 } 9207 9208 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9209 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9210 VPSlotTracker &SlotTracker) const { 9211 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9212 IG->getInsertPos()->printAsOperand(O, false); 9213 O << ", "; 9214 getAddr()->printAsOperand(O, SlotTracker); 9215 VPValue *Mask = getMask(); 9216 if (Mask) { 9217 O << ", "; 9218 Mask->printAsOperand(O, SlotTracker); 9219 } 9220 9221 unsigned OpIdx = 0; 9222 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9223 if (!IG->getMember(i)) 9224 continue; 9225 if (getNumStoreOperands() > 0) { 9226 O << "\n" << Indent << " store "; 9227 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9228 O << " to index " << i; 9229 } else { 9230 O << "\n" << Indent << " "; 9231 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9232 O << " = load from index " << i; 9233 } 9234 ++OpIdx; 9235 } 9236 } 9237 #endif 9238 9239 void VPWidenCallRecipe::execute(VPTransformState &State) { 9240 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9241 *this, State); 9242 } 9243 9244 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9245 assert(!State.Instance && "Int or FP induction being replicated."); 9246 9247 Value *Start = getStartValue()->getLiveInIRValue(); 9248 const InductionDescriptor &ID = getInductionDescriptor(); 9249 TruncInst *Trunc = getTruncInst(); 9250 IRBuilderBase &Builder = State.Builder; 9251 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 9252 assert(State.VF.isVector() && "must have vector VF"); 9253 9254 // The value from the original loop to which we are mapping the new induction 9255 // variable. 9256 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 9257 9258 // Fast-math-flags propagate from the original induction instruction. 9259 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9260 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 9261 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 9262 9263 // Now do the actual transformations, and start with fetching the step value. 
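  // The code emitted below has roughly this shape (names as used below;
  // illustrative): a vector phi %vec.ind seeded with the "stepped" start
  // value, advanced once per unroll part by adding a splat of VF * Step
  // (%step.add), with the last increment feeding back as %vec.ind.next.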
9264 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9265 9266 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 9267 "Expected either an induction phi-node or a truncate of it!"); 9268 9269 // Construct the initial value of the vector IV in the vector loop preheader 9270 auto CurrIP = Builder.saveIP(); 9271 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9272 Builder.SetInsertPoint(VectorPH->getTerminator()); 9273 if (isa<TruncInst>(EntryVal)) { 9274 assert(Start->getType()->isIntegerTy() && 9275 "Truncation requires an integer type"); 9276 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 9277 Step = Builder.CreateTrunc(Step, TruncType); 9278 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 9279 } 9280 9281 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 9282 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 9283 Value *SteppedStart = getStepVector( 9284 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); 9285 9286 // We create vector phi nodes for both integer and floating-point induction 9287 // variables. Here, we determine the kind of arithmetic we will perform. 9288 Instruction::BinaryOps AddOp; 9289 Instruction::BinaryOps MulOp; 9290 if (Step->getType()->isIntegerTy()) { 9291 AddOp = Instruction::Add; 9292 MulOp = Instruction::Mul; 9293 } else { 9294 AddOp = ID.getInductionOpcode(); 9295 MulOp = Instruction::FMul; 9296 } 9297 9298 // Multiply the vectorization factor by the step using integer or 9299 // floating-point arithmetic as appropriate. 9300 Type *StepType = Step->getType(); 9301 Value *RuntimeVF; 9302 if (Step->getType()->isFloatingPointTy()) 9303 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 9304 else 9305 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 9306 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 9307 9308 // Create a vector splat to use in the induction update. 9309 // 9310 // FIXME: If the step is non-constant, we create the vector splat with 9311 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 9312 // handle a constant vector splat. 9313 Value *SplatVF = isa<Constant>(Mul) 9314 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 9315 : Builder.CreateVectorSplat(State.VF, Mul); 9316 Builder.restoreIP(CurrIP); 9317 9318 // We may need to add the step a number of times, depending on the unroll 9319 // factor. The last of those goes into the PHI. 9320 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 9321 &*State.CFG.PrevBB->getFirstInsertionPt()); 9322 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 9323 Instruction *LastInduction = VecInd; 9324 for (unsigned Part = 0; Part < State.UF; ++Part) { 9325 State.set(this, LastInduction, Part); 9326 9327 if (isa<TruncInst>(EntryVal)) 9328 State.addMetadata(LastInduction, EntryVal); 9329 9330 LastInduction = cast<Instruction>( 9331 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 9332 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 9333 } 9334 9335 LastInduction->setName("vec.ind.next"); 9336 VecInd->addIncoming(SteppedStart, VectorPH); 9337 // Add induction update using an incorrect block temporarily. The phi node 9338 // will be fixed after VPlan execution. Note that at this point the latch 9339 // block cannot be used, as it does not exist yet. 9340 // TODO: Model increment value in VPlan, by turning the recipe into a 9341 // multi-def and a subclass of VPHeaderPHIRecipe. 
9342 VecInd->addIncoming(LastInduction, VectorPH); 9343 } 9344 9345 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 9346 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 9347 "Not a pointer induction according to InductionDescriptor!"); 9348 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 9349 "Unexpected type."); 9350 9351 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 9352 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 9353 9354 if (onlyScalarsGenerated(State.VF)) { 9355 // This is the normalized GEP that starts counting at zero. 9356 Value *PtrInd = State.Builder.CreateSExtOrTrunc( 9357 CanonicalIV, IndDesc.getStep()->getType()); 9358 // Determine the number of scalars we need to generate for each unroll 9359 // iteration. If the instruction is uniform, we only need to generate the 9360 // first lane. Otherwise, we generate all VF values. 9361 bool IsUniform = vputils::onlyFirstLaneUsed(this); 9362 assert((IsUniform || !State.VF.isScalable()) && 9363 "Cannot scalarize a scalable VF"); 9364 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 9365 9366 for (unsigned Part = 0; Part < State.UF; ++Part) { 9367 Value *PartStart = 9368 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); 9369 9370 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 9371 Value *Idx = State.Builder.CreateAdd( 9372 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 9373 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); 9374 9375 Value *Step = CreateStepValue(IndDesc.getStep(), SE, 9376 State.CFG.PrevBB->getTerminator()); 9377 Value *SclrGep = emitTransformedIndex( 9378 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); 9379 SclrGep->setName("next.gep"); 9380 State.set(this, SclrGep, VPIteration(Part, Lane)); 9381 } 9382 } 9383 return; 9384 } 9385 9386 assert(isa<SCEVConstant>(IndDesc.getStep()) && 9387 "Induction step not a SCEV constant!"); 9388 Type *PhiType = IndDesc.getStep()->getType(); 9389 9390 // Build a pointer phi 9391 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 9392 Type *ScStValueType = ScalarStartValue->getType(); 9393 PHINode *NewPointerPhi = 9394 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 9395 9396 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9397 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 9398 9399 // A pointer induction, performed by using a gep 9400 const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout(); 9401 Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); 9402 9403 const SCEV *ScalarStep = IndDesc.getStep(); 9404 SCEVExpander Exp(SE, DL, "induction"); 9405 Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 9406 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 9407 Value *NumUnrolledElems = 9408 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 9409 Value *InductionGEP = GetElementPtrInst::Create( 9410 IndDesc.getElementType(), NewPointerPhi, 9411 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 9412 InductionLoc); 9413 // Add induction update using an incorrect block temporarily. The phi node 9414 // will be fixed after VPlan execution. Note that at this point the latch 9415 // block cannot be used, as it does not exist yet. 9416 // TODO: Model increment value in VPlan, by turning the recipe into a 9417 // multi-def and a subclass of VPHeaderPHIRecipe. 
9418 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 9419 9420 // Create UF many actual address geps that use the pointer 9421 // phi as base and a vectorized version of the step value 9422 // (<step*0, ..., step*N>) as offset. 9423 for (unsigned Part = 0; Part < State.UF; ++Part) { 9424 Type *VecPhiType = VectorType::get(PhiType, State.VF); 9425 Value *StartOffsetScalar = 9426 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 9427 Value *StartOffset = 9428 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 9429 // Create a vector of consecutive numbers from zero to VF. 9430 StartOffset = State.Builder.CreateAdd( 9431 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 9432 9433 Value *GEP = State.Builder.CreateGEP( 9434 IndDesc.getElementType(), NewPointerPhi, 9435 State.Builder.CreateMul( 9436 StartOffset, 9437 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), 9438 "vector.gep")); 9439 State.set(this, GEP, Part); 9440 } 9441 } 9442 9443 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 9444 assert(!State.Instance && "VPScalarIVStepsRecipe being replicated."); 9445 9446 // Fast-math-flags propagate from the original induction instruction. 9447 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9448 if (IndDesc.getInductionBinOp() && 9449 isa<FPMathOperator>(IndDesc.getInductionBinOp())) 9450 State.Builder.setFastMathFlags( 9451 IndDesc.getInductionBinOp()->getFastMathFlags()); 9452 9453 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9454 auto CreateScalarIV = [&](Value *&Step) -> Value * { 9455 Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0)); 9456 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); 9457 if (!isCanonical() || CanonicalIV->getType() != Ty) { 9458 ScalarIV = 9459 Ty->isIntegerTy() 9460 ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty) 9461 : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty); 9462 ScalarIV = emitTransformedIndex(State.Builder, ScalarIV, 9463 getStartValue()->getLiveInIRValue(), Step, 9464 IndDesc); 9465 ScalarIV->setName("offset.idx"); 9466 } 9467 if (TruncToTy) { 9468 assert(Step->getType()->isIntegerTy() && 9469 "Truncation requires an integer step"); 9470 ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy); 9471 Step = State.Builder.CreateTrunc(Step, TruncToTy); 9472 } 9473 return ScalarIV; 9474 }; 9475 9476 Value *ScalarIV = CreateScalarIV(Step); 9477 if (State.VF.isVector()) { 9478 buildScalarSteps(ScalarIV, Step, IndDesc, this, State); 9479 return; 9480 } 9481 9482 for (unsigned Part = 0; Part < State.UF; ++Part) { 9483 assert(!State.VF.isScalable() && "scalable vectors not yet supported."); 9484 Value *EntryPart; 9485 if (Step->getType()->isFloatingPointTy()) { 9486 Value *StartIdx = 9487 getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part); 9488 // Floating-point operations inherit FMF via the builder's flags. 
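    // i.e. for unroll part P the scalar value is ScalarIV <op> ((P * VF) * Step),
    // where <op> is the floating-point induction opcode (fadd/fsub) here and a
    // plain add in the integer branch below.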
9489 Value *MulOp = State.Builder.CreateFMul(StartIdx, Step); 9490 EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(), 9491 ScalarIV, MulOp); 9492 } else { 9493 Value *StartIdx = 9494 getRuntimeVF(State.Builder, Step->getType(), State.VF * Part); 9495 EntryPart = State.Builder.CreateAdd( 9496 ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction"); 9497 } 9498 State.set(this, EntryPart, Part); 9499 } 9500 } 9501 9502 void VPInterleaveRecipe::execute(VPTransformState &State) { 9503 assert(!State.Instance && "Interleave group being replicated."); 9504 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9505 getStoredValues(), getMask()); 9506 } 9507 9508 void VPReductionRecipe::execute(VPTransformState &State) { 9509 assert(!State.Instance && "Reduction being replicated."); 9510 Value *PrevInChain = State.get(getChainOp(), 0); 9511 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9512 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9513 // Propagate the fast-math flags carried by the underlying instruction. 9514 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9515 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9516 for (unsigned Part = 0; Part < State.UF; ++Part) { 9517 Value *NewVecOp = State.get(getVecOp(), Part); 9518 if (VPValue *Cond = getCondOp()) { 9519 Value *NewCond = State.get(Cond, Part); 9520 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9521 Value *Iden = RdxDesc->getRecurrenceIdentity( 9522 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9523 Value *IdenVec = 9524 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9525 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9526 NewVecOp = Select; 9527 } 9528 Value *NewRed; 9529 Value *NextInChain; 9530 if (IsOrdered) { 9531 if (State.VF.isVector()) 9532 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9533 PrevInChain); 9534 else 9535 NewRed = State.Builder.CreateBinOp( 9536 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9537 NewVecOp); 9538 PrevInChain = NewRed; 9539 } else { 9540 PrevInChain = State.get(getChainOp(), Part); 9541 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9542 } 9543 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9544 NextInChain = 9545 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9546 NewRed, PrevInChain); 9547 } else if (IsOrdered) 9548 NextInChain = NewRed; 9549 else 9550 NextInChain = State.Builder.CreateBinOp( 9551 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9552 PrevInChain); 9553 State.set(this, NextInChain, Part); 9554 } 9555 } 9556 9557 void VPReplicateRecipe::execute(VPTransformState &State) { 9558 if (State.Instance) { // Generate a single instance. 9559 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9560 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance, 9561 IsPredicated, State); 9562 // Insert scalar instance packing it into a vector. 9563 if (AlsoPack && State.VF.isVector()) { 9564 // If we're constructing lane 0, initialize to start from poison. 
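      // Each lane (including lane 0) is then inserted into this vector value
      // by the packScalarIntoVectorValue call below, one instance at a time.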
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.set(this, Poison, State.Instance->Part);
      }
      State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
                                      VPIteration(Part, Lane), IsPredicated,
                                      State);
}

void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;

  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
  StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);

  assert((LI || SI) && "Invalid Load/Store instruction");
  assert((!SI || StoredValue) && "No stored value provided for widened store");
  assert((!LI || !StoredValue) && "Stored value provided for widened load");

  Type *ScalarDataTy = getLoadStoreType(&Ingredient);

  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
  const Align Alignment = getLoadStoreAlignment(&Ingredient);
  bool CreateGatherScatter = !Consecutive;

  auto &Builder = State.Builder;
  InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
  bool isMaskRequired = getMask();
  if (isMaskRequired)
    for (unsigned Part = 0; Part < State.UF; ++Part)
      BlockInMaskParts[Part] = State.get(getMask(), Part);

  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
    // Calculate the pointer for the specific unroll-part.
    GetElementPtrInst *PartPtr = nullptr;

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
      InBounds = gep->isInBounds();
    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width vectors, VScale is 1, so
      // RunTimeVF = VF.getKnownMinValue().
      Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
      // NumElt = -Part * RunTimeVF
      Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
      // LastLane = 1 - RunTimeVF
      Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
      PartPtr =
          cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
      PartPtr->setIsInBounds(InBounds);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
      PartPtr->setIsInBounds(InBounds);
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9637 BlockInMaskParts[Part] = 9638 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9639 } else { 9640 Value *Increment = 9641 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 9642 PartPtr = cast<GetElementPtrInst>( 9643 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 9644 PartPtr->setIsInBounds(InBounds); 9645 } 9646 9647 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 9648 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 9649 }; 9650 9651 // Handle Stores: 9652 if (SI) { 9653 State.setDebugLocFromInst(SI); 9654 9655 for (unsigned Part = 0; Part < State.UF; ++Part) { 9656 Instruction *NewSI = nullptr; 9657 Value *StoredVal = State.get(StoredValue, Part); 9658 if (CreateGatherScatter) { 9659 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9660 Value *VectorGep = State.get(getAddr(), Part); 9661 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9662 MaskPart); 9663 } else { 9664 if (Reverse) { 9665 // If we store to reverse consecutive memory locations, then we need 9666 // to reverse the order of elements in the stored value. 9667 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9668 // We don't want to update the value in the map as it might be used in 9669 // another expression. So don't call resetVectorValue(StoredVal). 9670 } 9671 auto *VecPtr = 9672 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9673 if (isMaskRequired) 9674 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9675 BlockInMaskParts[Part]); 9676 else 9677 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9678 } 9679 State.addMetadata(NewSI, SI); 9680 } 9681 return; 9682 } 9683 9684 // Handle loads. 9685 assert(LI && "Must have a load instruction"); 9686 State.setDebugLocFromInst(LI); 9687 for (unsigned Part = 0; Part < State.UF; ++Part) { 9688 Value *NewLI; 9689 if (CreateGatherScatter) { 9690 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9691 Value *VectorGep = State.get(getAddr(), Part); 9692 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 9693 nullptr, "wide.masked.gather"); 9694 State.addMetadata(NewLI, LI); 9695 } else { 9696 auto *VecPtr = 9697 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9698 if (isMaskRequired) 9699 NewLI = Builder.CreateMaskedLoad( 9700 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 9701 PoisonValue::get(DataTy), "wide.masked.load"); 9702 else 9703 NewLI = 9704 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 9705 9706 // Add metadata to the load, but setVectorValue to the reverse shuffle. 9707 State.addMetadata(NewLI, LI); 9708 if (Reverse) 9709 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 9710 } 9711 9712 State.set(getVPSingleValue(), NewLI, Part); 9713 } 9714 } 9715 9716 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9717 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9718 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9719 // for predication. 
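// For example (illustrative): a function compiled with -Os returns
// CM_ScalarEpilogueNotAllowedOptSize, while PreferPredicateOverEpilogue set to
// PredicateOrDontVectorize returns CM_ScalarEpilogueNotAllowedUsePredicate.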
9720 static ScalarEpilogueLowering getScalarEpilogueLowering( 9721 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9722 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9723 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9724 LoopVectorizationLegality &LVL) { 9725 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9726 // don't look at hints or options, and don't request a scalar epilogue. 9727 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9728 // LoopAccessInfo (due to code dependency and not being able to reliably get 9729 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9730 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9731 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9732 // back to the old way and vectorize with versioning when forced. See D81345.) 9733 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9734 PGSOQueryType::IRPass) && 9735 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9736 return CM_ScalarEpilogueNotAllowedOptSize; 9737 9738 // 2) If set, obey the directives 9739 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9740 switch (PreferPredicateOverEpilogue) { 9741 case PreferPredicateTy::ScalarEpilogue: 9742 return CM_ScalarEpilogueAllowed; 9743 case PreferPredicateTy::PredicateElseScalarEpilogue: 9744 return CM_ScalarEpilogueNotNeededUsePredicate; 9745 case PreferPredicateTy::PredicateOrDontVectorize: 9746 return CM_ScalarEpilogueNotAllowedUsePredicate; 9747 }; 9748 } 9749 9750 // 3) If set, obey the hints 9751 switch (Hints.getPredicate()) { 9752 case LoopVectorizeHints::FK_Enabled: 9753 return CM_ScalarEpilogueNotNeededUsePredicate; 9754 case LoopVectorizeHints::FK_Disabled: 9755 return CM_ScalarEpilogueAllowed; 9756 }; 9757 9758 // 4) if the TTI hook indicates this is profitable, request predication. 9759 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9760 LVL.getLAI())) 9761 return CM_ScalarEpilogueNotNeededUsePredicate; 9762 9763 return CM_ScalarEpilogueAllowed; 9764 } 9765 9766 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9767 // If Values have been set for this Def return the one relevant for \p Part. 9768 if (hasVectorValue(Def, Part)) 9769 return Data.PerPartOutput[Def][Part]; 9770 9771 if (!hasScalarValue(Def, {Part, 0})) { 9772 Value *IRV = Def->getLiveInIRValue(); 9773 Value *B = ILV->getBroadcastInstrs(IRV); 9774 set(Def, B, Part); 9775 return B; 9776 } 9777 9778 Value *ScalarValue = get(Def, {Part, 0}); 9779 // If we aren't vectorizing, we can just copy the scalar map values over 9780 // to the vector map. 9781 if (VF.isScalar()) { 9782 set(Def, ScalarValue, Part); 9783 return ScalarValue; 9784 } 9785 9786 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 9787 bool IsUniform = RepR && RepR->isUniform(); 9788 9789 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9790 // Check if there is a scalar value for the selected lane. 9791 if (!hasScalarValue(Def, {Part, LastLane})) { 9792 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 
9793 assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) || 9794 isa<VPScalarIVStepsRecipe>(Def->getDef())) && 9795 "unexpected recipe found to be invariant"); 9796 IsUniform = true; 9797 LastLane = 0; 9798 } 9799 9800 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 9801 // Set the insert point after the last scalarized instruction or after the 9802 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 9803 // will directly follow the scalar definitions. 9804 auto OldIP = Builder.saveIP(); 9805 auto NewIP = 9806 isa<PHINode>(LastInst) 9807 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 9808 : std::next(BasicBlock::iterator(LastInst)); 9809 Builder.SetInsertPoint(&*NewIP); 9810 9811 // However, if we are vectorizing, we need to construct the vector values. 9812 // If the value is known to be uniform after vectorization, we can just 9813 // broadcast the scalar value corresponding to lane zero for each unroll 9814 // iteration. Otherwise, we construct the vector values using 9815 // insertelement instructions. Since the resulting vectors are stored in 9816 // State, we will only generate the insertelements once. 9817 Value *VectorValue = nullptr; 9818 if (IsUniform) { 9819 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 9820 set(Def, VectorValue, Part); 9821 } else { 9822 // Initialize packing with insertelements to start from undef. 9823 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 9824 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 9825 set(Def, Undef, Part); 9826 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 9827 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 9828 VectorValue = get(Def, Part); 9829 } 9830 Builder.restoreIP(OldIP); 9831 return VectorValue; 9832 } 9833 9834 // Process the loop in the VPlan-native vectorization path. This path builds 9835 // VPlan upfront in the vectorization pipeline, which allows to apply 9836 // VPlan-to-VPlan transformations from the very beginning without modifying the 9837 // input LLVM IR. 9838 static bool processLoopInVPlanNativePath( 9839 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9840 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9841 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9842 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9843 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 9844 LoopVectorizationRequirements &Requirements) { 9845 9846 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9847 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9848 return false; 9849 } 9850 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9851 Function *F = L->getHeader()->getParent(); 9852 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9853 9854 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9855 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 9856 9857 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9858 &Hints, IAI); 9859 // Use the planner for outer loop vectorization. 9860 // TODO: CM is not used at this point inside the planner. Turn CM into an 9861 // optional argument if we don't need it in the future. 9862 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE); 9863 9864 // Get user vectorization factor. 
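  // (Illustrative example: a loop annotated with
  //  "#pragma clang loop vectorize_width(4)" reports a user VF of 4 here.)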
  ElementCount UserVF = Hints.getWidth();

  CM.collectElementTypesForWidening();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
    return false;

  VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);

  {
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
                             F->getParent()->getDataLayout());
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
                           VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
  }

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with double-precision
// floating point, there will be a performance penalty from the conversion
// overhead and the change in the vector width.
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
  SmallVector<Instruction *, 4> Worklist;
  for (BasicBlock *BB : L->getBlocks()) {
    for (Instruction &Inst : *BB) {
      if (auto *S = dyn_cast<StoreInst>(&Inst)) {
        if (S->getValueOperand()->getType()->isFloatTy())
          Worklist.push_back(S);
      }
    }
  }

  // Traverse the floating point stores upwards, searching for floating point
  // conversions.
  SmallPtrSet<const Instruction *, 4> Visited;
  SmallPtrSet<const Instruction *, 4> EmittedRemark;
  while (!Worklist.empty()) {
    auto *I = Worklist.pop_back_val();
    if (!L->contains(I))
      continue;
    if (!Visited.insert(I).second)
      continue;

    // Emit a remark if the floating point store required a floating
    // point conversion.
    // TODO: More work could be done to identify the root cause such as a
    // constant or a function return type and point the user to it.
    if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
                                          I->getDebugLoc(), L->getHeader())
               << "floating point conversion changes vector width. "
               << "Mixed floating point precision requires an up/down "
               << "cast that will negatively impact performance.";
      });

    for (Use &Op : I->operands())
      if (auto *OpI = dyn_cast<Instruction>(Op))
        Worklist.push_back(OpI);
  }
}

static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
                                       VectorizationFactor &VF,
                                       Optional<unsigned> VScale, Loop *L,
                                       ScalarEvolution &SE) {
  InstructionCost CheckCost = Checks.getCost();
  if (!CheckCost.isValid())
    return false;

  // When only interleaving (not vectorizing), the scalar and vector costs will
  // be equal, which in turn would lead to a divide by 0. Fall back to the hard
  // threshold.
9951 if (VF.Width.isScalar()) { 9952 if (CheckCost > VectorizeMemoryCheckThreshold) { 9953 LLVM_DEBUG( 9954 dbgs() 9955 << "LV: Interleaving only is not profitable due to runtime checks\n"); 9956 return false; 9957 } 9958 return true; 9959 } 9960 9961 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated. 9962 double ScalarC = *VF.ScalarCost.getValue(); 9963 if (ScalarC == 0) 9964 return true; 9965 9966 // First, compute the minimum iteration count required so that the vector 9967 // loop outperforms the scalar loop. 9968 // The total cost of the scalar loop is 9969 // ScalarC * TC 9970 // where 9971 // * TC is the actual trip count of the loop. 9972 // * ScalarC is the cost of a single scalar iteration. 9973 // 9974 // The total cost of the vector loop is 9975 // RtC + VecC * (TC / VF) + EpiC 9976 // where 9977 // * RtC is the cost of the generated runtime checks 9978 // * VecC is the cost of a single vector iteration. 9979 // * TC is the actual trip count of the loop 9980 // * VF is the vectorization factor 9981 // * EpiCost is the cost of the generated epilogue, including the cost 9982 // of the remaining scalar operations. 9983 // 9984 // Vectorization is profitable once the total vector cost is less than the 9985 // total scalar cost: 9986 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC 9987 // 9988 // Now we can compute the minimum required trip count TC as 9989 // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC 9990 // 9991 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that 9992 // the computations are performed on doubles, not integers and the result 9993 // is rounded up, hence we get an upper estimate of the TC. 9994 unsigned IntVF = VF.Width.getKnownMinValue(); 9995 if (VF.Width.isScalable()) { 9996 unsigned AssumedMinimumVscale = 1; 9997 if (VScale) 9998 AssumedMinimumVscale = *VScale; 9999 IntVF *= AssumedMinimumVscale; 10000 } 10001 double VecCOverVF = double(*VF.Cost.getValue()) / IntVF; 10002 double RtC = *CheckCost.getValue(); 10003 double MinTC1 = RtC / (ScalarC - VecCOverVF); 10004 10005 // Second, compute a minimum iteration count so that the cost of the 10006 // runtime checks is only a fraction of the total scalar loop cost. This 10007 // adds a loop-dependent bound on the overhead incurred if the runtime 10008 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC 10009 // * TC. To bound the runtime check to be a fraction 1/X of the scalar 10010 // cost, compute 10011 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC 10012 double MinTC2 = RtC * 10 / ScalarC; 10013 10014 // Now pick the larger minimum. If it is not a multiple of VF, choose the 10015 // next closest multiple of VF. This should partly compensate for ignoring 10016 // the epilogue cost. 10017 uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2)); 10018 VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF)); 10019 10020 LLVM_DEBUG( 10021 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:" 10022 << VF.MinProfitableTripCount << "\n"); 10023 10024 // Skip vectorization if the expected trip count is less than the minimum 10025 // required trip count. 
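  // Illustrative example with made-up costs: ScalarC = 4, VecC = 6, VF = 4
  // (VecC / VF = 1.5) and RtC = 20 give MinTC1 = 20 / (4 - 1.5) = 8 and
  // MinTC2 = 20 * 10 / 4 = 50, so after rounding up to a multiple of VF the
  // loop is only vectorized if it is expected to run at least 52 iterations.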
10026 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) { 10027 if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC), 10028 VF.MinProfitableTripCount)) { 10029 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " 10030 "trip count < minimum profitable VF (" 10031 << *ExpectedTC << " < " << VF.MinProfitableTripCount 10032 << ")\n"); 10033 10034 return false; 10035 } 10036 } 10037 return true; 10038 } 10039 10040 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10041 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10042 !EnableLoopInterleaving), 10043 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10044 !EnableLoopVectorization) {} 10045 10046 bool LoopVectorizePass::processLoop(Loop *L) { 10047 assert((EnableVPlanNativePath || L->isInnermost()) && 10048 "VPlan-native path is not enabled. Only process inner loops."); 10049 10050 #ifndef NDEBUG 10051 const std::string DebugLocStr = getDebugLocString(L); 10052 #endif /* NDEBUG */ 10053 10054 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 10055 << L->getHeader()->getParent()->getName() << "' from " 10056 << DebugLocStr << "\n"); 10057 10058 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10059 10060 LLVM_DEBUG( 10061 dbgs() << "LV: Loop hints:" 10062 << " force=" 10063 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10064 ? "disabled" 10065 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10066 ? "enabled" 10067 : "?")) 10068 << " width=" << Hints.getWidth() 10069 << " interleave=" << Hints.getInterleave() << "\n"); 10070 10071 // Function containing loop 10072 Function *F = L->getHeader()->getParent(); 10073 10074 // Looking at the diagnostic output is the only way to determine if a loop 10075 // was vectorized (other than looking at the IR or machine code), so it 10076 // is important to generate an optimization remark for each loop. Most of 10077 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10078 // generated as OptimizationRemark and OptimizationRemarkMissed are 10079 // less verbose reporting vectorized loops and unvectorized loops that may 10080 // benefit from vectorization, respectively. 10081 10082 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10083 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10084 return false; 10085 } 10086 10087 PredicatedScalarEvolution PSE(*SE, *L); 10088 10089 // Check if it is legal to vectorize the loop. 10090 LoopVectorizationRequirements Requirements; 10091 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10092 &Requirements, &Hints, DB, AC, BFI, PSI); 10093 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10094 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10095 Hints.emitRemarkWithHints(); 10096 return false; 10097 } 10098 10099 // Check the function attributes and profiles to find out if this function 10100 // should be optimized for size. 10101 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10102 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10103 10104 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10105 // here. They may require CFG and instruction level transformations before 10106 // even evaluating whether vectorization is profitable. Since we cannot modify 10107 // the incoming IR, we need to build VPlan upfront in the vectorization 10108 // pipeline. 
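  // (Illustrative example: with -mllvm -enable-vplan-native-path, an outer
  //  loop explicitly annotated with "#pragma clang loop vectorize(enable)"
  //  is handled by processLoopInVPlanNativePath below.)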
10109 if (!L->isInnermost()) 10110 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10111 ORE, BFI, PSI, Hints, Requirements); 10112 10113 assert(L->isInnermost() && "Inner loop expected."); 10114 10115 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10116 // count by optimizing for size, to minimize overheads. 10117 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10118 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10119 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10120 << "This loop is worth vectorizing only if no scalar " 10121 << "iteration overheads are incurred."); 10122 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10123 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10124 else { 10125 LLVM_DEBUG(dbgs() << "\n"); 10126 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10127 } 10128 } 10129 10130 // Check the function attributes to see if implicit floats are allowed. 10131 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10132 // an integer loop and the vector instructions selected are purely integer 10133 // vector instructions? 10134 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10135 reportVectorizationFailure( 10136 "Can't vectorize when the NoImplicitFloat attribute is used", 10137 "loop not vectorized due to NoImplicitFloat attribute", 10138 "NoImplicitFloat", ORE, L); 10139 Hints.emitRemarkWithHints(); 10140 return false; 10141 } 10142 10143 // Check if the target supports potentially unsafe FP vectorization. 10144 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10145 // for the target we're vectorizing for, to make sure none of the 10146 // additional fp-math flags can help. 10147 if (Hints.isPotentiallyUnsafe() && 10148 TTI->isFPVectorizationPotentiallyUnsafe()) { 10149 reportVectorizationFailure( 10150 "Potentially unsafe FP op prevents vectorization", 10151 "loop not vectorized due to unsafe FP support.", 10152 "UnsafeFP", ORE, L); 10153 Hints.emitRemarkWithHints(); 10154 return false; 10155 } 10156 10157 bool AllowOrderedReductions; 10158 // If the flag is set, use that instead and override the TTI behaviour. 10159 if (ForceOrderedReductions.getNumOccurrences() > 0) 10160 AllowOrderedReductions = ForceOrderedReductions; 10161 else 10162 AllowOrderedReductions = TTI->enableOrderedReductions(); 10163 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10164 ORE->emit([&]() { 10165 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10166 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10167 ExactFPMathInst->getDebugLoc(), 10168 ExactFPMathInst->getParent()) 10169 << "loop not vectorized: cannot prove it is safe to reorder " 10170 "floating-point operations"; 10171 }); 10172 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10173 "reorder floating-point operations\n"); 10174 Hints.emitRemarkWithHints(); 10175 return false; 10176 } 10177 10178 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10179 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10180 10181 // If an override option has been passed in for interleaved accesses, use it. 10182 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10183 UseInterleaved = EnableInterleavedMemAccesses; 10184 10185 // Analyze interleaved memory accesses. 
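  // (Illustrative example: loads of A[2*i] and A[2*i+1] in the same iteration
  //  form an interleave group with factor 2 that can later be lowered to one
  //  wide load plus shuffles.)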
10186 if (UseInterleaved) { 10187 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10188 } 10189 10190 // Use the cost model. 10191 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10192 F, &Hints, IAI); 10193 CM.collectValuesToIgnore(); 10194 CM.collectElementTypesForWidening(); 10195 10196 // Use the planner for vectorization. 10197 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE); 10198 10199 // Get user vectorization factor and interleave count. 10200 ElementCount UserVF = Hints.getWidth(); 10201 unsigned UserIC = Hints.getInterleave(); 10202 10203 // Plan how to best vectorize, return the best VF and its cost. 10204 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10205 10206 VectorizationFactor VF = VectorizationFactor::Disabled(); 10207 unsigned IC = 1; 10208 10209 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, 10210 F->getParent()->getDataLayout()); 10211 if (MaybeVF) { 10212 VF = *MaybeVF; 10213 // Select the interleave count. 10214 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); 10215 10216 unsigned SelectedIC = std::max(IC, UserIC); 10217 // Optimistically generate runtime checks if they are needed. Drop them if 10218 // they turn out to not be profitable. 10219 if (VF.Width.isVector() || SelectedIC > 1) 10220 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); 10221 10222 // Check if it is profitable to vectorize with runtime checks. 10223 bool ForceVectorization = 10224 Hints.getForce() == LoopVectorizeHints::FK_Enabled; 10225 if (!ForceVectorization && 10226 !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L, 10227 *PSE.getSE())) { 10228 ORE->emit([&]() { 10229 return OptimizationRemarkAnalysisAliasing( 10230 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), 10231 L->getHeader()) 10232 << "loop not vectorized: cannot prove it is safe to reorder " 10233 "memory operations"; 10234 }); 10235 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 10236 Hints.emitRemarkWithHints(); 10237 return false; 10238 } 10239 } 10240 10241 // Identify the diagnostic messages that should be produced. 10242 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10243 bool VectorizeLoop = true, InterleaveLoop = true; 10244 if (VF.Width.isScalar()) { 10245 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10246 VecDiagMsg = std::make_pair( 10247 "VectorizationNotBeneficial", 10248 "the cost-model indicates that vectorization is not beneficial"); 10249 VectorizeLoop = false; 10250 } 10251 10252 if (!MaybeVF && UserIC > 1) { 10253 // Tell the user interleaving was avoided up-front, despite being explicitly 10254 // requested. 10255 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 10256 "interleaving should be avoided up front\n"); 10257 IntDiagMsg = std::make_pair( 10258 "InterleavingAvoided", 10259 "Ignoring UserIC, because interleaving was avoided up front"); 10260 InterleaveLoop = false; 10261 } else if (IC == 1 && UserIC <= 1) { 10262 // Tell the user interleaving is not beneficial. 
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not profitable to vectorize the loop, then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);

      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
      LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *profitable* to vectorize the loop, then do
      // it.

      // Consider vectorizing the epilogue too if it's profitable.
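      // (Illustrative example: a loop vectorized with VF = 16 may leave a
      //  scalar remainder that is itself worth vectorizing with VF = 8; the
      //  narrower epilogue vector loop then handles most of that tail.)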
10347 VectorizationFactor EpilogueVF = 10348 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10349 if (EpilogueVF.Width.isVector()) { 10350 10351 // The first pass vectorizes the main loop and creates a scalar epilogue 10352 // to be vectorized by executing the plan (potentially with a different 10353 // factor) again shortly afterwards. 10354 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10355 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10356 EPI, &LVL, &CM, BFI, PSI, Checks); 10357 10358 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10359 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10360 DT, true); 10361 ++LoopsVectorized; 10362 10363 // Second pass vectorizes the epilogue and adjusts the control flow 10364 // edges from the first pass. 10365 EPI.MainLoopVF = EPI.EpilogueVF; 10366 EPI.MainLoopUF = EPI.EpilogueUF; 10367 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10368 ORE, EPI, &LVL, &CM, BFI, PSI, 10369 Checks); 10370 10371 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10372 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); 10373 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); 10374 Header->setName("vec.epilog.vector.body"); 10375 10376 // Ensure that the start values for any VPReductionPHIRecipes are 10377 // updated before vectorising the epilogue loop. 10378 for (VPRecipeBase &R : Header->phis()) { 10379 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10380 if (auto *Resume = MainILV.getReductionResumeValue( 10381 ReductionPhi->getRecurrenceDescriptor())) { 10382 VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume); 10383 ReductionPhi->setOperand(0, StartVal); 10384 } 10385 } 10386 } 10387 10388 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10389 DT, true); 10390 ++LoopsEpilogueVectorized; 10391 10392 if (!MainILV.areSafetyChecksAdded()) 10393 DisableRuntimeUnroll = true; 10394 } else { 10395 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 10396 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, 10397 PSI, Checks); 10398 10399 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10400 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); 10401 ++LoopsVectorized; 10402 10403 // Add metadata to disable runtime unrolling a scalar loop when there 10404 // are no runtime checks about strides and memory. A scalar loop that is 10405 // rarely used is not worth unrolling. 10406 if (!LB.areSafetyChecksAdded()) 10407 DisableRuntimeUnroll = true; 10408 } 10409 // Report the vectorization decision. 10410 ORE->emit([&]() { 10411 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10412 L->getHeader()) 10413 << "vectorized loop (vectorization width: " 10414 << NV("VectorizationFactor", VF.Width) 10415 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10416 }); 10417 } 10418 10419 if (ORE->allowExtraAnalysis(LV_NAME)) 10420 checkMixedPrecision(L, ORE); 10421 } 10422 10423 Optional<MDNode *> RemainderLoopID = 10424 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10425 LLVMLoopVectorizeFollowupEpilogue}); 10426 if (RemainderLoopID) { 10427 L->setLoopID(RemainderLoopID.value()); 10428 } else { 10429 if (DisableRuntimeUnroll) 10430 AddRuntimeUnrollDisableMetaData(L); 10431 10432 // Mark the loop as already vectorized to avoid vectorizing again. 
10433 Hints.setAlreadyVectorized(); 10434 } 10435 10436 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10437 return true; 10438 } 10439 10440 LoopVectorizeResult LoopVectorizePass::runImpl( 10441 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10442 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10443 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10444 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10445 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10446 SE = &SE_; 10447 LI = &LI_; 10448 TTI = &TTI_; 10449 DT = &DT_; 10450 BFI = &BFI_; 10451 TLI = TLI_; 10452 AA = &AA_; 10453 AC = &AC_; 10454 GetLAA = &GetLAA_; 10455 DB = &DB_; 10456 ORE = &ORE_; 10457 PSI = PSI_; 10458 10459 // Don't attempt if 10460 // 1. the target claims to have no vector registers, and 10461 // 2. interleaving won't help ILP. 10462 // 10463 // The second condition is necessary because, even if the target has no 10464 // vector registers, loop vectorization may still enable scalar 10465 // interleaving. 10466 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10467 TTI->getMaxInterleaveFactor(1) < 2) 10468 return LoopVectorizeResult(false, false); 10469 10470 bool Changed = false, CFGChanged = false; 10471 10472 // The vectorizer requires loops to be in simplified form. 10473 // Since simplification may add new inner loops, it has to run before the 10474 // legality and profitability checks. This means running the loop vectorizer 10475 // will simplify all loops, regardless of whether anything end up being 10476 // vectorized. 10477 for (auto &L : *LI) 10478 Changed |= CFGChanged |= 10479 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10480 10481 // Build up a worklist of inner-loops to vectorize. This is necessary as 10482 // the act of vectorizing or partially unrolling a loop creates new loops 10483 // and can invalidate iterators across the loops. 10484 SmallVector<Loop *, 8> Worklist; 10485 10486 for (Loop *L : *LI) 10487 collectSupportedLoops(*L, LI, ORE, Worklist); 10488 10489 LoopsAnalyzed += Worklist.size(); 10490 10491 // Now walk the identified inner loops. 10492 while (!Worklist.empty()) { 10493 Loop *L = Worklist.pop_back_val(); 10494 10495 // For the inner loops we actually process, form LCSSA to simplify the 10496 // transform. 10497 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10498 10499 Changed |= CFGChanged |= processLoop(L); 10500 } 10501 10502 // Process each loop nest in the function. 10503 return LoopVectorizeResult(Changed, CFGChanged); 10504 } 10505 10506 PreservedAnalyses LoopVectorizePass::run(Function &F, 10507 FunctionAnalysisManager &AM) { 10508 auto &LI = AM.getResult<LoopAnalysis>(F); 10509 // There are no loops in the function. Return before computing other expensive 10510 // analyses. 
  if (LI.empty())
    return PreservedAnalyses::all();
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
                                      TLI, TTI, nullptr, nullptr, nullptr};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve the LoopInfo/DominatorTree analyses with
  // outer loop vectorization. Until this is addressed, mark these analyses as
  // preserved only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
}

void LoopVectorizePass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
      OS, MapClassName2PassName);

  OS << "<";
  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
  OS << ">";
}
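
// Note (illustrative, derived from printPipeline above): with the default
// options the pass is expected to print as
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>
// in -print-pipeline-passes output.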