//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
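//
// As an illustrative sketch (not taken from the references above), widening a
// simple loop by a vectorization factor of 4 conceptually turns
//
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + c[i];
//
// into a loop that loads four elements of b and c, adds them, and stores four
// results into a per iteration, with the remaining n % 4 iterations handled by
// a scalar epilogue loop (or by a predicated, tail-folded vector body).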
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
    "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks"));
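
// A note on tail folding (illustrative, not specific to any one option below):
// when the vector trip count does not evenly divide the scalar trip count, the
// leftover "tail" iterations can either run in a scalar epilogue loop or be
// folded into the vector body by masking the excess lanes. For example, a loop
// with trip count 10 vectorized at VF 4 can run 3 predicated vector iterations
// (the last with only 2 active lanes) instead of 2 vector iterations plus a
// 2-iteration scalar remainder.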
// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
// and that predication is preferred; it lists all related options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));
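
// Illustrative usage of the force-target-* testing flags in this group, e.g.
//   opt -passes=loop-vectorize -force-target-num-vector-regs=16 \
//       -force-target-instruction-cost=1 -S input.ll
// pins the corresponding cost-model inputs so tests behave consistently
// regardless of the host target. (The exact flag combination here is only an
// example.)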
static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars.
/// This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      ElementCount MinProfitableTripCount,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);

    if (MinProfitableTripCount.isZero())
      this->MinProfitableTripCount = VecWidth;
    else
      this->MinProfitableTripCount = MinProfitableTripCount;
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops.
  virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Fix the non-induction PHIs in \p Plan.
  void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  // Returns the resume value (bc.merge.rdx) for a reduction as
  // generated by fixReduction.
  PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *VectorTripCount, Value *EndValue,
                    BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
                    VPlan &Plan);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
                               VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(BasicBlock *InsertBlock);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g., epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID);

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  ElementCount MinProfitableTripCount;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1),
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to set up the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
                            CM, BFI, PSI, Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *>
  createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}
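
// Illustrative note: for a scalable VF such as <vscale x 4> with Step = 2 and
// Ty = i64, createStepForVF above returns a value equivalent to 8 * vscale
// (materialized via the llvm.vscale intrinsic); for a fixed VF it folds to a
// plain constant.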

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction &UnderlyingInstr = WidenRec->getIngredient();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        if (AddrDef && WidenRec->isConsecutive() &&
            Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}

PHINode *InnerLoopVectorizer::getReductionResumeValue(
    const RecurrenceDescriptor &RdxDesc) {
  auto It = ReductionResumeValues.find(&RdxDesc);
  assert(It != ReductionResumeValues.end() &&
         "Expected to find a resume value for the reduction.");
  return It->second;
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};
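
// Note (illustrative): with the comparator below, fixed-width VFs (e.g. 4, 8,
// 16) order before scalable VFs (e.g. vscale x 2, vscale x 4), and within each
// group the ordering is by the known minimum element count.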
/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
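
  // Illustrative example: with VF = 4 and an interleave count of 2, each
  // iteration of the vector loop processes 8 elements using two interleaved
  // copies of the vector body, which helps hide instruction latency.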
  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
1350 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1351 assert(VF.isVector() && "Expected VF >=2"); 1352 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1353 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1354 "The cost is not calculated"); 1355 return WideningDecisions[InstOnVF].second; 1356 } 1357 1358 /// Return True if instruction \p I is an optimizable truncate whose operand 1359 /// is an induction variable. Such a truncate will be removed by adding a new 1360 /// induction variable with the destination type. 1361 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1362 // If the instruction is not a truncate, return false. 1363 auto *Trunc = dyn_cast<TruncInst>(I); 1364 if (!Trunc) 1365 return false; 1366 1367 // Get the source and destination types of the truncate. 1368 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1369 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1370 1371 // If the truncate is free for the given types, return false. Replacing a 1372 // free truncate with an induction variable would add an induction variable 1373 // update instruction to each iteration of the loop. We exclude from this 1374 // check the primary induction variable since it will need an update 1375 // instruction regardless. 1376 Value *Op = Trunc->getOperand(0); 1377 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1378 return false; 1379 1380 // If the truncated value is not an induction variable, return false. 1381 return Legal->isInductionPhi(Op); 1382 } 1383 1384 /// Collects the instructions to scalarize for each predicated instruction in 1385 /// the loop. 1386 void collectInstsToScalarize(ElementCount VF); 1387 1388 /// Collect Uniform and Scalar values for the given \p VF. 1389 /// The sets depend on CM decision for Load/Store instructions 1390 /// that may be vectorized as interleave, gather-scatter or scalarized. 1391 void collectUniformsAndScalars(ElementCount VF) { 1392 // Do the analysis once. 1393 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1394 return; 1395 setCostBasedWideningDecision(VF); 1396 collectLoopUniforms(VF); 1397 collectLoopScalars(VF); 1398 } 1399 1400 /// Returns true if the target machine supports masked store operation 1401 /// for the given \p DataType and kind of access to \p Ptr. 1402 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1403 return Legal->isConsecutivePtr(DataType, Ptr) && 1404 TTI.isLegalMaskedStore(DataType, Alignment); 1405 } 1406 1407 /// Returns true if the target machine supports masked load operation 1408 /// for the given \p DataType and kind of access to \p Ptr. 1409 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1410 return Legal->isConsecutivePtr(DataType, Ptr) && 1411 TTI.isLegalMaskedLoad(DataType, Alignment); 1412 } 1413 1414 /// Returns true if the target machine can represent \p V as a masked gather 1415 /// or scatter operation. 
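/// For example, an indexed access such as A[B[i]] can typically only be vectorized as a gather (for a load) or a scatter (for a store).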
1416 bool isLegalGatherOrScatter(Value *V, 1417 ElementCount VF = ElementCount::getFixed(1)) { 1418 bool LI = isa<LoadInst>(V); 1419 bool SI = isa<StoreInst>(V); 1420 if (!LI && !SI) 1421 return false; 1422 auto *Ty = getLoadStoreType(V); 1423 Align Align = getLoadStoreAlignment(V); 1424 if (VF.isVector()) 1425 Ty = VectorType::get(Ty, VF); 1426 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1427 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1428 } 1429 1430 /// Returns true if the target machine supports all of the reduction 1431 /// variables found for the given VF. 1432 bool canVectorizeReductions(ElementCount VF) const { 1433 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1434 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1435 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1436 })); 1437 } 1438 1439 /// Returns true if \p I is an instruction that will be scalarized with 1440 /// predication when vectorizing \p I with vectorization factor \p VF. Such 1441 /// instructions include conditional stores and instructions that may divide 1442 /// by zero. 1443 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1444 1445 // Returns true if \p I is an instruction that will be predicated either 1446 // through scalar predication or masked load/store or masked gather/scatter. 1447 // \p VF is the vectorization factor that will be used to vectorize \p I. 1448 // Superset of instructions that return true for isScalarWithPredication. 1449 bool isPredicatedInst(Instruction *I, ElementCount VF, 1450 bool IsKnownUniform = false) { 1451 // When we know the load is uniform and the original scalar loop was not 1452 // predicated we don't need to mark it as a predicated instruction. Any 1453 // vectorised blocks created when tail-folding are something artificial we 1454 // have introduced and we know there is always at least one active lane. 1455 // That's why we call Legal->blockNeedsPredication here because it doesn't 1456 // query tail-folding. 1457 if (IsKnownUniform && isa<LoadInst>(I) && 1458 !Legal->blockNeedsPredication(I->getParent())) 1459 return false; 1460 if (!blockNeedsPredicationForAnyReason(I->getParent())) 1461 return false; 1462 // Loads and stores that need some form of masked operation are predicated 1463 // instructions. 1464 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1465 return Legal->isMaskRequired(I); 1466 return isScalarWithPredication(I, VF); 1467 } 1468 1469 /// Returns true if \p I is a memory instruction with consecutive memory 1470 /// access that can be widened. 1471 bool 1472 memoryInstructionCanBeWidened(Instruction *I, 1473 ElementCount VF = ElementCount::getFixed(1)); 1474 1475 /// Returns true if \p I is a memory instruction in an interleaved-group 1476 /// of memory accesses that can be vectorized with wide vector loads/stores 1477 /// and shuffles. 1478 bool 1479 interleavedAccessCanBeWidened(Instruction *I, 1480 ElementCount VF = ElementCount::getFixed(1)); 1481 1482 /// Check if \p Instr belongs to any interleaved access group. 1483 bool isAccessInterleaved(Instruction *Instr) { 1484 return InterleaveInfo.isInterleaved(Instr); 1485 } 1486 1487 /// Get the interleaved access group that \p Instr belongs to. 1488 const InterleaveGroup<Instruction> * 1489 getInterleavedAccessGroup(Instruction *Instr) { 1490 return InterleaveInfo.getInterleaveGroup(Instr); 1491 } 1492 1493 /// Returns true if we're required to use a scalar epilogue for at least 1494 /// the final iteration of the original loop. 
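/// This is the case, e.g., when the loop may exit from a block other than the latch, or when an interleave group with gaps would otherwise access memory past the end of the underlying object.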
1495 bool requiresScalarEpilogue(ElementCount VF) const { 1496 if (!isScalarEpilogueAllowed()) 1497 return false; 1498 // If we might exit from anywhere but the latch, must run the exiting 1499 // iteration in scalar form. 1500 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1501 return true; 1502 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); 1503 } 1504 1505 /// Returns true if a scalar epilogue is allowed, i.e. it has not been 1506 /// disallowed due to optsize or a loop hint annotation. 1507 bool isScalarEpilogueAllowed() const { 1508 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1509 } 1510 1511 /// Returns true if all loop blocks should be masked to fold the loop tail. 1512 bool foldTailByMasking() const { return FoldTailByMasking; } 1513 1514 /// Returns true if the instructions in this block require predication 1515 /// for any reason, e.g. because tail folding now requires a predicate 1516 /// or because the block in the original loop was predicated. 1517 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1518 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1519 } 1520 1521 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1522 /// nodes to the chain of instructions representing the reductions. Uses a 1523 /// MapVector to ensure deterministic iteration order. 1524 using ReductionChainMap = 1525 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1526 1527 /// Return the chain of instructions representing an inloop reduction. 1528 const ReductionChainMap &getInLoopReductionChains() const { 1529 return InLoopReductionChains; 1530 } 1531 1532 /// Returns true if the Phi is part of an inloop reduction. 1533 bool isInLoopReduction(PHINode *Phi) const { 1534 return InLoopReductionChains.count(Phi); 1535 } 1536 1537 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1538 /// with factor VF. Return the cost of the instruction, including 1539 /// scalarization overhead if it's needed. 1540 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1541 1542 /// Estimate cost of a call instruction CI if it were vectorized with factor 1543 /// VF. Return the cost of the instruction, including scalarization overhead 1544 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1545 /// scalarized - 1546 /// i.e. either a vector version isn't available, or it is too expensive. 1547 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1548 bool &NeedToScalarize) const; 1549 1550 /// Returns true if the per-lane cost of VectorizationFactor A is lower than 1551 /// that of B. 1552 bool isMoreProfitable(const VectorizationFactor &A, 1553 const VectorizationFactor &B) const; 1554 1555 /// Invalidates decisions already taken by the cost model. 1556 void invalidateCostModelingDecisions() { 1557 WideningDecisions.clear(); 1558 Uniforms.clear(); 1559 Scalars.clear(); 1560 } 1561 1562 /// Convenience function that returns the value of vscale_range iff 1563 /// vscale_range.min == vscale_range.max, and otherwise returns the value 1564 /// returned by the corresponding TLI method. 1565 Optional<unsigned> getVScaleForTuning() const; 1566 1567 private: 1568 unsigned NumPredStores = 0; 1569 1570 /// \return An upper bound for the vectorization factors for both 1571 /// fixed and scalable vectorization, where the minimum-known number of 1572 /// elements is a power-of-2 larger than zero.
If scalable vectorization is 1573 /// disabled or unsupported, then the scalable part will be equal to 1574 /// ElementCount::getScalable(0). 1575 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, 1576 ElementCount UserVF, 1577 bool FoldTailByMasking); 1578 1579 /// \return the maximized element count based on the targets vector 1580 /// registers and the loop trip-count, but limited to a maximum safe VF. 1581 /// This is a helper function of computeFeasibleMaxVF. 1582 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, 1583 unsigned SmallestType, 1584 unsigned WidestType, 1585 ElementCount MaxSafeVF, 1586 bool FoldTailByMasking); 1587 1588 /// \return the maximum legal scalable VF, based on the safe max number 1589 /// of elements. 1590 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1591 1592 /// The vectorization cost is a combination of the cost itself and a boolean 1593 /// indicating whether any of the contributing operations will actually 1594 /// operate on vector values after type legalization in the backend. If this 1595 /// latter value is false, then all operations will be scalarized (i.e. no 1596 /// vectorization has actually taken place). 1597 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1598 1599 /// Returns the expected execution cost. The unit of the cost does 1600 /// not matter because we use the 'cost' units to compare different 1601 /// vector widths. The cost that is returned is *not* normalized by 1602 /// the factor width. If \p Invalid is not nullptr, this function 1603 /// will add a pair(Instruction*, ElementCount) to \p Invalid for 1604 /// each instruction that has an Invalid cost for the given VF. 1605 using InstructionVFPair = std::pair<Instruction *, ElementCount>; 1606 VectorizationCostTy 1607 expectedCost(ElementCount VF, 1608 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); 1609 1610 /// Returns the execution time cost of an instruction for a given vector 1611 /// width. Vector width of one means scalar. 1612 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1613 1614 /// The cost-computation logic from getInstructionCost which provides 1615 /// the vector type as an output parameter. 1616 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1617 Type *&VectorTy); 1618 1619 /// Return the cost of instructions in an inloop reduction pattern, if I is 1620 /// part of that pattern. 1621 Optional<InstructionCost> 1622 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, 1623 TTI::TargetCostKind CostKind); 1624 1625 /// Calculate vectorization cost of memory instruction \p I. 1626 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1627 1628 /// The cost computation for scalarized memory instruction. 1629 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1630 1631 /// The cost computation for interleaving group of memory instructions. 1632 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1633 1634 /// The cost computation for Gather/Scatter instruction. 1635 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1636 1637 /// The cost computation for widening instruction \p I with consecutive 1638 /// memory access. 1639 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1640 1641 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1642 /// Load: scalar load + broadcast. 
1643 /// Store: scalar store + (loop invariant value stored ? 0 : extract of last 1644 /// element) 1645 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1646 1647 /// Estimate the overhead of scalarizing an instruction. This is a 1648 /// convenience wrapper for the type-based getScalarizationOverhead API. 1649 InstructionCost getScalarizationOverhead(Instruction *I, 1650 ElementCount VF) const; 1651 1652 /// Returns whether the instruction is a load or store and will be emitted 1653 /// as a vector operation. 1654 bool isConsecutiveLoadOrStore(Instruction *I); 1655 1656 /// Returns true if an artificially high cost for emulated masked memrefs 1657 /// should be used. 1658 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); 1659 1660 /// Map of scalar integer values to the smallest bitwidth they can be legally 1661 /// represented as. The vector equivalents of these values should be truncated 1662 /// to this type. 1663 MapVector<Instruction *, uint64_t> MinBWs; 1664 1665 /// A type representing the costs for instructions if they were to be 1666 /// scalarized rather than vectorized. The entries are Instruction-Cost 1667 /// pairs. 1668 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; 1669 1670 /// A set containing all BasicBlocks that are known to be present after 1671 /// vectorization as a predicated block. 1672 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; 1673 1674 /// Records whether it is allowed to have the original scalar loop execute at 1675 /// least once. This may be needed as a fallback loop in case runtime 1676 /// aliasing/dependence checks fail, or to handle the tail/remainder 1677 /// iterations when the trip count is unknown or doesn't divide by the VF, 1678 /// or as a peel-loop to handle gaps in interleave-groups. 1679 /// Under optsize and when the trip count is very small we don't allow any 1680 /// iterations to execute in the scalar loop. 1681 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1682 1683 /// All blocks of the loop are to be masked to fold the tail of the scalar 1684 /// iterations. bool FoldTailByMasking = false; 1685 1686 /// A map holding scalar costs for different vectorization factors. The 1687 /// presence of a cost for an instruction in the mapping indicates that the 1688 /// instruction will be scalarized when vectorizing with the associated 1689 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1690 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1691 1692 /// Holds the instructions known to be uniform after vectorization. 1693 /// The data is collected per VF. 1694 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1695 1696 /// Holds the instructions known to be scalar after vectorization. 1697 /// The data is collected per VF. 1698 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1699 1700 /// Holds the instructions (address computations) that are forced to be 1701 /// scalarized. 1702 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1703 1704 /// PHINodes of the reductions that should be expanded in-loop along with 1705 /// their associated chains of reduction operations, in program order from top 1706 /// (PHI) to bottom. 1707 ReductionChainMap InLoopReductionChains; 1708 1709 /// A Map of inloop reduction operations and their immediate chain operand. 1710 /// FIXME: This can be removed once reductions can be costed correctly in 1711 /// vplan.
This was added to allow quick lookup to the inloop operations, 1712 /// without having to loop through InLoopReductionChains. 1713 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1714 1715 /// Returns the expected difference in cost from scalarizing the expression 1716 /// feeding a predicated instruction \p PredInst. The instructions to 1717 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1718 /// non-negative return value implies the expression will be scalarized. 1719 /// Currently, only single-use chains are considered for scalarization. 1720 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1721 ElementCount VF); 1722 1723 /// Collect the instructions that are uniform after vectorization. An 1724 /// instruction is uniform if we represent it with a single scalar value in 1725 /// the vectorized loop corresponding to each vector iteration. Examples of 1726 /// uniform instructions include pointer operands of consecutive or 1727 /// interleaved memory accesses. Note that although uniformity implies an 1728 /// instruction will be scalar, the reverse is not true. In general, a 1729 /// scalarized instruction will be represented by VF scalar values in the 1730 /// vectorized loop, each corresponding to an iteration of the original 1731 /// scalar loop. 1732 void collectLoopUniforms(ElementCount VF); 1733 1734 /// Collect the instructions that are scalar after vectorization. An 1735 /// instruction is scalar if it is known to be uniform or will be scalarized 1736 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1737 /// to the list if they are used by a load/store instruction that is marked as 1738 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1739 /// VF values in the vectorized loop, each corresponding to an iteration of 1740 /// the original scalar loop. 1741 void collectLoopScalars(ElementCount VF); 1742 1743 /// Keeps cost model vectorization decision and cost for instructions. 1744 /// Right now it is used for memory instructions only. 1745 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1746 std::pair<InstWidening, InstructionCost>>; 1747 1748 DecisionList WideningDecisions; 1749 1750 /// Returns true if \p V is expected to be vectorized and it needs to be 1751 /// extracted. 1752 bool needsExtract(Value *V, ElementCount VF) const { 1753 Instruction *I = dyn_cast<Instruction>(V); 1754 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1755 TheLoop->isLoopInvariant(I)) 1756 return false; 1757 1758 // Assume we can vectorize V (and hence we need extraction) if the 1759 // scalars are not computed yet. This can happen, because it is called 1760 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1761 // the scalars are collected. That should be a safe assumption in most 1762 // cases, because we check if the operands have vectorizable types 1763 // beforehand in LoopVectorizationLegality. 1764 return Scalars.find(VF) == Scalars.end() || 1765 !isScalarAfterVectorization(I, VF); 1766 }; 1767 1768 /// Returns a range containing only operands needing to be extracted. 
1769 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1770 ElementCount VF) const { 1771 return SmallVector<Value *, 4>(make_filter_range( 1772 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1773 } 1774 1775 /// Determines if we have the infrastructure to vectorize loop \p L and its 1776 /// epilogue, assuming the main loop is vectorized by \p VF. 1777 bool isCandidateForEpilogueVectorization(const Loop &L, 1778 const ElementCount VF) const; 1779 1780 /// Returns true if epilogue vectorization is considered profitable, and 1781 /// false otherwise. 1782 /// \p VF is the vectorization factor chosen for the original loop. 1783 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1784 1785 public: 1786 /// The loop that we evaluate. 1787 Loop *TheLoop; 1788 1789 /// Predicated scalar evolution analysis. 1790 PredicatedScalarEvolution &PSE; 1791 1792 /// Loop Info analysis. 1793 LoopInfo *LI; 1794 1795 /// Vectorization legality. 1796 LoopVectorizationLegality *Legal; 1797 1798 /// Vector target information. 1799 const TargetTransformInfo &TTI; 1800 1801 /// Target Library Info. 1802 const TargetLibraryInfo *TLI; 1803 1804 /// Demanded bits analysis. 1805 DemandedBits *DB; 1806 1807 /// Assumption cache. 1808 AssumptionCache *AC; 1809 1810 /// Interface to emit optimization remarks. 1811 OptimizationRemarkEmitter *ORE; 1812 1813 const Function *TheFunction; 1814 1815 /// Loop Vectorize Hint. 1816 const LoopVectorizeHints *Hints; 1817 1818 /// The interleave access information contains groups of interleaved accesses 1819 /// with the same stride and close to each other. 1820 InterleavedAccessInfo &InterleaveInfo; 1821 1822 /// Values to ignore in the cost model. 1823 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1824 1825 /// Values to ignore in the cost model when VF > 1. 1826 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1827 1828 /// All element types found in the loop. 1829 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1830 1831 /// Profitable vector factors. 1832 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1833 }; 1834 } // end namespace llvm 1835 1836 /// Helper struct to manage generating runtime checks for vectorization. 1837 /// 1838 /// The runtime checks are created up-front in temporary blocks to allow better 1839 /// estimating the cost and un-linked from the existing IR. After deciding to 1840 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1841 /// temporary blocks are completely removed. 1842 class GeneratedRTChecks { 1843 /// Basic block which contains the generated SCEV checks, if any. 1844 BasicBlock *SCEVCheckBlock = nullptr; 1845 1846 /// The value representing the result of the generated SCEV checks. If it is 1847 /// nullptr, either no SCEV checks have been generated or they have been used. 1848 Value *SCEVCheckCond = nullptr; 1849 1850 /// Basic block which contains the generated memory runtime checks, if any. 1851 BasicBlock *MemCheckBlock = nullptr; 1852 1853 /// The value representing the result of the generated memory runtime checks. 1854 /// If it is nullptr, either no memory runtime checks have been generated or 1855 /// they have been used. 
1856 Value *MemRuntimeCheckCond = nullptr; 1857 1858 DominatorTree *DT; 1859 LoopInfo *LI; 1860 TargetTransformInfo *TTI; 1861 1862 SCEVExpander SCEVExp; 1863 SCEVExpander MemCheckExp; 1864 1865 bool CostTooHigh = false; 1866 1867 public: 1868 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1869 TargetTransformInfo *TTI, const DataLayout &DL) 1870 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"), 1871 MemCheckExp(SE, DL, "scev.check") {} 1872 1873 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1874 /// accurately estimate the cost of the runtime checks. The blocks are 1875 /// un-linked from the IR and are added back during vector code generation. If 1876 /// there is no vector code generation, the check blocks are removed 1877 /// completely. 1878 void Create(Loop *L, const LoopAccessInfo &LAI, 1879 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { 1880 1881 // Hard cutoff to limit compile-time increase in case a very large number of 1882 // runtime checks need to be generated. 1883 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to 1884 // profile info. 1885 CostTooHigh = 1886 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; 1887 if (CostTooHigh) 1888 return; 1889 1890 BasicBlock *LoopHeader = L->getHeader(); 1891 BasicBlock *Preheader = L->getLoopPreheader(); 1892 1893 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1894 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1895 // may be used by SCEVExpander. The blocks will be un-linked from their 1896 // predecessors and removed from LI & DT at the end of the function. 1897 if (!UnionPred.isAlwaysTrue()) { 1898 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1899 nullptr, "vector.scevcheck"); 1900 1901 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1902 &UnionPred, SCEVCheckBlock->getTerminator()); 1903 } 1904 1905 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1906 if (RtPtrChecking.Need) { 1907 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1908 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1909 "vector.memcheck"); 1910 1911 auto DiffChecks = RtPtrChecking.getDiffChecks(); 1912 if (DiffChecks) { 1913 MemRuntimeCheckCond = addDiffRuntimeChecks( 1914 MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp, 1915 [VF](IRBuilderBase &B, unsigned Bits) { 1916 return getRuntimeVF(B, B.getIntNTy(Bits), VF); 1917 }, 1918 IC); 1919 } else { 1920 MemRuntimeCheckCond = 1921 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1922 RtPtrChecking.getChecks(), MemCheckExp); 1923 } 1924 assert(MemRuntimeCheckCond && 1925 "no RT checks generated although RtPtrChecking " 1926 "claimed checks are required"); 1927 } 1928 1929 if (!MemCheckBlock && !SCEVCheckBlock) 1930 return; 1931 1932 // Unhook the temporary blocks with the checks, update various places 1933 // accordingly.
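// The check blocks get temporary unreachable terminators and are dropped from the dominator tree and LoopInfo; emitSCEVChecks/emitMemRuntimeChecks re-link them if we decide to vectorize.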
1934 if (SCEVCheckBlock) 1935 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1936 if (MemCheckBlock) 1937 MemCheckBlock->replaceAllUsesWith(Preheader); 1938 1939 if (SCEVCheckBlock) { 1940 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1941 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1942 Preheader->getTerminator()->eraseFromParent(); 1943 } 1944 if (MemCheckBlock) { 1945 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1946 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1947 Preheader->getTerminator()->eraseFromParent(); 1948 } 1949 1950 DT->changeImmediateDominator(LoopHeader, Preheader); 1951 if (MemCheckBlock) { 1952 DT->eraseNode(MemCheckBlock); 1953 LI->removeBlock(MemCheckBlock); 1954 } 1955 if (SCEVCheckBlock) { 1956 DT->eraseNode(SCEVCheckBlock); 1957 LI->removeBlock(SCEVCheckBlock); 1958 } 1959 } 1960 1961 InstructionCost getCost() { 1962 if (SCEVCheckBlock || MemCheckBlock) 1963 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n"); 1964 1965 if (CostTooHigh) { 1966 InstructionCost Cost; 1967 Cost.setInvalid(); 1968 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n"); 1969 return Cost; 1970 } 1971 1972 InstructionCost RTCheckCost = 0; 1973 if (SCEVCheckBlock) 1974 for (Instruction &I : *SCEVCheckBlock) { 1975 if (SCEVCheckBlock->getTerminator() == &I) 1976 continue; 1977 InstructionCost C = 1978 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 1979 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 1980 RTCheckCost += C; 1981 } 1982 if (MemCheckBlock) 1983 for (Instruction &I : *MemCheckBlock) { 1984 if (MemCheckBlock->getTerminator() == &I) 1985 continue; 1986 InstructionCost C = 1987 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 1988 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 1989 RTCheckCost += C; 1990 } 1991 1992 if (SCEVCheckBlock || MemCheckBlock) 1993 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost 1994 << "\n"); 1995 1996 return RTCheckCost; 1997 } 1998 1999 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2000 /// unused. 2001 ~GeneratedRTChecks() { 2002 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2003 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2004 if (!SCEVCheckCond) 2005 SCEVCleaner.markResultUsed(); 2006 2007 if (!MemRuntimeCheckCond) 2008 MemCheckCleaner.markResultUsed(); 2009 2010 if (MemRuntimeCheckCond) { 2011 auto &SE = *MemCheckExp.getSE(); 2012 // Memory runtime check generation creates compares that use expanded 2013 // values. Remove them before running the SCEVExpanderCleaners. 2014 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2015 if (MemCheckExp.isInsertedInstruction(&I)) 2016 continue; 2017 SE.forgetValue(&I); 2018 I.eraseFromParent(); 2019 } 2020 } 2021 MemCheckCleaner.cleanup(); 2022 SCEVCleaner.cleanup(); 2023 2024 if (SCEVCheckCond) 2025 SCEVCheckBlock->eraseFromParent(); 2026 if (MemRuntimeCheckCond) 2027 MemCheckBlock->eraseFromParent(); 2028 } 2029 2030 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2031 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2032 /// depending on the generated condition. 2033 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2034 BasicBlock *LoopVectorPreHeader, 2035 BasicBlock *LoopExitBlock) { 2036 if (!SCEVCheckCond) 2037 return nullptr; 2038 2039 Value *Cond = SCEVCheckCond; 2040 // Mark the check as used, to prevent it from being removed during cleanup. 
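// (The destructor only erases SCEVCheckBlock while SCEVCheckCond is still set, so clearing it here keeps the now-emitted block alive.)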
2041 SCEVCheckCond = nullptr; 2042 if (auto *C = dyn_cast<ConstantInt>(Cond)) 2043 if (C->isZero()) 2044 return nullptr; 2045 2046 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2047 2048 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2049 // Create new preheader for vector loop. 2050 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2051 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2052 2053 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2054 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2055 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2056 SCEVCheckBlock); 2057 2058 DT->addNewBlock(SCEVCheckBlock, Pred); 2059 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2060 2061 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), 2062 BranchInst::Create(Bypass, LoopVectorPreHeader, Cond)); 2063 return SCEVCheckBlock; 2064 } 2065 2066 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2067 /// the branches to branch to the vector preheader or \p Bypass, depending on 2068 /// the generated condition. 2069 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2070 BasicBlock *LoopVectorPreHeader) { 2071 // Check if we generated code that checks in runtime if arrays overlap. 2072 if (!MemRuntimeCheckCond) 2073 return nullptr; 2074 2075 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2076 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2077 MemCheckBlock); 2078 2079 DT->addNewBlock(MemCheckBlock, Pred); 2080 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2081 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2082 2083 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2084 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2085 2086 ReplaceInstWithInst( 2087 MemCheckBlock->getTerminator(), 2088 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2089 MemCheckBlock->getTerminator()->setDebugLoc( 2090 Pred->getTerminator()->getDebugLoc()); 2091 2092 // Mark the check as used, to prevent it from being removed during cleanup. 2093 MemRuntimeCheckCond = nullptr; 2094 return MemCheckBlock; 2095 } 2096 }; 2097 2098 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2099 // vectorization. The loop needs to be annotated with #pragma omp simd 2100 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2101 // vector length information is not provided, vectorization is not considered 2102 // explicit. Interleave hints are not allowed either. These limitations will be 2103 // relaxed in the future. 2104 // Please, note that we are currently forced to abuse the pragma 'clang 2105 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2106 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2107 // provides *explicit vectorization hints* (LV can bypass legal checks and 2108 // assume that vectorization is legal). However, both hints are implemented 2109 // using the same metadata (llvm.loop.vectorize, processed by 2110 // LoopVectorizeHints). This will be fixed in the future when the native IR 2111 // representation for pragma 'omp simd' is introduced. 2112 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2113 OptimizationRemarkEmitter *ORE) { 2114 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2115 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2116 2117 // Only outer loops with an explicit vectorization hint are supported. 
2118 // Unannotated outer loops are ignored. 2119 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2120 return false; 2121 2122 Function *Fn = OuterLp->getHeader()->getParent(); 2123 if (!Hints.allowVectorization(Fn, OuterLp, 2124 true /*VectorizeOnlyWhenForced*/)) { 2125 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2126 return false; 2127 } 2128 2129 if (Hints.getInterleave() > 1) { 2130 // TODO: Interleave support is future work. 2131 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2132 "outer loops.\n"); 2133 Hints.emitRemarkWithHints(); 2134 return false; 2135 } 2136 2137 return true; 2138 } 2139 2140 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2141 OptimizationRemarkEmitter *ORE, 2142 SmallVectorImpl<Loop *> &V) { 2143 // Collect inner loops and outer loops without irreducible control flow. For 2144 // now, only collect outer loops that have explicit vectorization hints. If we 2145 // are stress testing the VPlan H-CFG construction, we collect the outermost 2146 // loop of every loop nest. 2147 if (L.isInnermost() || VPlanBuildStressTest || 2148 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2149 LoopBlocksRPO RPOT(&L); 2150 RPOT.perform(LI); 2151 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2152 V.push_back(&L); 2153 // TODO: Collect inner loops inside marked outer loops in case 2154 // vectorization fails for the outer loop. Do not invoke 2155 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2156 // already known to be reducible. We can use an inherited attribute for 2157 // that. 2158 return; 2159 } 2160 } 2161 for (Loop *InnerL : L) 2162 collectSupportedLoops(*InnerL, LI, ORE, V); 2163 } 2164 2165 namespace { 2166 2167 /// The LoopVectorize Pass. 2168 struct LoopVectorize : public FunctionPass { 2169 /// Pass identification, replacement for typeid 2170 static char ID; 2171 2172 LoopVectorizePass Impl; 2173 2174 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2175 bool VectorizeOnlyWhenForced = false) 2176 : FunctionPass(ID), 2177 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2178 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2179 } 2180 2181 bool runOnFunction(Function &F) override { 2182 if (skipFunction(F)) 2183 return false; 2184 2185 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2186 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2187 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2188 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2189 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2190 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2191 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2192 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2193 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2194 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2195 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2196 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2197 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2198 2199 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2200 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2201 2202 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2203 GetLAA, *ORE, PSI).MadeAnyChange; 2204 } 2205 2206 void getAnalysisUsage(AnalysisUsage &AU) const override { 2207 AU.addRequired<AssumptionCacheTracker>(); 2208 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2209 AU.addRequired<DominatorTreeWrapperPass>(); 2210 AU.addRequired<LoopInfoWrapperPass>(); 2211 AU.addRequired<ScalarEvolutionWrapperPass>(); 2212 AU.addRequired<TargetTransformInfoWrapperPass>(); 2213 AU.addRequired<AAResultsWrapperPass>(); 2214 AU.addRequired<LoopAccessLegacyAnalysis>(); 2215 AU.addRequired<DemandedBitsWrapperPass>(); 2216 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2217 AU.addRequired<InjectTLIMappingsLegacy>(); 2218 2219 // We currently do not preserve loopinfo/dominator analyses with outer loop 2220 // vectorization. Until this is addressed, mark these analyses as preserved 2221 // only for non-VPlan-native path. 2222 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2223 if (!EnableVPlanNativePath) { 2224 AU.addPreserved<LoopInfoWrapperPass>(); 2225 AU.addPreserved<DominatorTreeWrapperPass>(); 2226 } 2227 2228 AU.addPreserved<BasicAAWrapperPass>(); 2229 AU.addPreserved<GlobalsAAWrapperPass>(); 2230 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2231 } 2232 }; 2233 2234 } // end anonymous namespace 2235 2236 //===----------------------------------------------------------------------===// 2237 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2238 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2239 //===----------------------------------------------------------------------===// 2240 2241 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2242 // We need to place the broadcast of invariant variables outside the loop, 2243 // but only if it's proven safe to do so. Else, broadcast will be inside 2244 // vector loop body. 2245 Instruction *Instr = dyn_cast<Instruction>(V); 2246 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2247 (!Instr || 2248 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2249 // Place the code for broadcasting invariant variables in the new preheader. 2250 IRBuilder<>::InsertPointGuard Guard(Builder); 2251 if (SafeToHoist) 2252 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2253 2254 // Broadcast the scalar into all locations in the vector. 2255 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2256 2257 return Shuf; 2258 } 2259 2260 /// This function adds 2261 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2262 /// to each vector element of Val. The sequence starts at StartIndex. 2263 /// \p Opcode is relevant for FP induction variable. 
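/// For example, with StartIdx = 0, Step = 2 and a 4-element Val this adds <0, 2, 4, 6> to Val.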
2264 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2265 Instruction::BinaryOps BinOp, ElementCount VF, 2266 IRBuilderBase &Builder) { 2267 assert(VF.isVector() && "only vector VFs are supported"); 2268 2269 // Create and check the types. 2270 auto *ValVTy = cast<VectorType>(Val->getType()); 2271 ElementCount VLen = ValVTy->getElementCount(); 2272 2273 Type *STy = Val->getType()->getScalarType(); 2274 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2275 "Induction Step must be an integer or FP"); 2276 assert(Step->getType() == STy && "Step has wrong type"); 2277 2278 SmallVector<Constant *, 8> Indices; 2279 2280 // Create a vector of consecutive numbers from zero to VF. 2281 VectorType *InitVecValVTy = ValVTy; 2282 if (STy->isFloatingPointTy()) { 2283 Type *InitVecValSTy = 2284 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2285 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2286 } 2287 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2288 2289 // Splat the StartIdx 2290 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2291 2292 if (STy->isIntegerTy()) { 2293 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2294 Step = Builder.CreateVectorSplat(VLen, Step); 2295 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2296 // FIXME: The newly created binary instructions should contain nsw/nuw 2297 // flags, which can be found from the original scalar operations. 2298 Step = Builder.CreateMul(InitVec, Step); 2299 return Builder.CreateAdd(Val, Step, "induction"); 2300 } 2301 2302 // Floating point induction. 2303 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2304 "Binary Opcode should be specified for FP induction"); 2305 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2306 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2307 2308 Step = Builder.CreateVectorSplat(VLen, Step); 2309 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2310 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2311 } 2312 2313 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 2314 /// variable on which to base the steps, \p Step is the size of the step. 2315 static void buildScalarSteps(Value *ScalarIV, Value *Step, 2316 const InductionDescriptor &ID, VPValue *Def, 2317 VPTransformState &State) { 2318 IRBuilderBase &Builder = State.Builder; 2319 // We shouldn't have to build scalar steps if we aren't vectorizing. 2320 assert(State.VF.isVector() && "VF should be greater than one"); 2321 // Get the value type and ensure it and the step have the same integer type. 2322 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2323 assert(ScalarIVTy == Step->getType() && 2324 "Val and Step should have the same type"); 2325 2326 // We build scalar steps for both integer and floating-point induction 2327 // variables. Here, we determine the kind of arithmetic we will perform. 2328 Instruction::BinaryOps AddOp; 2329 Instruction::BinaryOps MulOp; 2330 if (ScalarIVTy->isIntegerTy()) { 2331 AddOp = Instruction::Add; 2332 MulOp = Instruction::Mul; 2333 } else { 2334 AddOp = ID.getInductionOpcode(); 2335 MulOp = Instruction::FMul; 2336 } 2337 2338 // Determine the number of scalars we need to generate for each unroll 2339 // iteration. 2340 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); 2341 unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 2342 // Compute the scalar steps and save the results in State. 
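// Conceptually, lane L of unroll part P receives ScalarIV + (P * VF + L) * Step (using the induction's FAdd/FSub and FMul opcodes for FP inductions).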
2343 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2344 ScalarIVTy->getScalarSizeInBits()); 2345 Type *VecIVTy = nullptr; 2346 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2347 if (!FirstLaneOnly && State.VF.isScalable()) { 2348 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2349 UnitStepVec = 2350 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2351 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2352 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2353 } 2354 2355 for (unsigned Part = 0; Part < State.UF; ++Part) { 2356 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2357 2358 if (!FirstLaneOnly && State.VF.isScalable()) { 2359 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2360 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2361 if (ScalarIVTy->isFloatingPointTy()) 2362 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2363 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2364 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2365 State.set(Def, Add, Part); 2366 // It's useful to record the lane values too for the known minimum number 2367 // of elements so we do those below. This improves the code quality when 2368 // trying to extract the first element, for example. 2369 } 2370 2371 if (ScalarIVTy->isFloatingPointTy()) 2372 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2373 2374 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2375 Value *StartIdx = Builder.CreateBinOp( 2376 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2377 // The step returned by `createStepForVF` is a runtime-evaluated value 2378 // when VF is scalable. Otherwise, it should be folded into a Constant. 2379 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2380 "Expected StartIdx to be folded to a constant when VF is not " 2381 "scalable"); 2382 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2383 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2384 State.set(Def, Add, VPIteration(Part, Lane)); 2385 } 2386 } 2387 } 2388 2389 // Generate code for the induction step. Note that induction steps are 2390 // required to be loop-invariant 2391 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, 2392 Instruction *InsertBefore, 2393 Loop *OrigLoop = nullptr) { 2394 const DataLayout &DL = SE.getDataLayout(); 2395 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && 2396 "Induction step should be loop invariant"); 2397 if (auto *E = dyn_cast<SCEVUnknown>(Step)) 2398 return E->getValue(); 2399 2400 SCEVExpander Exp(SE, DL, "induction"); 2401 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); 2402 } 2403 2404 /// Compute the transformed value of Index at offset StartValue using step 2405 /// StepValue. 2406 /// For integer induction, returns StartValue + Index * StepValue. 2407 /// For pointer induction, returns StartValue[Index * StepValue]. 2408 /// FIXME: The newly created binary instructions should contain nsw/nuw 2409 /// flags, which can be found from the original scalar operations. 2410 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, 2411 Value *StartValue, Value *Step, 2412 const InductionDescriptor &ID) { 2413 assert(Index->getType()->getScalarType() == Step->getType() && 2414 "Index scalar type does not match StepValue type"); 2415 2416 // Note: the IR at this point is broken. 
We cannot use SE to create any new 2417 // SCEV and then expand it, hoping that SCEV's simplification will give us 2418 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2419 // lead to various SCEV crashes. So all we can do is to use builder and rely 2420 // on InstCombine for future simplifications. Here we handle some trivial 2421 // cases only. 2422 auto CreateAdd = [&B](Value *X, Value *Y) { 2423 assert(X->getType() == Y->getType() && "Types don't match!"); 2424 if (auto *CX = dyn_cast<ConstantInt>(X)) 2425 if (CX->isZero()) 2426 return Y; 2427 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2428 if (CY->isZero()) 2429 return X; 2430 return B.CreateAdd(X, Y); 2431 }; 2432 2433 // We allow X to be a vector type, in which case Y will potentially be 2434 // splatted into a vector with the same element count. 2435 auto CreateMul = [&B](Value *X, Value *Y) { 2436 assert(X->getType()->getScalarType() == Y->getType() && 2437 "Types don't match!"); 2438 if (auto *CX = dyn_cast<ConstantInt>(X)) 2439 if (CX->isOne()) 2440 return Y; 2441 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2442 if (CY->isOne()) 2443 return X; 2444 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2445 if (XVTy && !isa<VectorType>(Y->getType())) 2446 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2447 return B.CreateMul(X, Y); 2448 }; 2449 2450 switch (ID.getKind()) { 2451 case InductionDescriptor::IK_IntInduction: { 2452 assert(!isa<VectorType>(Index->getType()) && 2453 "Vector indices not supported for integer inductions yet"); 2454 assert(Index->getType() == StartValue->getType() && 2455 "Index type does not match StartValue type"); 2456 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2457 return B.CreateSub(StartValue, Index); 2458 auto *Offset = CreateMul(Index, Step); 2459 return CreateAdd(StartValue, Offset); 2460 } 2461 case InductionDescriptor::IK_PtrInduction: { 2462 assert(isa<Constant>(Step) && 2463 "Expected constant step for pointer induction"); 2464 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); 2465 } 2466 case InductionDescriptor::IK_FpInduction: { 2467 assert(!isa<VectorType>(Index->getType()) && 2468 "Vector indices not supported for FP inductions yet"); 2469 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2470 auto InductionBinOp = ID.getInductionBinOp(); 2471 assert(InductionBinOp && 2472 (InductionBinOp->getOpcode() == Instruction::FAdd || 2473 InductionBinOp->getOpcode() == Instruction::FSub) && 2474 "Original bin op should be defined for FP induction"); 2475 2476 Value *MulExp = B.CreateFMul(Step, Index); 2477 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2478 "induction"); 2479 } 2480 case InductionDescriptor::IK_NoInduction: 2481 return nullptr; 2482 } 2483 llvm_unreachable("invalid enum"); 2484 } 2485 2486 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2487 const VPIteration &Instance, 2488 VPTransformState &State) { 2489 Value *ScalarInst = State.get(Def, Instance); 2490 Value *VectorValue = State.get(Def, Instance.Part); 2491 VectorValue = Builder.CreateInsertElement( 2492 VectorValue, ScalarInst, 2493 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2494 State.set(Def, VectorValue, Instance.Part); 2495 } 2496 2497 // Return whether we allow using masked interleave-groups (for dealing with 2498 // strided loads/stores that reside in predicated blocks, or for dealing 2499 // with gaps). 
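// A gap is a member slot the loop never accesses, e.g. a group that reads only R and B out of interleaved R,G,B data.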
2500 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2501 // If an override option has been passed in for interleaved accesses, use it. 2502 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2503 return EnableMaskedInterleavedMemAccesses; 2504 2505 return TTI.enableMaskedInterleavedAccessVectorization(); 2506 } 2507 2508 // Try to vectorize the interleave group that \p Instr belongs to. 2509 // 2510 // E.g. Translate following interleaved load group (factor = 3): 2511 // for (i = 0; i < N; i+=3) { 2512 // R = Pic[i]; // Member of index 0 2513 // G = Pic[i+1]; // Member of index 1 2514 // B = Pic[i+2]; // Member of index 2 2515 // ... // do something to R, G, B 2516 // } 2517 // To: 2518 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2519 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2520 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2521 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2522 // 2523 // Or translate following interleaved store group (factor = 3): 2524 // for (i = 0; i < N; i+=3) { 2525 // ... do something to R, G, B 2526 // Pic[i] = R; // Member of index 0 2527 // Pic[i+1] = G; // Member of index 1 2528 // Pic[i+2] = B; // Member of index 2 2529 // } 2530 // To: 2531 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2532 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2533 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2534 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2535 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2536 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2537 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2538 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2539 VPValue *BlockInMask) { 2540 Instruction *Instr = Group->getInsertPos(); 2541 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2542 2543 // Prepare for the vector type of the interleaved load/store. 2544 Type *ScalarTy = getLoadStoreType(Instr); 2545 unsigned InterleaveFactor = Group->getFactor(); 2546 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2547 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2548 2549 // Prepare for the new pointers. 2550 SmallVector<Value *, 2> AddrParts; 2551 unsigned Index = Group->getIndex(Instr); 2552 2553 // TODO: extend the masked interleaved-group support to reversed access. 2554 assert((!BlockInMask || !Group->isReverse()) && 2555 "Reversed masked interleave-group not supported."); 2556 2557 // If the group is reverse, adjust the index to refer to the last vector lane 2558 // instead of the first. We adjust the index from the first vector lane, 2559 // rather than directly getting the pointer for lane VF - 1, because the 2560 // pointer operand of the interleaved access is supposed to be uniform. For 2561 // uniform instructions, we're only required to generate a value for the 2562 // first vector lane in each unroll iteration. 2563 if (Group->isReverse()) 2564 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2565 2566 for (unsigned Part = 0; Part < UF; Part++) { 2567 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2568 State.setDebugLocFromInst(AddrPart); 2569 2570 // Notice current instruction could be any index. Need to adjust the address 2571 // to the member of index 0. 2572 // 2573 // E.g. 
a = A[i+1]; // Member of index 1 (Current instruction) 2574 // b = A[i]; // Member of index 0 2575 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2576 // 2577 // E.g. A[i+1] = a; // Member of index 1 2578 // A[i] = b; // Member of index 0 2579 // A[i+2] = c; // Member of index 2 (Current instruction) 2580 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2581 2582 bool InBounds = false; 2583 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2584 InBounds = gep->isInBounds(); 2585 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2586 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2587 2588 // Cast to the vector pointer type. 2589 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2590 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2591 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2592 } 2593 2594 State.setDebugLocFromInst(Instr); 2595 Value *PoisonVec = PoisonValue::get(VecTy); 2596 2597 Value *MaskForGaps = nullptr; 2598 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2599 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2600 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2601 } 2602 2603 // Vectorize the interleaved load group. 2604 if (isa<LoadInst>(Instr)) { 2605 // For each unroll part, create a wide load for the group. 2606 SmallVector<Value *, 2> NewLoads; 2607 for (unsigned Part = 0; Part < UF; Part++) { 2608 Instruction *NewLoad; 2609 if (BlockInMask || MaskForGaps) { 2610 assert(useMaskedInterleavedAccesses(*TTI) && 2611 "masked interleaved groups are not allowed."); 2612 Value *GroupMask = MaskForGaps; 2613 if (BlockInMask) { 2614 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2615 Value *ShuffledMask = Builder.CreateShuffleVector( 2616 BlockInMaskPart, 2617 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2618 "interleaved.mask"); 2619 GroupMask = MaskForGaps 2620 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2621 MaskForGaps) 2622 : ShuffledMask; 2623 } 2624 NewLoad = 2625 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2626 GroupMask, PoisonVec, "wide.masked.vec"); 2627 } 2628 else 2629 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2630 Group->getAlign(), "wide.vec"); 2631 Group->addMetadata(NewLoad); 2632 NewLoads.push_back(NewLoad); 2633 } 2634 2635 // For each member in the group, shuffle out the appropriate data from the 2636 // wide loads. 2637 unsigned J = 0; 2638 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2639 Instruction *Member = Group->getMember(I); 2640 2641 // Skip the gaps in the group. 2642 if (!Member) 2643 continue; 2644 2645 auto StrideMask = 2646 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2647 for (unsigned Part = 0; Part < UF; Part++) { 2648 Value *StridedVec = Builder.CreateShuffleVector( 2649 NewLoads[Part], StrideMask, "strided.vec"); 2650 2651 // If this member has different type, cast the result type. 
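// This can happen, e.g., when a group interleaves i32 and float members of the same size.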
2652 if (Member->getType() != ScalarTy) { 2653 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2654 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2655 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2656 } 2657 2658 if (Group->isReverse()) 2659 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2660 2661 State.set(VPDefs[J], StridedVec, Part); 2662 } 2663 ++J; 2664 } 2665 return; 2666 } 2667 2668 // The sub vector type for current instruction. 2669 auto *SubVT = VectorType::get(ScalarTy, VF); 2670 2671 // Vectorize the interleaved store group. 2672 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2673 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2674 "masked interleaved groups are not allowed."); 2675 assert((!MaskForGaps || !VF.isScalable()) && 2676 "masking gaps for scalable vectors is not yet supported."); 2677 for (unsigned Part = 0; Part < UF; Part++) { 2678 // Collect the stored vector from each member. 2679 SmallVector<Value *, 4> StoredVecs; 2680 for (unsigned i = 0; i < InterleaveFactor; i++) { 2681 assert((Group->getMember(i) || MaskForGaps) && 2682 "Fail to get a member from an interleaved store group"); 2683 Instruction *Member = Group->getMember(i); 2684 2685 // Skip the gaps in the group. 2686 if (!Member) { 2687 Value *Undef = PoisonValue::get(SubVT); 2688 StoredVecs.push_back(Undef); 2689 continue; 2690 } 2691 2692 Value *StoredVec = State.get(StoredValues[i], Part); 2693 2694 if (Group->isReverse()) 2695 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2696 2697 // If this member has different type, cast it to a unified type. 2698 2699 if (StoredVec->getType() != SubVT) 2700 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2701 2702 StoredVecs.push_back(StoredVec); 2703 } 2704 2705 // Concatenate all vectors into a wide vector. 2706 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2707 2708 // Interleave the elements in the wide vector. 2709 Value *IVec = Builder.CreateShuffleVector( 2710 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2711 "interleaved.vec"); 2712 2713 Instruction *NewStoreInstr; 2714 if (BlockInMask || MaskForGaps) { 2715 Value *GroupMask = MaskForGaps; 2716 if (BlockInMask) { 2717 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2718 Value *ShuffledMask = Builder.CreateShuffleVector( 2719 BlockInMaskPart, 2720 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2721 "interleaved.mask"); 2722 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2723 ShuffledMask, MaskForGaps) 2724 : ShuffledMask; 2725 } 2726 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2727 Group->getAlign(), GroupMask); 2728 } else 2729 NewStoreInstr = 2730 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2731 2732 Group->addMetadata(NewStoreInstr); 2733 } 2734 } 2735 2736 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2737 VPReplicateRecipe *RepRecipe, 2738 const VPIteration &Instance, 2739 bool IfPredicateInstr, 2740 VPTransformState &State) { 2741 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2742 2743 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2744 // the first lane and part. 2745 if (isa<NoAliasScopeDeclInst>(Instr)) 2746 if (!Instance.isFirstIteration()) 2747 return; 2748 2749 // Does this instruction return a value ? 
2750 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2751 2752 Instruction *Cloned = Instr->clone(); 2753 if (!IsVoidRetTy) 2754 Cloned->setName(Instr->getName() + ".cloned"); 2755 2756 // If the scalarized instruction contributes to the address computation of a 2757 // widen masked load/store which was in a basic block that needed predication 2758 // and is not predicated after vectorization, we can't propagate 2759 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2760 // instruction could feed a poison value to the base address of the widen 2761 // load/store. 2762 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 2763 Cloned->dropPoisonGeneratingFlags(); 2764 2765 if (Instr->getDebugLoc()) 2766 State.setDebugLocFromInst(Instr); 2767 2768 // Replace the operands of the cloned instructions with their scalar 2769 // equivalents in the new loop. 2770 for (auto &I : enumerate(RepRecipe->operands())) { 2771 auto InputInstance = Instance; 2772 VPValue *Operand = I.value(); 2773 VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand); 2774 if (OperandR && OperandR->isUniform()) 2775 InputInstance.Lane = VPLane::getFirstLane(); 2776 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2777 } 2778 State.addNewMetadata(Cloned, Instr); 2779 2780 // Place the cloned scalar in the new loop. 2781 State.Builder.Insert(Cloned); 2782 2783 State.set(RepRecipe, Cloned, Instance); 2784 2785 // If we just cloned a new assumption, add it the assumption cache. 2786 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2787 AC->registerAssumption(II); 2788 2789 // End if-block. 2790 if (IfPredicateInstr) 2791 PredicatedInstructions.push_back(Cloned); 2792 } 2793 2794 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) { 2795 if (TripCount) 2796 return TripCount; 2797 2798 assert(InsertBlock); 2799 IRBuilder<> Builder(InsertBlock->getTerminator()); 2800 // Find the loop boundaries. 2801 ScalarEvolution *SE = PSE.getSE(); 2802 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2803 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 2804 "Invalid loop count"); 2805 2806 Type *IdxTy = Legal->getWidestInductionType(); 2807 assert(IdxTy && "No type for induction"); 2808 2809 // The exit count might have the type of i64 while the phi is i32. This can 2810 // happen if we have an induction variable that is sign extended before the 2811 // compare. The only way that we get a backedge taken count is that the 2812 // induction variable was signed and as such will not overflow. In such a case 2813 // truncation is legal. 2814 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2815 IdxTy->getPrimitiveSizeInBits()) 2816 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2817 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2818 2819 // Get the total trip count from the count by adding 1. 2820 const SCEV *ExitCount = SE->getAddExpr( 2821 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2822 2823 const DataLayout &DL = InsertBlock->getModule()->getDataLayout(); 2824 2825 // Expand the trip count and place the new instructions in the preheader. 2826 // Notice that the pre-header does not change, only the loop body. 2827 SCEVExpander Exp(*SE, DL, "induction"); 2828 2829 // Count holds the overall loop count (N). 
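  // For example (illustrative): for a loop running i = 0 .. 99 the
  // backedge-taken count is 99 and the expanded trip count N is 100. The
  // expansion is emitted in InsertBlock and cached in TripCount so later
  // checks can reuse the same value.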
2830 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2831 InsertBlock->getTerminator()); 2832 2833 if (TripCount->getType()->isPointerTy()) 2834 TripCount = 2835 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2836 InsertBlock->getTerminator()); 2837 2838 return TripCount; 2839 } 2840 2841 Value * 2842 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2843 if (VectorTripCount) 2844 return VectorTripCount; 2845 2846 Value *TC = getOrCreateTripCount(InsertBlock); 2847 IRBuilder<> Builder(InsertBlock->getTerminator()); 2848 2849 Type *Ty = TC->getType(); 2850 // This is where we can make the step a runtime constant. 2851 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2852 2853 // If the tail is to be folded by masking, round the number of iterations N 2854 // up to a multiple of Step instead of rounding down. This is done by first 2855 // adding Step-1 and then rounding down. Note that it's ok if this addition 2856 // overflows: the vector induction variable will eventually wrap to zero given 2857 // that it starts at zero and its Step is a power of two; the loop will then 2858 // exit, with the last early-exit vector comparison also producing all-true. 2859 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2860 // is accounted for in emitIterationCountCheck that adds an overflow check. 2861 if (Cost->foldTailByMasking()) { 2862 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2863 "VF*UF must be a power of 2 when folding tail by masking"); 2864 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 2865 TC = Builder.CreateAdd( 2866 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 2867 } 2868 2869 // Now we need to generate the expression for the part of the loop that the 2870 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2871 // iterations are not required for correctness, or N - Step, otherwise. Step 2872 // is equal to the vectorization factor (number of SIMD elements) times the 2873 // unroll factor (number of SIMD instructions). 2874 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2875 2876 // There are cases where we *must* run at least one iteration in the remainder 2877 // loop. See the cost model for when this can happen. If the step evenly 2878 // divides the trip count, we set the remainder to be equal to the step. If 2879 // the step does not evenly divide the trip count, no adjustment is necessary 2880 // since there will already be scalar iterations. Note that the minimum 2881 // iterations check ensures that N >= Step. 2882 if (Cost->requiresScalarEpilogue(VF)) { 2883 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2884 R = Builder.CreateSelect(IsZero, Step, R); 2885 } 2886 2887 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2888 2889 return VectorTripCount; 2890 } 2891 2892 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2893 const DataLayout &DL) { 2894 // Verify that V is a vector type with same number of elements as DstVTy. 
2895 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 2896 unsigned VF = DstFVTy->getNumElements(); 2897 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 2898 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2899 Type *SrcElemTy = SrcVecTy->getElementType(); 2900 Type *DstElemTy = DstFVTy->getElementType(); 2901 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2902 "Vector elements must have same size"); 2903 2904 // Do a direct cast if element types are castable. 2905 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2906 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2907 } 2908 // V cannot be directly casted to desired vector type. 2909 // May happen when V is a floating point vector but DstVTy is a vector of 2910 // pointers or vice-versa. Handle this using a two-step bitcast using an 2911 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2912 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2913 "Only one type should be a pointer type"); 2914 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2915 "Only one type should be a floating point type"); 2916 Type *IntTy = 2917 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2918 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2919 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2920 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2921 } 2922 2923 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2924 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 2925 // Reuse existing vector loop preheader for TC checks. 2926 // Note that new preheader block is generated for vector loop. 2927 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2928 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2929 2930 // Generate code to check if the loop's trip count is less than VF * UF, or 2931 // equal to it in case a scalar epilogue is required; this implies that the 2932 // vector trip count is zero. This check also covers the case where adding one 2933 // to the backedge-taken count overflowed leading to an incorrect trip count 2934 // of zero. In this case we will also jump to the scalar loop. 2935 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 2936 : ICmpInst::ICMP_ULT; 2937 2938 // If tail is to be folded, vector loop takes care of all iterations. 2939 Type *CountTy = Count->getType(); 2940 Value *CheckMinIters = Builder.getFalse(); 2941 auto CreateStep = [&]() { 2942 // Create step with max(MinProTripCount, UF * VF). 2943 if (UF * VF.getKnownMinValue() < MinProfitableTripCount.getKnownMinValue()) 2944 return createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); 2945 return createStepForVF(Builder, CountTy, VF, UF); 2946 }; 2947 2948 if (!Cost->foldTailByMasking()) 2949 CheckMinIters = 2950 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); 2951 else if (VF.isScalable()) { 2952 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2953 // an overflow to zero when updating induction variables and so an 2954 // additional overflow check is required before entering the vector loop. 2955 2956 // Get the maximum unsigned value for the type. 2957 Value *MaxUIntTripCount = 2958 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 2959 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 2960 2961 // Don't execute the vector loop if (UMax - n) < (VF * UF). 
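    // Worked example (illustrative values): for an i8 trip count with n = 250
    // and VF * UF = 8, UMax - n = 5 < 8, so the check below is true and we
    // branch to the scalar loop; rounding n up to the next multiple of 8
    // (256) would have wrapped the induction variable past UMax = 255.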
2962 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); 2963 } 2964 2965 // Create new preheader for vector loop. 2966 LoopVectorPreHeader = 2967 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2968 "vector.ph"); 2969 2970 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2971 DT->getNode(Bypass)->getIDom()) && 2972 "TC check is expected to dominate Bypass"); 2973 2974 // Update dominator for Bypass & LoopExit (if needed). 2975 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2976 if (!Cost->requiresScalarEpilogue(VF)) 2977 // If there is an epilogue which must run, there's no edge from the 2978 // middle block to exit blocks and thus no need to update the immediate 2979 // dominator of the exit blocks. 2980 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2981 2982 ReplaceInstWithInst( 2983 TCCheckBlock->getTerminator(), 2984 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2985 LoopBypassBlocks.push_back(TCCheckBlock); 2986 } 2987 2988 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 2989 BasicBlock *const SCEVCheckBlock = 2990 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); 2991 if (!SCEVCheckBlock) 2992 return nullptr; 2993 2994 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 2995 (OptForSizeBasedOnProfile && 2996 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 2997 "Cannot SCEV check stride or overflow when optimizing for size"); 2998 2999 3000 // Update dominator only if this is first RT check. 3001 if (LoopBypassBlocks.empty()) { 3002 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3003 if (!Cost->requiresScalarEpilogue(VF)) 3004 // If there is an epilogue which must run, there's no edge from the 3005 // middle block to exit blocks and thus no need to update the immediate 3006 // dominator of the exit blocks. 3007 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3008 } 3009 3010 LoopBypassBlocks.push_back(SCEVCheckBlock); 3011 AddedSafetyChecks = true; 3012 return SCEVCheckBlock; 3013 } 3014 3015 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { 3016 // VPlan-native path does not do any analysis for runtime checks currently. 3017 if (EnableVPlanNativePath) 3018 return nullptr; 3019 3020 BasicBlock *const MemCheckBlock = 3021 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); 3022 3023 // Check if we generated code that checks in runtime if arrays overlap. We put 3024 // the checks into a separate block to make the more common case of few 3025 // elements faster. 
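  // For instance (illustrative), given
  //   void f(int *A, int *B, int N) { for (int i = 0; i < N; ++i) A[i] = B[i] + 1; }
  // the vector loop is only correct if [A, A+N) and [B, B+N) do not overlap,
  // so the generated check branches to the scalar loop when they might.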
  if (!MemCheckBlock)
    return nullptr;

  if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
    assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
           "Cannot emit memory checks when optimizing for size, unless forced "
           "to vectorize.");
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
                                        OrigLoop->getStartLoc(),
                                        OrigLoop->getHeader())
             << "Code-size may be reduced by not forcing "
                "vectorization, or by source-code modifications "
                "eliminating the need for runtime checks "
                "(e.g., adding 'restrict').";
    });
  }

  LoopBypassBlocks.push_back(MemCheckBlock);

  AddedSafetyChecks = true;

  return MemCheckBlock;
}

void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopScalarBody = OrigLoop->getHeader();
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  assert(LoopVectorPreHeader && "Invalid loop structure");
  LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
         "multiple exit loop without required epilogue?");

  LoopMiddleBlock =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, Twine(Prefix) + "middle.block");
  LoopScalarPreHeader =
      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                 nullptr, Twine(Prefix) + "scalar.ph");

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Set up the middle block terminator. Two cases:
  // 1) If we know that we must execute the scalar epilogue, emit an
  //    unconditional branch.
  // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case). In this case, set up a conditional
  //    branch from the middle block to the loop scalar preheader, and the
  //    exit block. completeLoopSkeleton will update the condition to use an
  //    iteration check, if required to decide whether to execute the
  //    remainder.
  BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
    BranchInst::Create(LoopScalarPreHeader) :
    BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
                       Builder.getTrue());
  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);

  // Update dominator for loop exit. During skeleton creation, only the vector
  // pre-header and the middle block are created. The vector loop is entirely
  // created during VPlan execution.
  if (!Cost->requiresScalarEpilogue(VF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
}

void InnerLoopVectorizer::createInductionResumeValues(
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
         "Inconsistent information about additional bypass.");

  Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
  assert(VectorTripCount && "Expected valid arguments");
  // We are going to resume the execution of the scalar loop.
3102 // Go over all of the induction variables that we found and fix the 3103 // PHIs that are left in the scalar version of the loop. 3104 // The starting values of PHI nodes depend on the counter of the last 3105 // iteration in the vectorized loop. 3106 // If we come from a bypass edge then we need to start from the original 3107 // start value. 3108 Instruction *OldInduction = Legal->getPrimaryInduction(); 3109 for (auto &InductionEntry : Legal->getInductionVars()) { 3110 PHINode *OrigPhi = InductionEntry.first; 3111 InductionDescriptor II = InductionEntry.second; 3112 3113 Value *&EndValue = IVEndValues[OrigPhi]; 3114 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3115 if (OrigPhi == OldInduction) { 3116 // We know what the end value is. 3117 EndValue = VectorTripCount; 3118 } else { 3119 IRBuilder<> B(LoopVectorPreHeader->getTerminator()); 3120 3121 // Fast-math-flags propagate from the original induction instruction. 3122 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3123 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3124 3125 Type *StepType = II.getStep()->getType(); 3126 Instruction::CastOps CastOp = 3127 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3128 Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc"); 3129 Value *Step = 3130 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3131 EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); 3132 EndValue->setName("ind.end"); 3133 3134 // Compute the end value for the additional bypass (if applicable). 3135 if (AdditionalBypass.first) { 3136 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3137 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3138 StepType, true); 3139 Value *Step = 3140 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3141 VTC = 3142 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc"); 3143 EndValueFromAdditionalBypass = 3144 emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); 3145 EndValueFromAdditionalBypass->setName("ind.end"); 3146 } 3147 } 3148 3149 // Create phi nodes to merge from the backedge-taken check block. 3150 PHINode *BCResumeVal = 3151 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3152 LoopScalarPreHeader->getTerminator()); 3153 // Copy original phi DL over to the new one. 3154 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3155 3156 // The new PHI merges the original incoming value, in case of a bypass, 3157 // or the value at the end of the vectorized loop. 3158 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3159 3160 // Fix the scalar body counter (PHI node). 3161 // The old induction's phi node in the scalar body needs the truncated 3162 // value. 3163 for (BasicBlock *BB : LoopBypassBlocks) 3164 BCResumeVal->addIncoming(II.getStartValue(), BB); 3165 3166 if (AdditionalBypass.first) 3167 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3168 EndValueFromAdditionalBypass); 3169 3170 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3171 } 3172 } 3173 3174 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) { 3175 // The trip counts should be cached by now. 
3176 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 3177 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3178 3179 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3180 3181 // Add a check in the middle block to see if we have completed 3182 // all of the iterations in the first vector loop. Three cases: 3183 // 1) If we require a scalar epilogue, there is no conditional branch as 3184 // we unconditionally branch to the scalar preheader. Do nothing. 3185 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3186 // Thus if tail is to be folded, we know we don't need to run the 3187 // remainder and we can use the previous value for the condition (true). 3188 // 3) Otherwise, construct a runtime check. 3189 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3190 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3191 Count, VectorTripCount, "cmp.n", 3192 LoopMiddleBlock->getTerminator()); 3193 3194 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3195 // of the corresponding compare because they may have ended up with 3196 // different line numbers and we want to avoid awkward line stepping while 3197 // debugging. Eg. if the compare has got a line number inside the loop. 3198 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3199 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3200 } 3201 3202 #ifdef EXPENSIVE_CHECKS 3203 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3204 #endif 3205 3206 return LoopVectorPreHeader; 3207 } 3208 3209 std::pair<BasicBlock *, Value *> 3210 InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3211 /* 3212 In this function we generate a new loop. The new loop will contain 3213 the vectorized instructions while the old loop will continue to run the 3214 scalar remainder. 3215 3216 [ ] <-- loop iteration number check. 3217 / | 3218 / v 3219 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3220 | / | 3221 | / v 3222 || [ ] <-- vector pre header. 3223 |/ | 3224 | v 3225 | [ ] \ 3226 | [ ]_| <-- vector loop (created during VPlan execution). 3227 | | 3228 | v 3229 \ -[ ] <--- middle-block. 3230 \/ | 3231 /\ v 3232 | ->[ ] <--- new preheader. 3233 | | 3234 (opt) v <-- edge from middle to exit iff epilogue is not required. 3235 | [ ] \ 3236 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3237 \ | 3238 \ v 3239 >[ ] <-- exit block(s). 3240 ... 3241 */ 3242 3243 // Get the metadata of the original loop before it gets modified. 3244 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3245 3246 // Workaround! Compute the trip count of the original loop and cache it 3247 // before we start modifying the CFG. This code has a systemic problem 3248 // wherein it tries to run analysis over partially constructed IR; this is 3249 // wrong, and not simply for SCEV. The trip count of the original loop 3250 // simply happens to be prone to hitting this in practice. In theory, we 3251 // can hit the same issue for any SCEV, or ValueTracking query done during 3252 // mutation. See PR49900. 3253 getOrCreateTripCount(OrigLoop->getLoopPreheader()); 3254 3255 // Create an empty vector loop, and prepare basic blocks for the runtime 3256 // checks. 3257 createVectorLoopSkeleton(""); 3258 3259 // Now, compare the new count to zero. If it is zero skip the vector loop and 3260 // jump to the scalar loop. 
  // This check also covers the case where the backedge-taken count is
  // uint##_max: adding one to it will overflow leading to an incorrect trip
  // count of zero. In this (rare) case we will also jump to the scalar loop.
  emitIterationCountCheck(LoopScalarPreHeader);

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(LoopScalarPreHeader);

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  emitMemRuntimeChecks(LoopScalarPreHeader);

  // Emit phis for the new starting index of the scalar loop.
  createInductionResumeValues();

  return {completeLoopSkeleton(OrigLoopID), nullptr};
}

// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *VectorTripCount, Value *EndValue,
                                       BasicBlock *MiddleBlock,
                                       BasicBlock *VectorHeader, VPlan &Plan) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the
  // penultimate value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent
  // SCEVs, that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      Value *CountMinusOne = B.CreateSub(
          VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ?
B.CreateCast(Instruction::SIToFP, CountMinusOne, 3329 II.getStep()->getType()) 3330 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3331 CMO->setName("cast.cmo"); 3332 3333 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 3334 VectorHeader->getTerminator()); 3335 Value *Escape = 3336 emitTransformedIndex(B, CMO, II.getStartValue(), Step, II); 3337 Escape->setName("ind.escape"); 3338 MissingVals[UI] = Escape; 3339 } 3340 } 3341 3342 for (auto &I : MissingVals) { 3343 PHINode *PHI = cast<PHINode>(I.first); 3344 // One corner case we have to handle is two IVs "chasing" each-other, 3345 // that is %IV2 = phi [...], [ %IV1, %latch ] 3346 // In this case, if IV1 has an external use, we need to avoid adding both 3347 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3348 // don't already have an incoming value for the middle block. 3349 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) { 3350 PHI->addIncoming(I.second, MiddleBlock); 3351 Plan.removeLiveOut(PHI); 3352 } 3353 } 3354 } 3355 3356 namespace { 3357 3358 struct CSEDenseMapInfo { 3359 static bool canHandle(const Instruction *I) { 3360 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3361 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3362 } 3363 3364 static inline Instruction *getEmptyKey() { 3365 return DenseMapInfo<Instruction *>::getEmptyKey(); 3366 } 3367 3368 static inline Instruction *getTombstoneKey() { 3369 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3370 } 3371 3372 static unsigned getHashValue(const Instruction *I) { 3373 assert(canHandle(I) && "Unknown instruction!"); 3374 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3375 I->value_op_end())); 3376 } 3377 3378 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3379 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3380 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3381 return LHS == RHS; 3382 return LHS->isIdenticalTo(RHS); 3383 } 3384 }; 3385 3386 } // end anonymous namespace 3387 3388 ///Perform cse of induction variable instructions. 3389 static void cse(BasicBlock *BB) { 3390 // Perform simple cse. 3391 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3392 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3393 if (!CSEDenseMapInfo::canHandle(&In)) 3394 continue; 3395 3396 // Check if we can replace this instruction with any of the 3397 // visited instructions. 3398 if (Instruction *V = CSEMap.lookup(&In)) { 3399 In.replaceAllUsesWith(V); 3400 In.eraseFromParent(); 3401 continue; 3402 } 3403 3404 CSEMap[&In] = &In; 3405 } 3406 } 3407 3408 InstructionCost 3409 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3410 bool &NeedToScalarize) const { 3411 Function *F = CI->getCalledFunction(); 3412 Type *ScalarRetTy = CI->getType(); 3413 SmallVector<Type *, 4> Tys, ScalarTys; 3414 for (auto &ArgOp : CI->args()) 3415 ScalarTys.push_back(ArgOp->getType()); 3416 3417 // Estimate cost of scalarized vector call. The source operands are assumed 3418 // to be vectors, so we need to extract individual elements from there, 3419 // execute VF scalar calls, and then gather the result into the vector return 3420 // value. 3421 InstructionCost ScalarCallCost = 3422 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3423 if (VF.isScalar()) 3424 return ScalarCallCost; 3425 3426 // Compute corresponding vector type for return value and arguments. 
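  // E.g. (illustrative) for a call to 'float powf(float, float)' at VF = 4,
  // the return type becomes <4 x float> and each argument type becomes
  // <4 x float>; these widened types feed the vector-call cost query below.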
3427 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3428 for (Type *ScalarTy : ScalarTys) 3429 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3430 3431 // Compute costs of unpacking argument values for the scalar calls and 3432 // packing the return values to a vector. 3433 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3434 3435 InstructionCost Cost = 3436 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3437 3438 // If we can't emit a vector call for this function, then the currently found 3439 // cost is the cost we need to return. 3440 NeedToScalarize = true; 3441 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3442 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3443 3444 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3445 return Cost; 3446 3447 // If the corresponding vector cost is cheaper, return its cost. 3448 InstructionCost VectorCallCost = 3449 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3450 if (VectorCallCost < Cost) { 3451 NeedToScalarize = false; 3452 Cost = VectorCallCost; 3453 } 3454 return Cost; 3455 } 3456 3457 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3458 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3459 return Elt; 3460 return VectorType::get(Elt, VF); 3461 } 3462 3463 InstructionCost 3464 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3465 ElementCount VF) const { 3466 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3467 assert(ID && "Expected intrinsic call!"); 3468 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3469 FastMathFlags FMF; 3470 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3471 FMF = FPMO->getFastMathFlags(); 3472 3473 SmallVector<const Value *> Arguments(CI->args()); 3474 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3475 SmallVector<Type *> ParamTys; 3476 std::transform(FTy->param_begin(), FTy->param_end(), 3477 std::back_inserter(ParamTys), 3478 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3479 3480 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3481 dyn_cast<IntrinsicInst>(CI)); 3482 return TTI.getIntrinsicInstrCost(CostAttrs, 3483 TargetTransformInfo::TCK_RecipThroughput); 3484 } 3485 3486 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3487 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3488 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3489 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3490 } 3491 3492 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3493 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3494 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3495 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3496 } 3497 3498 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3499 // For every instruction `I` in MinBWs, truncate the operands, create a 3500 // truncated version of `I` and reextend its result. InstCombine runs 3501 // later and will remove any ext/trunc pairs. 3502 SmallPtrSet<Value *, 4> Erased; 3503 for (const auto &KV : Cost->getMinimalBitwidths()) { 3504 // If the value wasn't vectorized, we must maintain the original scalar 3505 // type. The absence of the value from State indicates that it 3506 // wasn't vectorized. 3507 // FIXME: Should not rely on getVPValue at this point. 
3508 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3509 if (!State.hasAnyVectorValue(Def)) 3510 continue; 3511 for (unsigned Part = 0; Part < UF; ++Part) { 3512 Value *I = State.get(Def, Part); 3513 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3514 continue; 3515 Type *OriginalTy = I->getType(); 3516 Type *ScalarTruncatedTy = 3517 IntegerType::get(OriginalTy->getContext(), KV.second); 3518 auto *TruncatedTy = VectorType::get( 3519 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3520 if (TruncatedTy == OriginalTy) 3521 continue; 3522 3523 IRBuilder<> B(cast<Instruction>(I)); 3524 auto ShrinkOperand = [&](Value *V) -> Value * { 3525 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3526 if (ZI->getSrcTy() == TruncatedTy) 3527 return ZI->getOperand(0); 3528 return B.CreateZExtOrTrunc(V, TruncatedTy); 3529 }; 3530 3531 // The actual instruction modification depends on the instruction type, 3532 // unfortunately. 3533 Value *NewI = nullptr; 3534 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3535 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3536 ShrinkOperand(BO->getOperand(1))); 3537 3538 // Any wrapping introduced by shrinking this operation shouldn't be 3539 // considered undefined behavior. So, we can't unconditionally copy 3540 // arithmetic wrapping flags to NewI. 3541 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3542 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3543 NewI = 3544 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3545 ShrinkOperand(CI->getOperand(1))); 3546 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3547 NewI = B.CreateSelect(SI->getCondition(), 3548 ShrinkOperand(SI->getTrueValue()), 3549 ShrinkOperand(SI->getFalseValue())); 3550 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3551 switch (CI->getOpcode()) { 3552 default: 3553 llvm_unreachable("Unhandled cast!"); 3554 case Instruction::Trunc: 3555 NewI = ShrinkOperand(CI->getOperand(0)); 3556 break; 3557 case Instruction::SExt: 3558 NewI = B.CreateSExtOrTrunc( 3559 CI->getOperand(0), 3560 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3561 break; 3562 case Instruction::ZExt: 3563 NewI = B.CreateZExtOrTrunc( 3564 CI->getOperand(0), 3565 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3566 break; 3567 } 3568 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3569 auto Elements0 = 3570 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3571 auto *O0 = B.CreateZExtOrTrunc( 3572 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3573 auto Elements1 = 3574 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3575 auto *O1 = B.CreateZExtOrTrunc( 3576 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3577 3578 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3579 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3580 // Don't do anything with the operands, just extend the result. 
3581 continue; 3582 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3583 auto Elements = 3584 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3585 auto *O0 = B.CreateZExtOrTrunc( 3586 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3587 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3588 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3589 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3590 auto Elements = 3591 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3592 auto *O0 = B.CreateZExtOrTrunc( 3593 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3594 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3595 } else { 3596 // If we don't know what to do, be conservative and don't do anything. 3597 continue; 3598 } 3599 3600 // Lastly, extend the result. 3601 NewI->takeName(cast<Instruction>(I)); 3602 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3603 I->replaceAllUsesWith(Res); 3604 cast<Instruction>(I)->eraseFromParent(); 3605 Erased.insert(I); 3606 State.reset(Def, Res, Part); 3607 } 3608 } 3609 3610 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3611 for (const auto &KV : Cost->getMinimalBitwidths()) { 3612 // If the value wasn't vectorized, we must maintain the original scalar 3613 // type. The absence of the value from State indicates that it 3614 // wasn't vectorized. 3615 // FIXME: Should not rely on getVPValue at this point. 3616 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3617 if (!State.hasAnyVectorValue(Def)) 3618 continue; 3619 for (unsigned Part = 0; Part < UF; ++Part) { 3620 Value *I = State.get(Def, Part); 3621 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3622 if (Inst && Inst->use_empty()) { 3623 Value *NewI = Inst->getOperand(0); 3624 Inst->eraseFromParent(); 3625 State.reset(Def, NewI, Part); 3626 } 3627 } 3628 } 3629 } 3630 3631 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, 3632 VPlan &Plan) { 3633 // Insert truncates and extends for any truncated instructions as hints to 3634 // InstCombine. 3635 if (VF.isVector()) 3636 truncateToMinimalBitwidths(State); 3637 3638 // Fix widened non-induction PHIs by setting up the PHI operands. 3639 if (EnableVPlanNativePath) 3640 fixNonInductionPHIs(Plan, State); 3641 3642 // At this point every instruction in the original loop is widened to a 3643 // vector form. Now we need to fix the recurrences in the loop. These PHI 3644 // nodes are currently empty because we did not want to introduce cycles. 3645 // This is the second stage of vectorizing recurrences. 3646 fixCrossIterationPHIs(State); 3647 3648 // Forget the original basic block. 3649 PSE.getSE()->forgetLoop(OrigLoop); 3650 3651 VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); 3652 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); 3653 if (Cost->requiresScalarEpilogue(VF)) { 3654 // No edge from the middle block to the unique exit block has been inserted 3655 // and there is nothing to fix from vector loop; phis should have incoming 3656 // from scalar loop only. 3657 Plan.clearLiveOuts(); 3658 } else { 3659 // If we inserted an edge from the middle block to the unique exit block, 3660 // update uses outside the loop (phis) to account for the newly inserted 3661 // edge. 3662 3663 // Fix-up external users of the induction variables. 
    for (auto &Entry : Legal->getInductionVars())
      fixupIVUsers(Entry.first, Entry.second,
                   getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
                   IVEndValues[Entry.first], LoopMiddleBlock,
                   VectorLoop->getHeader(), Plan);
  }

  // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
  // in the exit block, so update the builder.
  State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
  for (auto &KV : Plan.getLiveOuts())
    KV.second->fixPhi(Plan, State);

  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(VectorLoop->getHeader());

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that original loop
  // represented by LoopScalarBody becomes remainder loop after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less precise result, but that should be OK since
  // profile is not inherently precise anyway. Note also possible bypass of
  // vector code caused by legality checks is ignored, assigning all the weight
  // to the vector loop, optimistically.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
  setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
                               LI->getLoopFor(LoopScalarBody),
                               VF.getKnownMinValue() * UF);
}

void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #2: We now need to fix the recurrences by adding incoming edges to
  // the currently empty PHI nodes. At this point every instruction in the
  // original loop is widened to a vector form so we can use them to construct
  // the incoming edges.
  VPBasicBlock *Header =
      State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &R : Header->phis()) {
    if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
      fixReduction(ReductionPhi, State);
    else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
      fixFirstOrderRecurrence(FOR, State);
  }
}

void InnerLoopVectorizer::fixFirstOrderRecurrence(
    VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration.
In the first phase of vectorization, we created a 3743 // vector phi v1 for s1. We now complete the vectorization and produce the 3744 // shorthand vector IR shown below (for VF = 4, UF = 1). 3745 // 3746 // vector.ph: 3747 // v_init = vector(..., ..., ..., a[-1]) 3748 // br vector.body 3749 // 3750 // vector.body 3751 // i = phi [0, vector.ph], [i+4, vector.body] 3752 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3753 // v2 = a[i, i+1, i+2, i+3]; 3754 // v3 = vector(v1(3), v2(0, 1, 2)) 3755 // b[i, i+1, i+2, i+3] = v2 - v3 3756 // br cond, vector.body, middle.block 3757 // 3758 // middle.block: 3759 // x = v2(3) 3760 // br scalar.ph 3761 // 3762 // scalar.ph: 3763 // s_init = phi [x, middle.block], [a[-1], otherwise] 3764 // br scalar.body 3765 // 3766 // After execution completes the vector loop, we extract the next value of 3767 // the recurrence (x) to use as the initial value in the scalar loop. 3768 3769 // Extract the last vector element in the middle block. This will be the 3770 // initial value for the recurrence when jumping to the scalar loop. 3771 VPValue *PreviousDef = PhiR->getBackedgeValue(); 3772 Value *Incoming = State.get(PreviousDef, UF - 1); 3773 auto *ExtractForScalar = Incoming; 3774 auto *IdxTy = Builder.getInt32Ty(); 3775 if (VF.isVector()) { 3776 auto *One = ConstantInt::get(IdxTy, 1); 3777 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3778 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 3779 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 3780 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 3781 "vector.recur.extract"); 3782 } 3783 // Extract the second last element in the middle block if the 3784 // Phi is used outside the loop. We need to extract the phi itself 3785 // and not the last element (the phi update in the current iteration). This 3786 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3787 // when the scalar loop is not run at all. 3788 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3789 if (VF.isVector()) { 3790 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 3791 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 3792 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3793 Incoming, Idx, "vector.recur.extract.for.phi"); 3794 } else if (UF > 1) 3795 // When loop is unrolled without vectorizing, initialize 3796 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 3797 // of `Incoming`. This is analogous to the vectorized case above: extracting 3798 // the second last element when VF > 1. 3799 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 3800 3801 // Fix the initial value of the original recurrence in the scalar loop. 3802 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3803 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 3804 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3805 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 3806 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3807 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3808 Start->addIncoming(Incoming, BB); 3809 } 3810 3811 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3812 Phi->setName("scalar.recur"); 3813 3814 // Finally, fix users of the recurrence outside the loop. The users will need 3815 // either the last value of the scalar recurrence or the last value of the 3816 // vector recurrence we extracted in the middle block. 
  // Since the loop is in LCSSA form, we just need to find all the phi nodes
  // for the original scalar recurrence in the exit block, and then add an
  // edge for the middle block. Note that LCSSA does not imply single entry
  // when the original scalar loop had multiple exiting edges (as we always
  // run the last iteration in the scalar epilogue); in that case, there is no
  // edge from middle to exit and thus no phis which need to be updated.
  if (!Cost->requiresScalarEpilogue(VF))
    for (PHINode &LCSSAPhi : LoopExitBlock->phis())
      if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
        LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
        State.Plan->removeLiveOut(&LCSSAPhi);
      }
}

void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
                                       VPTransformState &State) {
  PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(OrigPhi) &&
         "Unable to find the reduction variable");
  const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

  RecurKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  State.setDebugLocFromInst(ReductionStartValue);

  VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = State.get(LoopExitInstDef, 0)->getType();

  // Wrap flags are in general invalid after vectorization; clear them.
  clearReductionWrapFlags(PhiR, State);

  // Before each round, move the insertion point right between
  // the PHIs and the values we are going to write.
  // This allows us to write both PHINodes and the extractelement
  // instructions.
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

  State.setDebugLocFromInst(LoopExitInst);

  Type *PhiTy = OrigPhi->getType();

  VPBasicBlock *LatchVPBB =
      PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
  BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
  // If tail is folded by masking, the vector value to leave the loop should be
  // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
  // instead of the former. For an inloop reduction the reduction will already
  // be predicated, and does not need to be handled here.
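  // In shorthand (illustrative names), a tail-folded sum computes
  //   %sel = select <4 x i1> %mask, <4 x i32> %vec.add, <4 x i32> %vec.phi
  // in the vector body, and it is %sel, rather than %vec.add, that must feed
  // the final reduction in the middle block.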
3868 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 3869 for (unsigned Part = 0; Part < UF; ++Part) { 3870 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 3871 SelectInst *Sel = nullptr; 3872 for (User *U : VecLoopExitInst->users()) { 3873 if (isa<SelectInst>(U)) { 3874 assert(!Sel && "Reduction exit feeding two selects"); 3875 Sel = cast<SelectInst>(U); 3876 } else 3877 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3878 } 3879 assert(Sel && "Reduction exit feeds no select"); 3880 State.reset(LoopExitInstDef, Sel, Part); 3881 3882 if (isa<FPMathOperator>(Sel)) 3883 Sel->setFastMathFlags(RdxDesc.getFastMathFlags()); 3884 3885 // If the target can create a predicated operator for the reduction at no 3886 // extra cost in the loop (for example a predicated vadd), it can be 3887 // cheaper for the select to remain in the loop than be sunk out of it, 3888 // and so use the select value for the phi instead of the old 3889 // LoopExitValue. 3890 if (PreferPredicatedReductionSelect || 3891 TTI->preferPredicatedReductionSelect( 3892 RdxDesc.getOpcode(), PhiTy, 3893 TargetTransformInfo::ReductionFlags())) { 3894 auto *VecRdxPhi = 3895 cast<PHINode>(State.get(PhiR, Part)); 3896 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel); 3897 } 3898 } 3899 } 3900 3901 // If the vector reduction can be performed in a smaller type, we truncate 3902 // then extend the loop exit value to enable InstCombine to evaluate the 3903 // entire expression in the smaller type. 3904 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 3905 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 3906 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3907 Builder.SetInsertPoint(VectorLoopLatch->getTerminator()); 3908 VectorParts RdxParts(UF); 3909 for (unsigned Part = 0; Part < UF; ++Part) { 3910 RdxParts[Part] = State.get(LoopExitInstDef, Part); 3911 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3912 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3913 : Builder.CreateZExt(Trunc, VecTy); 3914 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 3915 if (U != Trunc) { 3916 U->replaceUsesOfWith(RdxParts[Part], Extnd); 3917 RdxParts[Part] = Extnd; 3918 } 3919 } 3920 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3921 for (unsigned Part = 0; Part < UF; ++Part) { 3922 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3923 State.reset(LoopExitInstDef, RdxParts[Part], Part); 3924 } 3925 } 3926 3927 // Reduce all of the unrolled parts into a single vector. 3928 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 3929 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 3930 3931 // The middle block terminator has already been assigned a DebugLoc here (the 3932 // OrigLoop's single latch terminator). We want the whole middle block to 3933 // appear to execute on this line because: (a) it is all compiler generated, 3934 // (b) these instructions are always executed after evaluating the latch 3935 // conditional branch, and (c) other passes may add new predecessors which 3936 // terminate on this line. This is the easiest way to ensure we don't 3937 // accidentally cause an extra step back into the loop while debugging. 3938 State.setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 3939 if (PhiR->isOrdered()) 3940 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 3941 else { 3942 // Floating-point operations should have some FMF to enable the reduction. 
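    // E.g. (illustrative) with UF = 2, the two partial results are combined
    // below with a single binary op carrying these fast-math flags for an FP
    // sum ('fadd fast %rdx.part1, %rdx.part0'), before the final horizontal
    // reduction is created after the loop.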
    IRBuilderBase::FastMathFlagGuard FMFG(Builder);
    Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
    for (unsigned Part = 1; Part < UF; ++Part) {
      Value *RdxPart = State.get(LoopExitInstDef, Part);
      if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
        ReducedPartRdx = Builder.CreateBinOp(
            (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
      } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
        ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
                                           ReducedPartRdx, RdxPart);
      else
        ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
    }
  }

  // Create the reduction after the loop. Note that inloop reductions create
  // the target reduction in the loop using a Reduction recipe.
  if (VF.isVector() && !PhiR->isInLoop()) {
    ReducedPartRdx =
        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
    // If the reduction can be performed in a smaller type, we need to extend
    // the reduction to the wider type before we branch to the original loop.
    if (PhiTy != RdxDesc.getRecurrenceType())
      ReducedPartRdx = RdxDesc.isSigned()
                           ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                           : Builder.CreateZExt(ReducedPartRdx, PhiTy);
  }

  PHINode *ResumePhi =
      dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());

  // Create a phi node that merges control-flow from the backedge-taken check
  // block and the middle block.
  PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
                                        LoopScalarPreHeader->getTerminator());

  // If we are fixing reductions in the epilogue loop then we should already
  // have created a bc.merge.rdx Phi after the main vector body. Ensure that
  // we carry over the incoming values correctly.
  for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
    if (Incoming == LoopMiddleBlock)
      BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
    else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
      BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
                              Incoming);
    else
      BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
  }

  // Set the resume value for this reduction.
  ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});

  // If there were stores of the reduction value to a uniform memory address
  // inside the loop, create the final store here.
  if (StoreInst *SI = RdxDesc.IntermediateStore) {
    StoreInst *NewSI =
        Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
    propagateMetadata(NewSI, SI);

    // If the reduction value is used in other places,
    // then let the code below create PHIs for that.
  }

  // Now, we need to fix the users of the reduction variable
  // inside and outside of the scalar remainder loop.

  // We know that the loop is in LCSSA form. We need to update the PHI nodes
  // in the exit blocks. See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
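  // E.g. (illustrative) an LCSSA phi in the exit block such as
  //   %res = phi i32 [ %sum.next, %for.latch ]
  // gains an extra incoming value [ %rdx, %middle.block ], so the reduced
  // result is used whenever the scalar remainder is skipped.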
4012 if (!Cost->requiresScalarEpilogue(VF)) 4013 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4014 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) { 4015 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4016 State.Plan->removeLiveOut(&LCSSAPhi); 4017 } 4018 4019 // Fix the scalar loop reduction variable with the incoming reduction sum 4020 // from the vector body and from the backedge value. 4021 int IncomingEdgeBlockIdx = 4022 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4023 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4024 // Pick the other block. 4025 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4026 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4027 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4028 } 4029 4030 void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, 4031 VPTransformState &State) { 4032 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 4033 RecurKind RK = RdxDesc.getRecurrenceKind(); 4034 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4035 return; 4036 4037 SmallVector<VPValue *, 8> Worklist; 4038 SmallPtrSet<VPValue *, 8> Visited; 4039 Worklist.push_back(PhiR); 4040 Visited.insert(PhiR); 4041 4042 while (!Worklist.empty()) { 4043 VPValue *Cur = Worklist.pop_back_val(); 4044 for (unsigned Part = 0; Part < UF; ++Part) { 4045 Value *V = State.get(Cur, Part); 4046 if (!isa<OverflowingBinaryOperator>(V)) 4047 break; 4048 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4049 } 4050 4051 for (VPUser *U : Cur->users()) { 4052 auto *UserRecipe = dyn_cast<VPRecipeBase>(U); 4053 if (!UserRecipe) 4054 continue; 4055 for (VPValue *V : UserRecipe->definedValues()) 4056 if (Visited.insert(V).second) 4057 Worklist.push_back(V); 4058 } 4059 } 4060 } 4061 4062 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4063 // The basic block and loop containing the predicated instruction. 4064 auto *PredBB = PredInst->getParent(); 4065 auto *VectorLoop = LI->getLoopFor(PredBB); 4066 4067 // Initialize a worklist with the operands of the predicated instruction. 4068 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4069 4070 // Holds instructions that we need to analyze again. An instruction may be 4071 // reanalyzed if we don't yet know if we can sink it or not. 4072 SmallVector<Instruction *, 8> InstsToReanalyze; 4073 4074 // Returns true if a given use occurs in the predicated block. Phi nodes use 4075 // their operands in their corresponding predecessor blocks. 4076 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4077 auto *I = cast<Instruction>(U.getUser()); 4078 BasicBlock *BB = I->getParent(); 4079 if (auto *Phi = dyn_cast<PHINode>(I)) 4080 BB = Phi->getIncomingBlock( 4081 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4082 return BB == PredBB; 4083 }; 4084 4085 // Iteratively sink the scalarized operands of the predicated instruction 4086 // into the block we created for it. When an instruction is sunk, it's 4087 // operands are then added to the worklist. The algorithm ends after one pass 4088 // through the worklist doesn't sink a single instruction. 4089 bool Changed; 4090 do { 4091 // Add the instructions that need to be reanalyzed to the worklist, and 4092 // reset the changed indicator. 
4093 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4094 InstsToReanalyze.clear(); 4095 Changed = false; 4096 4097 while (!Worklist.empty()) { 4098 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4099 4100 // We can't sink an instruction if it is a phi node, is not in the loop, 4101 // or may have side effects. 4102 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4103 I->mayHaveSideEffects()) 4104 continue; 4105 4106 // If the instruction is already in PredBB, check if we can sink its 4107 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4108 // sinking the scalar instruction I, hence it appears in PredBB; but it 4109 // may have failed to sink I's operands (recursively), which we try 4110 // (again) here. 4111 if (I->getParent() == PredBB) { 4112 Worklist.insert(I->op_begin(), I->op_end()); 4113 continue; 4114 } 4115 4116 // It's legal to sink the instruction if all its uses occur in the 4117 // predicated block. Otherwise, there's nothing to do yet, and we may 4118 // need to reanalyze the instruction. 4119 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4120 InstsToReanalyze.push_back(I); 4121 continue; 4122 } 4123 4124 // Move the instruction to the beginning of the predicated block, and add 4125 // it's operands to the worklist. 4126 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4127 Worklist.insert(I->op_begin(), I->op_end()); 4128 4129 // The sinking may have enabled other instructions to be sunk, so we will 4130 // need to iterate. 4131 Changed = true; 4132 } 4133 } while (Changed); 4134 } 4135 4136 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, 4137 VPTransformState &State) { 4138 auto Iter = depth_first( 4139 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry())); 4140 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 4141 for (VPRecipeBase &P : VPBB->phis()) { 4142 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P); 4143 if (!VPPhi) 4144 continue; 4145 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4146 // Make sure the builder has a valid insert point. 4147 Builder.SetInsertPoint(NewPhi); 4148 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4149 VPValue *Inc = VPPhi->getIncomingValue(i); 4150 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4151 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4152 } 4153 } 4154 } 4155 } 4156 4157 bool InnerLoopVectorizer::useOrderedReductions( 4158 const RecurrenceDescriptor &RdxDesc) { 4159 return Cost->useOrderedReductions(RdxDesc); 4160 } 4161 4162 /// A helper function for checking whether an integer division-related 4163 /// instruction may divide by zero (in which case it must be predicated if 4164 /// executed conditionally in the scalar code). 4165 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4166 /// Non-zero divisors that are non compile-time constants will not be 4167 /// converted into multiplication, so we will still end up scalarizing 4168 /// the division, but can do so w/o predication. 
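/// For example, in
///   for (i = 0; i < n; ++i)
///     if (b[i] != 0)
///       a[i] = a[i] / b[i];
/// the division must remain predicated when vectorized, since executing it
/// speculatively for lanes where b[i] == 0 would trap.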
4169 static bool mayDivideByZero(Instruction &I) { 4170 assert((I.getOpcode() == Instruction::UDiv || 4171 I.getOpcode() == Instruction::SDiv || 4172 I.getOpcode() == Instruction::URem || 4173 I.getOpcode() == Instruction::SRem) && 4174 "Unexpected instruction"); 4175 Value *Divisor = I.getOperand(1); 4176 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4177 return !CInt || CInt->isZero(); 4178 } 4179 4180 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4181 VPUser &ArgOperands, 4182 VPTransformState &State) { 4183 assert(!isa<DbgInfoIntrinsic>(I) && 4184 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4185 State.setDebugLocFromInst(&I); 4186 4187 Module *M = I.getParent()->getParent()->getParent(); 4188 auto *CI = cast<CallInst>(&I); 4189 4190 SmallVector<Type *, 4> Tys; 4191 for (Value *ArgOperand : CI->args()) 4192 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4193 4194 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4195 4196 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4197 // version of the instruction. 4198 // Is it beneficial to perform intrinsic call compared to lib call? 4199 bool NeedToScalarize = false; 4200 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4201 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4202 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4203 assert((UseVectorIntrinsic || !NeedToScalarize) && 4204 "Instruction should be scalarized elsewhere."); 4205 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4206 "Either the intrinsic cost or vector call cost must be valid"); 4207 4208 for (unsigned Part = 0; Part < UF; ++Part) { 4209 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4210 SmallVector<Value *, 4> Args; 4211 for (auto &I : enumerate(ArgOperands.operands())) { 4212 // Some intrinsics have a scalar argument - don't replace it with a 4213 // vector. 4214 Value *Arg; 4215 if (!UseVectorIntrinsic || 4216 !isVectorIntrinsicWithScalarOpAtArg(ID, I.index())) 4217 Arg = State.get(I.value(), Part); 4218 else 4219 Arg = State.get(I.value(), VPIteration(0, 0)); 4220 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index())) 4221 TysForDecl.push_back(Arg->getType()); 4222 Args.push_back(Arg); 4223 } 4224 4225 Function *VectorF; 4226 if (UseVectorIntrinsic) { 4227 // Use vector version of the intrinsic. 4228 if (VF.isVector()) 4229 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4230 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4231 assert(VectorF && "Can't retrieve vector intrinsic."); 4232 } else { 4233 // Use vector version of the function call. 4234 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4235 #ifndef NDEBUG 4236 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4237 "Can't create vector function."); 4238 #endif 4239 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4240 } 4241 SmallVector<OperandBundleDef, 1> OpBundles; 4242 CI->getOperandBundlesAsDefs(OpBundles); 4243 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4244 4245 if (isa<FPMathOperator>(V)) 4246 V->copyFastMathFlags(CI); 4247 4248 State.set(Def, V, Part); 4249 State.addMetadata(V, &I); 4250 } 4251 } 4252 4253 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4254 // We should not collect Scalars more than once per VF. 
Right now, this 4255 // function is called from collectUniformsAndScalars(), which already does 4256 // this check. Collecting Scalars for VF=1 does not make any sense. 4257 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4258 "This function should not be visited twice for the same VF"); 4259 4260 // This avoids any chances of creating a REPLICATE recipe during planning 4261 // since that would result in generation of scalarized code during execution, 4262 // which is not supported for scalable vectors. 4263 if (VF.isScalable()) { 4264 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4265 return; 4266 } 4267 4268 SmallSetVector<Instruction *, 8> Worklist; 4269 4270 // These sets are used to seed the analysis with pointers used by memory 4271 // accesses that will remain scalar. 4272 SmallSetVector<Instruction *, 8> ScalarPtrs; 4273 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4274 auto *Latch = TheLoop->getLoopLatch(); 4275 4276 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4277 // The pointer operands of loads and stores will be scalar as long as the 4278 // memory access is not a gather or scatter operation. The value operand of a 4279 // store will remain scalar if the store is scalarized. 4280 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4281 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4282 assert(WideningDecision != CM_Unknown && 4283 "Widening decision should be ready at this moment"); 4284 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4285 if (Ptr == Store->getValueOperand()) 4286 return WideningDecision == CM_Scalarize; 4287 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4288 "Ptr is neither a value or pointer operand"); 4289 return WideningDecision != CM_GatherScatter; 4290 }; 4291 4292 // A helper that returns true if the given value is a bitcast or 4293 // getelementptr instruction contained in the loop. 4294 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4295 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4296 isa<GetElementPtrInst>(V)) && 4297 !TheLoop->isLoopInvariant(V); 4298 }; 4299 4300 // A helper that evaluates a memory access's use of a pointer. If the use will 4301 // be a scalar use and the pointer is only used by memory accesses, we place 4302 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4303 // PossibleNonScalarPtrs. 4304 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4305 // We only care about bitcast and getelementptr instructions contained in 4306 // the loop. 4307 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4308 return; 4309 4310 // If the pointer has already been identified as scalar (e.g., if it was 4311 // also identified as uniform), there's nothing to do. 4312 auto *I = cast<Instruction>(Ptr); 4313 if (Worklist.count(I)) 4314 return; 4315 4316 // If the use of the pointer will be a scalar use, and all users of the 4317 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4318 // place the pointer in PossibleNonScalarPtrs. 
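// For example, the address of a store that will be scalarized, and that is
// only used by memory accesses, lands in ScalarPtrs; an address that is also
// used by non-memory instructions (say, pointer arithmetic feeding a compare)
// lands in PossibleNonScalarPtrs.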
4319 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4320 return isa<LoadInst>(U) || isa<StoreInst>(U);
4321 }))
4322 ScalarPtrs.insert(I);
4323 else
4324 PossibleNonScalarPtrs.insert(I);
4325 };
4326
4327 // We seed the scalars analysis with two classes of instructions: (1)
4328 // instructions marked uniform-after-vectorization and (2) bitcast,
4329 // getelementptr and (pointer) phi instructions used by memory accesses
4330 // requiring a scalar use.
4331 //
4332 // (1) Add to the worklist all instructions that have been identified as
4333 // uniform-after-vectorization.
4334 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4335
4336 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4337 // memory accesses requiring a scalar use. The pointer operands of loads and
4338 // stores will be scalar as long as the memory access is not a gather or
4339 // scatter operation. The value operand of a store will remain scalar if the
4340 // store is scalarized.
4341 for (auto *BB : TheLoop->blocks())
4342 for (auto &I : *BB) {
4343 if (auto *Load = dyn_cast<LoadInst>(&I)) {
4344 evaluatePtrUse(Load, Load->getPointerOperand());
4345 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4346 evaluatePtrUse(Store, Store->getPointerOperand());
4347 evaluatePtrUse(Store, Store->getValueOperand());
4348 }
4349 }
4350 for (auto *I : ScalarPtrs)
4351 if (!PossibleNonScalarPtrs.count(I)) {
4352 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4353 Worklist.insert(I);
4354 }
4355
4356 // Insert the forced scalars.
4357 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
4358 // induction variable when the PHI user is scalarized.
4359 auto ForcedScalar = ForcedScalars.find(VF);
4360 if (ForcedScalar != ForcedScalars.end())
4361 for (auto *I : ForcedScalar->second)
4362 Worklist.insert(I);
4363
4364 // Expand the worklist by looking through any bitcasts and getelementptr
4365 // instructions we've already identified as scalar. This is similar to the
4366 // expansion step in collectLoopUniforms(); however, here we're only
4367 // expanding to include additional bitcasts and getelementptr instructions.
4368 unsigned Idx = 0;
4369 while (Idx != Worklist.size()) {
4370 Instruction *Dst = Worklist[Idx++];
4371 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4372 continue;
4373 auto *Src = cast<Instruction>(Dst->getOperand(0));
4374 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4375 auto *J = cast<Instruction>(U);
4376 return !TheLoop->contains(J) || Worklist.count(J) ||
4377 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4378 isScalarUse(J, Src));
4379 })) {
4380 Worklist.insert(Src);
4381 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4382 }
4383 }
4384
4385 // An induction variable will remain scalar if all users of the induction
4386 // variable and induction variable update remain scalar.
4387 for (auto &Induction : Legal->getInductionVars()) {
4388 auto *Ind = Induction.first;
4389 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4390
4391 // If tail-folding is applied, the primary induction variable will be used
4392 // to feed a vector compare.
4393 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4394 continue;
4395
4396 // Returns true if \p Indvar is a pointer induction that is used directly by
4397 // load/store instruction \p I.
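// For example, given a pointer induction
//   %p = phi i8* [ %start, %preheader ], [ %p.next, %latch ]
//   %v = load i8, i8* %p
// the load uses the pointer induction directly as its address, and that use is
// scalar as long as the access is not widened into a gather/scatter.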
4398 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4399 Instruction *I) { 4400 return Induction.second.getKind() == 4401 InductionDescriptor::IK_PtrInduction && 4402 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4403 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4404 }; 4405 4406 // Determine if all users of the induction variable are scalar after 4407 // vectorization. 4408 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4409 auto *I = cast<Instruction>(U); 4410 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4411 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4412 }); 4413 if (!ScalarInd) 4414 continue; 4415 4416 // Determine if all users of the induction variable update instruction are 4417 // scalar after vectorization. 4418 auto ScalarIndUpdate = 4419 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4420 auto *I = cast<Instruction>(U); 4421 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4422 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4423 }); 4424 if (!ScalarIndUpdate) 4425 continue; 4426 4427 // The induction variable and its update instruction will remain scalar. 4428 Worklist.insert(Ind); 4429 Worklist.insert(IndUpdate); 4430 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4431 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4432 << "\n"); 4433 } 4434 4435 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4436 } 4437 4438 bool LoopVectorizationCostModel::isScalarWithPredication( 4439 Instruction *I, ElementCount VF) const { 4440 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4441 return false; 4442 switch(I->getOpcode()) { 4443 default: 4444 break; 4445 case Instruction::Load: 4446 case Instruction::Store: { 4447 if (!Legal->isMaskRequired(I)) 4448 return false; 4449 auto *Ptr = getLoadStorePointerOperand(I); 4450 auto *Ty = getLoadStoreType(I); 4451 Type *VTy = Ty; 4452 if (VF.isVector()) 4453 VTy = VectorType::get(Ty, VF); 4454 const Align Alignment = getLoadStoreAlignment(I); 4455 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4456 TTI.isLegalMaskedGather(VTy, Alignment)) 4457 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4458 TTI.isLegalMaskedScatter(VTy, Alignment)); 4459 } 4460 case Instruction::UDiv: 4461 case Instruction::SDiv: 4462 case Instruction::SRem: 4463 case Instruction::URem: 4464 return mayDivideByZero(*I); 4465 } 4466 return false; 4467 } 4468 4469 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4470 Instruction *I, ElementCount VF) { 4471 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4472 assert(getWideningDecision(I, VF) == CM_Unknown && 4473 "Decision should not be set yet."); 4474 auto *Group = getInterleavedAccessGroup(I); 4475 assert(Group && "Must have a group."); 4476 4477 // If the instruction's allocated size doesn't equal it's type size, it 4478 // requires padding and will be scalarized. 4479 auto &DL = I->getModule()->getDataLayout(); 4480 auto *ScalarTy = getLoadStoreType(I); 4481 if (hasIrregularType(ScalarTy, DL)) 4482 return false; 4483 4484 // If the group involves a non-integral pointer, we may not be able to 4485 // losslessly cast all values to a common type. 
4486 unsigned InterleaveFactor = Group->getFactor(); 4487 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); 4488 for (unsigned i = 0; i < InterleaveFactor; i++) { 4489 Instruction *Member = Group->getMember(i); 4490 if (!Member) 4491 continue; 4492 auto *MemberTy = getLoadStoreType(Member); 4493 bool MemberNI = DL.isNonIntegralPointerType(MemberTy); 4494 // Don't coerce non-integral pointers to integers or vice versa. 4495 if (MemberNI != ScalarNI) { 4496 // TODO: Consider adding special nullptr value case here 4497 return false; 4498 } else if (MemberNI && ScalarNI && 4499 ScalarTy->getPointerAddressSpace() != 4500 MemberTy->getPointerAddressSpace()) { 4501 return false; 4502 } 4503 } 4504 4505 // Check if masking is required. 4506 // A Group may need masking for one of two reasons: it resides in a block that 4507 // needs predication, or it was decided to use masking to deal with gaps 4508 // (either a gap at the end of a load-access that may result in a speculative 4509 // load, or any gaps in a store-access). 4510 bool PredicatedAccessRequiresMasking = 4511 blockNeedsPredicationForAnyReason(I->getParent()) && 4512 Legal->isMaskRequired(I); 4513 bool LoadAccessWithGapsRequiresEpilogMasking = 4514 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4515 !isScalarEpilogueAllowed(); 4516 bool StoreAccessWithGapsRequiresMasking = 4517 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4518 if (!PredicatedAccessRequiresMasking && 4519 !LoadAccessWithGapsRequiresEpilogMasking && 4520 !StoreAccessWithGapsRequiresMasking) 4521 return true; 4522 4523 // If masked interleaving is required, we expect that the user/target had 4524 // enabled it, because otherwise it either wouldn't have been created or 4525 // it should have been invalidated by the CostModel. 4526 assert(useMaskedInterleavedAccesses(TTI) && 4527 "Masked interleave-groups for predicated accesses are not enabled."); 4528 4529 if (Group->isReverse()) 4530 return false; 4531 4532 auto *Ty = getLoadStoreType(I); 4533 const Align Alignment = getLoadStoreAlignment(I); 4534 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4535 : TTI.isLegalMaskedStore(Ty, Alignment); 4536 } 4537 4538 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4539 Instruction *I, ElementCount VF) { 4540 // Get and ensure we have a valid memory instruction. 4541 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4542 4543 auto *Ptr = getLoadStorePointerOperand(I); 4544 auto *ScalarTy = getLoadStoreType(I); 4545 4546 // In order to be widened, the pointer should be consecutive, first of all. 4547 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4548 return false; 4549 4550 // If the instruction is a store located in a predicated block, it will be 4551 // scalarized. 4552 if (isScalarWithPredication(I, VF)) 4553 return false; 4554 4555 // If the instruction's allocated size doesn't equal it's type size, it 4556 // requires padding and will be scalarized. 4557 auto &DL = I->getModule()->getDataLayout(); 4558 if (hasIrregularType(ScalarTy, DL)) 4559 return false; 4560 4561 return true; 4562 } 4563 4564 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4565 // We should not collect Uniforms more than once per VF. Right now, 4566 // this function is called from collectUniformsAndScalars(), which 4567 // already does this check. Collecting Uniforms for VF=1 does not make any 4568 // sense. 
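// A typical uniform-after-vectorization value is the address computation of a
// consecutive access: in
//   for (i = 0; i < n; ++i) a[i] = b[i];
// the GEPs feeding the widened load and store only need their first lane,
// because a single wide memory operation is built from the lane-0 address.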
4569 4570 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4571 "This function should not be visited twice for the same VF"); 4572 4573 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4574 // not analyze again. Uniforms.count(VF) will return 1. 4575 Uniforms[VF].clear(); 4576 4577 // We now know that the loop is vectorizable! 4578 // Collect instructions inside the loop that will remain uniform after 4579 // vectorization. 4580 4581 // Global values, params and instructions outside of current loop are out of 4582 // scope. 4583 auto isOutOfScope = [&](Value *V) -> bool { 4584 Instruction *I = dyn_cast<Instruction>(V); 4585 return (!I || !TheLoop->contains(I)); 4586 }; 4587 4588 // Worklist containing uniform instructions demanding lane 0. 4589 SetVector<Instruction *> Worklist; 4590 BasicBlock *Latch = TheLoop->getLoopLatch(); 4591 4592 // Add uniform instructions demanding lane 0 to the worklist. Instructions 4593 // that are scalar with predication must not be considered uniform after 4594 // vectorization, because that would create an erroneous replicating region 4595 // where only a single instance out of VF should be formed. 4596 // TODO: optimize such seldom cases if found important, see PR40816. 4597 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4598 if (isOutOfScope(I)) { 4599 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 4600 << *I << "\n"); 4601 return; 4602 } 4603 if (isScalarWithPredication(I, VF)) { 4604 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4605 << *I << "\n"); 4606 return; 4607 } 4608 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4609 Worklist.insert(I); 4610 }; 4611 4612 // Start with the conditional branch. If the branch condition is an 4613 // instruction contained in the loop that is only used by the branch, it is 4614 // uniform. 4615 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4616 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4617 addToWorklistIfAllowed(Cmp); 4618 4619 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 4620 InstWidening WideningDecision = getWideningDecision(I, VF); 4621 assert(WideningDecision != CM_Unknown && 4622 "Widening decision should be ready at this moment"); 4623 4624 // A uniform memory op is itself uniform. We exclude uniform stores 4625 // here as they demand the last lane, not the first one. 4626 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 4627 assert(WideningDecision == CM_Scalarize); 4628 return true; 4629 } 4630 4631 return (WideningDecision == CM_Widen || 4632 WideningDecision == CM_Widen_Reverse || 4633 WideningDecision == CM_Interleave); 4634 }; 4635 4636 4637 // Returns true if Ptr is the pointer operand of a memory access instruction 4638 // I, and I is known to not require scalarization. 4639 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4640 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4641 }; 4642 4643 // Holds a list of values which are known to have at least one uniform use. 4644 // Note that there may be other uses which aren't uniform. A "uniform use" 4645 // here is something which only demands lane 0 of the unrolled iterations; 4646 // it does not imply that all lanes produce the same value (e.g. 
this is not 4647 // the usual meaning of uniform) 4648 SetVector<Value *> HasUniformUse; 4649 4650 // Scan the loop for instructions which are either a) known to have only 4651 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 4652 for (auto *BB : TheLoop->blocks()) 4653 for (auto &I : *BB) { 4654 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 4655 switch (II->getIntrinsicID()) { 4656 case Intrinsic::sideeffect: 4657 case Intrinsic::experimental_noalias_scope_decl: 4658 case Intrinsic::assume: 4659 case Intrinsic::lifetime_start: 4660 case Intrinsic::lifetime_end: 4661 if (TheLoop->hasLoopInvariantOperands(&I)) 4662 addToWorklistIfAllowed(&I); 4663 break; 4664 default: 4665 break; 4666 } 4667 } 4668 4669 // ExtractValue instructions must be uniform, because the operands are 4670 // known to be loop-invariant. 4671 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 4672 assert(isOutOfScope(EVI->getAggregateOperand()) && 4673 "Expected aggregate value to be loop invariant"); 4674 addToWorklistIfAllowed(EVI); 4675 continue; 4676 } 4677 4678 // If there's no pointer operand, there's nothing to do. 4679 auto *Ptr = getLoadStorePointerOperand(&I); 4680 if (!Ptr) 4681 continue; 4682 4683 // A uniform memory op is itself uniform. We exclude uniform stores 4684 // here as they demand the last lane, not the first one. 4685 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 4686 addToWorklistIfAllowed(&I); 4687 4688 if (isUniformDecision(&I, VF)) { 4689 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 4690 HasUniformUse.insert(Ptr); 4691 } 4692 } 4693 4694 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 4695 // demanding) users. Since loops are assumed to be in LCSSA form, this 4696 // disallows uses outside the loop as well. 4697 for (auto *V : HasUniformUse) { 4698 if (isOutOfScope(V)) 4699 continue; 4700 auto *I = cast<Instruction>(V); 4701 auto UsersAreMemAccesses = 4702 llvm::all_of(I->users(), [&](User *U) -> bool { 4703 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 4704 }); 4705 if (UsersAreMemAccesses) 4706 addToWorklistIfAllowed(I); 4707 } 4708 4709 // Expand Worklist in topological order: whenever a new instruction 4710 // is added , its users should be already inside Worklist. It ensures 4711 // a uniform instruction will only be used by uniform instructions. 4712 unsigned idx = 0; 4713 while (idx != Worklist.size()) { 4714 Instruction *I = Worklist[idx++]; 4715 4716 for (auto OV : I->operand_values()) { 4717 // isOutOfScope operands cannot be uniform instructions. 4718 if (isOutOfScope(OV)) 4719 continue; 4720 // First order recurrence Phi's should typically be considered 4721 // non-uniform. 4722 auto *OP = dyn_cast<PHINode>(OV); 4723 if (OP && Legal->isFirstOrderRecurrence(OP)) 4724 continue; 4725 // If all the users of the operand are uniform, then add the 4726 // operand into the uniform worklist. 4727 auto *OI = cast<Instruction>(OV); 4728 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4729 auto *J = cast<Instruction>(U); 4730 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 4731 })) 4732 addToWorklistIfAllowed(OI); 4733 } 4734 } 4735 4736 // For an instruction to be added into Worklist above, all its users inside 4737 // the loop should also be in Worklist. However, this condition cannot be 4738 // true for phi nodes that form a cyclic dependence. We must process phi 4739 // nodes separately. 
An induction variable will remain uniform if all users 4740 // of the induction variable and induction variable update remain uniform. 4741 // The code below handles both pointer and non-pointer induction variables. 4742 for (auto &Induction : Legal->getInductionVars()) { 4743 auto *Ind = Induction.first; 4744 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4745 4746 // Determine if all users of the induction variable are uniform after 4747 // vectorization. 4748 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4749 auto *I = cast<Instruction>(U); 4750 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4751 isVectorizedMemAccessUse(I, Ind); 4752 }); 4753 if (!UniformInd) 4754 continue; 4755 4756 // Determine if all users of the induction variable update instruction are 4757 // uniform after vectorization. 4758 auto UniformIndUpdate = 4759 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4760 auto *I = cast<Instruction>(U); 4761 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4762 isVectorizedMemAccessUse(I, IndUpdate); 4763 }); 4764 if (!UniformIndUpdate) 4765 continue; 4766 4767 // The induction variable and its update instruction will remain uniform. 4768 addToWorklistIfAllowed(Ind); 4769 addToWorklistIfAllowed(IndUpdate); 4770 } 4771 4772 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4773 } 4774 4775 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4776 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4777 4778 if (Legal->getRuntimePointerChecking()->Need) { 4779 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4780 "runtime pointer checks needed. Enable vectorization of this " 4781 "loop with '#pragma clang loop vectorize(enable)' when " 4782 "compiling with -Os/-Oz", 4783 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4784 return true; 4785 } 4786 4787 if (!PSE.getPredicate().isAlwaysTrue()) { 4788 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4789 "runtime SCEV checks needed. Enable vectorization of this " 4790 "loop with '#pragma clang loop vectorize(enable)' when " 4791 "compiling with -Os/-Oz", 4792 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4793 return true; 4794 } 4795 4796 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4797 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4798 reportVectorizationFailure("Runtime stride check for small trip count", 4799 "runtime stride == 1 checks needed. Enable vectorization of " 4800 "this loop without such check by compiling with -Os/-Oz", 4801 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4802 return true; 4803 } 4804 4805 return false; 4806 } 4807 4808 ElementCount 4809 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 4810 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 4811 return ElementCount::getScalable(0); 4812 4813 if (Hints->isScalableVectorizationDisabled()) { 4814 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 4815 "ScalableVectorizationDisabled", ORE, TheLoop); 4816 return ElementCount::getScalable(0); 4817 } 4818 4819 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 4820 4821 auto MaxScalableVF = ElementCount::getScalable( 4822 std::numeric_limits<ElementCount::ScalarTy>::max()); 4823 4824 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 
4825 // FIXME: While for scalable vectors this is currently sufficient, this should 4826 // be replaced by a more detailed mechanism that filters out specific VFs, 4827 // instead of invalidating vectorization for a whole set of VFs based on the 4828 // MaxVF. 4829 4830 // Disable scalable vectorization if the loop contains unsupported reductions. 4831 if (!canVectorizeReductions(MaxScalableVF)) { 4832 reportVectorizationInfo( 4833 "Scalable vectorization not supported for the reduction " 4834 "operations found in this loop.", 4835 "ScalableVFUnfeasible", ORE, TheLoop); 4836 return ElementCount::getScalable(0); 4837 } 4838 4839 // Disable scalable vectorization if the loop contains any instructions 4840 // with element types not supported for scalable vectors. 4841 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 4842 return !Ty->isVoidTy() && 4843 !this->TTI.isElementTypeLegalForScalableVector(Ty); 4844 })) { 4845 reportVectorizationInfo("Scalable vectorization is not supported " 4846 "for all element types found in this loop.", 4847 "ScalableVFUnfeasible", ORE, TheLoop); 4848 return ElementCount::getScalable(0); 4849 } 4850 4851 if (Legal->isSafeForAnyVectorWidth()) 4852 return MaxScalableVF; 4853 4854 // Limit MaxScalableVF by the maximum safe dependence distance. 4855 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 4856 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 4857 MaxVScale = 4858 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 4859 MaxScalableVF = ElementCount::getScalable( 4860 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 4861 if (!MaxScalableVF) 4862 reportVectorizationInfo( 4863 "Max legal vector width too small, scalable vectorization " 4864 "unfeasible.", 4865 "ScalableVFUnfeasible", ORE, TheLoop); 4866 4867 return MaxScalableVF; 4868 } 4869 4870 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 4871 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 4872 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4873 unsigned SmallestType, WidestType; 4874 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4875 4876 // Get the maximum safe dependence distance in bits computed by LAA. 4877 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4878 // the memory accesses that is most restrictive (involved in the smallest 4879 // dependence distance). 4880 unsigned MaxSafeElements = 4881 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 4882 4883 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 4884 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 4885 4886 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 4887 << ".\n"); 4888 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 4889 << ".\n"); 4890 4891 // First analyze the UserVF, fall back if the UserVF should be ignored. 4892 if (UserVF) { 4893 auto MaxSafeUserVF = 4894 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 4895 4896 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 4897 // If `VF=vscale x N` is safe, then so is `VF=N` 4898 if (UserVF.isScalable()) 4899 return FixedScalableVFPair( 4900 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 4901 else 4902 return UserVF; 4903 } 4904 4905 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 4906 4907 // Only clamp if the UserVF is not scalable. 
If the UserVF is scalable, it 4908 // is better to ignore the hint and let the compiler choose a suitable VF. 4909 if (!UserVF.isScalable()) { 4910 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4911 << " is unsafe, clamping to max safe VF=" 4912 << MaxSafeFixedVF << ".\n"); 4913 ORE->emit([&]() { 4914 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4915 TheLoop->getStartLoc(), 4916 TheLoop->getHeader()) 4917 << "User-specified vectorization factor " 4918 << ore::NV("UserVectorizationFactor", UserVF) 4919 << " is unsafe, clamping to maximum safe vectorization factor " 4920 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 4921 }); 4922 return MaxSafeFixedVF; 4923 } 4924 4925 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 4926 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4927 << " is ignored because scalable vectors are not " 4928 "available.\n"); 4929 ORE->emit([&]() { 4930 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4931 TheLoop->getStartLoc(), 4932 TheLoop->getHeader()) 4933 << "User-specified vectorization factor " 4934 << ore::NV("UserVectorizationFactor", UserVF) 4935 << " is ignored because the target does not support scalable " 4936 "vectors. The compiler will pick a more suitable value."; 4937 }); 4938 } else { 4939 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4940 << " is unsafe. Ignoring scalable UserVF.\n"); 4941 ORE->emit([&]() { 4942 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4943 TheLoop->getStartLoc(), 4944 TheLoop->getHeader()) 4945 << "User-specified vectorization factor " 4946 << ore::NV("UserVectorizationFactor", UserVF) 4947 << " is unsafe. Ignoring the hint to let the compiler pick a " 4948 "more suitable value."; 4949 }); 4950 } 4951 } 4952 4953 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 4954 << " / " << WidestType << " bits.\n"); 4955 4956 FixedScalableVFPair Result(ElementCount::getFixed(1), 4957 ElementCount::getScalable(0)); 4958 if (auto MaxVF = 4959 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 4960 MaxSafeFixedVF, FoldTailByMasking)) 4961 Result.FixedVF = MaxVF; 4962 4963 if (auto MaxVF = 4964 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 4965 MaxSafeScalableVF, FoldTailByMasking)) 4966 if (MaxVF.isScalable()) { 4967 Result.ScalableVF = MaxVF; 4968 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 4969 << "\n"); 4970 } 4971 4972 return Result; 4973 } 4974 4975 FixedScalableVFPair 4976 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 4977 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4978 // TODO: It may by useful to do since it's still likely to be dynamically 4979 // uniform if the target can skip. 4980 reportVectorizationFailure( 4981 "Not inserting runtime ptr check for divergent target", 4982 "runtime pointer checks needed. 
Not enabled for divergent target",
4983 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4984 return FixedScalableVFPair::getNone();
4985 }
4986
4987 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4988 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4989 if (TC == 1) {
4990 reportVectorizationFailure("Single iteration (non) loop",
4991 "loop trip count is one, irrelevant for vectorization",
4992 "SingleIterationLoop", ORE, TheLoop);
4993 return FixedScalableVFPair::getNone();
4994 }
4995
4996 switch (ScalarEpilogueStatus) {
4997 case CM_ScalarEpilogueAllowed:
4998 return computeFeasibleMaxVF(TC, UserVF, false);
4999 case CM_ScalarEpilogueNotAllowedUsePredicate:
5000 LLVM_FALLTHROUGH;
5001 case CM_ScalarEpilogueNotNeededUsePredicate:
5002 LLVM_DEBUG(
5003 dbgs() << "LV: vector predicate hint/switch found.\n"
5004 << "LV: Not allowing scalar epilogue, creating predicated "
5005 << "vector loop.\n");
5006 break;
5007 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5008 // fallthrough as a special case of OptForSize
5009 case CM_ScalarEpilogueNotAllowedOptSize:
5010 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5011 LLVM_DEBUG(
5012 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5013 else
5014 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5015 << "count.\n");
5016
5017 // Bail if runtime checks are required, which are not good when optimising
5018 // for size.
5019 if (runtimeChecksRequired())
5020 return FixedScalableVFPair::getNone();
5021
5022 break;
5023 }
5024
5025 // The only loops we can vectorize without a scalar epilogue are loops with
5026 // a bottom-test and a single exiting block. We'd have to handle the fact
5027 // that not every instruction executes on the last iteration. This will
5028 // require a lane mask which varies through the vector loop body. (TODO)
5029 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5030 // If there was a tail-folding hint/switch, but we can't fold the tail by
5031 // masking, fallback to a vectorization with a scalar epilogue.
5032 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5033 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5034 "scalar epilogue instead.\n");
5035 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5036 return computeFeasibleMaxVF(TC, UserVF, false);
5037 }
5038 return FixedScalableVFPair::getNone();
5039 }
5040
5041 // Now try the tail folding
5042
5043 // Invalidate interleave groups that require an epilogue if we can't mask
5044 // the interleave-group.
5045 if (!useMaskedInterleavedAccesses(TTI)) {
5046 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5047 "No decisions should have been taken at this point");
5048 // Note: There is no need to invalidate any cost modeling decisions here, as
5049 // none were taken so far.
5050 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5051 }
5052
5053 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5054 // Avoid tail folding if the trip count is known to be a multiple of any VF
5055 // we chose.
5056 // FIXME: The condition below pessimises the case for fixed-width vectors,
5057 // when scalable VFs are also candidates for vectorization.
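// For example, with a known trip count of 64, MaxFixedVF = 8 and no
// user-specified interleave count, 64 urem 8 == 0, so no scalar tail remains
// and tail folding is unnecessary.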
5058 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5059 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5060 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5061 "MaxFixedVF must be a power of 2"); 5062 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5063 : MaxFixedVF.getFixedValue(); 5064 ScalarEvolution *SE = PSE.getSE(); 5065 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5066 const SCEV *ExitCount = SE->getAddExpr( 5067 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5068 const SCEV *Rem = SE->getURemExpr( 5069 SE->applyLoopGuards(ExitCount, TheLoop), 5070 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5071 if (Rem->isZero()) { 5072 // Accept MaxFixedVF if we do not have a tail. 5073 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5074 return MaxFactors; 5075 } 5076 } 5077 5078 // If we don't know the precise trip count, or if the trip count that we 5079 // found modulo the vectorization factor is not zero, try to fold the tail 5080 // by masking. 5081 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5082 if (Legal->prepareToFoldTailByMasking()) { 5083 FoldTailByMasking = true; 5084 return MaxFactors; 5085 } 5086 5087 // If there was a tail-folding hint/switch, but we can't fold the tail by 5088 // masking, fallback to a vectorization with a scalar epilogue. 5089 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5090 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5091 "scalar epilogue instead.\n"); 5092 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5093 return MaxFactors; 5094 } 5095 5096 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5097 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5098 return FixedScalableVFPair::getNone(); 5099 } 5100 5101 if (TC == 0) { 5102 reportVectorizationFailure( 5103 "Unable to calculate the loop count due to complex control flow", 5104 "unable to calculate the loop count due to complex control flow", 5105 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5106 return FixedScalableVFPair::getNone(); 5107 } 5108 5109 reportVectorizationFailure( 5110 "Cannot optimize for size and vectorize at the same time.", 5111 "cannot optimize for size and vectorize at the same time. " 5112 "Enable vectorization of this loop with '#pragma clang loop " 5113 "vectorize(enable)' when compiling with -Os/-Oz", 5114 "NoTailLoopWithOptForSize", ORE, TheLoop); 5115 return FixedScalableVFPair::getNone(); 5116 } 5117 5118 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5119 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5120 ElementCount MaxSafeVF, bool FoldTailByMasking) { 5121 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5122 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5123 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5124 : TargetTransformInfo::RGK_FixedWidthVector); 5125 5126 // Convenience function to return the minimum of two ElementCounts. 5127 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5128 assert((LHS.isScalable() == RHS.isScalable()) && 5129 "Scalable flags must match"); 5130 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5131 }; 5132 5133 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5134 // Note that both WidestRegister and WidestType may not be a powers of 2. 
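// For example, a 128-bit vector register and a widest loop type of 32 bits
// give PowerOf2Floor(128 / 32) = 4 lanes, i.e. a maximum VF of 4 (or
// vscale x 4 when computing the scalable maximum).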
5135 auto MaxVectorElementCount = ElementCount::get( 5136 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5137 ComputeScalableMaxVF); 5138 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5139 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5140 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5141 5142 if (!MaxVectorElementCount) { 5143 LLVM_DEBUG(dbgs() << "LV: The target has no " 5144 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5145 << " vector registers.\n"); 5146 return ElementCount::getFixed(1); 5147 } 5148 5149 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5150 if (ConstTripCount && 5151 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5152 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5153 // If loop trip count (TC) is known at compile time there is no point in 5154 // choosing VF greater than TC (as done in the loop below). Select maximum 5155 // power of two which doesn't exceed TC. 5156 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5157 // when the TC is less than or equal to the known number of lanes. 5158 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5159 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5160 "exceeding the constant trip count: " 5161 << ClampedConstTripCount << "\n"); 5162 return ElementCount::getFixed(ClampedConstTripCount); 5163 } 5164 5165 TargetTransformInfo::RegisterKind RegKind = 5166 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5167 : TargetTransformInfo::RGK_FixedWidthVector; 5168 ElementCount MaxVF = MaxVectorElementCount; 5169 if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && 5170 TTI.shouldMaximizeVectorBandwidth(RegKind))) { 5171 auto MaxVectorElementCountMaxBW = ElementCount::get( 5172 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5173 ComputeScalableMaxVF); 5174 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5175 5176 // Collect all viable vectorization factors larger than the default MaxVF 5177 // (i.e. MaxVectorElementCount). 5178 SmallVector<ElementCount, 8> VFs; 5179 for (ElementCount VS = MaxVectorElementCount * 2; 5180 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5181 VFs.push_back(VS); 5182 5183 // For each VF calculate its register usage. 5184 auto RUs = calculateRegisterUsage(VFs); 5185 5186 // Select the largest VF which doesn't require more registers than existing 5187 // ones. 5188 for (int i = RUs.size() - 1; i >= 0; --i) { 5189 bool Selected = true; 5190 for (auto &pair : RUs[i].MaxLocalUsers) { 5191 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5192 if (pair.second > TargetNumRegisters) 5193 Selected = false; 5194 } 5195 if (Selected) { 5196 MaxVF = VFs[i]; 5197 break; 5198 } 5199 } 5200 if (ElementCount MinVF = 5201 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5202 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5203 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5204 << ") with target's minimum: " << MinVF << '\n'); 5205 MaxVF = MinVF; 5206 } 5207 } 5208 5209 // Invalidate any widening decisions we might have made, in case the loop 5210 // requires prediction (decided later), but we have already made some 5211 // load/store widening decisions. 
5212 invalidateCostModelingDecisions(); 5213 } 5214 return MaxVF; 5215 } 5216 5217 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { 5218 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5219 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5220 auto Min = Attr.getVScaleRangeMin(); 5221 auto Max = Attr.getVScaleRangeMax(); 5222 if (Max && Min == Max) 5223 return Max; 5224 } 5225 5226 return TTI.getVScaleForTuning(); 5227 } 5228 5229 bool LoopVectorizationCostModel::isMoreProfitable( 5230 const VectorizationFactor &A, const VectorizationFactor &B) const { 5231 InstructionCost CostA = A.Cost; 5232 InstructionCost CostB = B.Cost; 5233 5234 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5235 5236 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5237 MaxTripCount) { 5238 // If we are folding the tail and the trip count is a known (possibly small) 5239 // constant, the trip count will be rounded up to an integer number of 5240 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5241 // which we compare directly. When not folding the tail, the total cost will 5242 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5243 // approximated with the per-lane cost below instead of using the tripcount 5244 // as here. 5245 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5246 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5247 return RTCostA < RTCostB; 5248 } 5249 5250 // Improve estimate for the vector width if it is scalable. 5251 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5252 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5253 if (Optional<unsigned> VScale = getVScaleForTuning()) { 5254 if (A.Width.isScalable()) 5255 EstimatedWidthA *= VScale.getValue(); 5256 if (B.Width.isScalable()) 5257 EstimatedWidthB *= VScale.getValue(); 5258 } 5259 5260 // Assume vscale may be larger than 1 (or the value being tuned for), 5261 // so that scalable vectorization is slightly favorable over fixed-width 5262 // vectorization. 5263 if (A.Width.isScalable() && !B.Width.isScalable()) 5264 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5265 5266 // To avoid the need for FP division: 5267 // (CostA / A.Width) < (CostB / B.Width) 5268 // <=> (CostA * B.Width) < (CostB * A.Width) 5269 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5270 } 5271 5272 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5273 const ElementCountSet &VFCandidates) { 5274 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5275 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5276 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5277 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5278 "Expected Scalar VF to be a candidate"); 5279 5280 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, 5281 ExpectedCost); 5282 VectorizationFactor ChosenFactor = ScalarCost; 5283 5284 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5285 if (ForceVectorization && VFCandidates.size() > 1) { 5286 // Ignore scalar width, because the user explicitly wants vectorization. 5287 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5288 // evaluation. 
5289 ChosenFactor.Cost = InstructionCost::getMax();
5290 }
5291
5292 SmallVector<InstructionVFPair> InvalidCosts;
5293 for (const auto &i : VFCandidates) {
5294 // The cost for scalar VF=1 is already calculated, so ignore it.
5295 if (i.isScalar())
5296 continue;
5297
5298 VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5299 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5300
5301 #ifndef NDEBUG
5302 unsigned AssumedMinimumVscale = 1;
5303 if (Optional<unsigned> VScale = getVScaleForTuning())
5304 AssumedMinimumVscale = *VScale;
5305 unsigned Width =
5306 Candidate.Width.isScalable()
5307 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5308 : Candidate.Width.getFixedValue();
5309 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5310 << " costs: " << (Candidate.Cost / Width));
5311 if (i.isScalable())
5312 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5313 << AssumedMinimumVscale << ")");
5314 LLVM_DEBUG(dbgs() << ".\n");
5315 #endif
5316
5317 if (!C.second && !ForceVectorization) {
5318 LLVM_DEBUG(
5319 dbgs() << "LV: Not considering vector loop of width " << i
5320 << " because it will not generate any vector instructions.\n");
5321 continue;
5322 }
5323
5324 // If profitable, add it to the ProfitableVFs list.
5325 if (isMoreProfitable(Candidate, ScalarCost))
5326 ProfitableVFs.push_back(Candidate);
5327
5328 if (isMoreProfitable(Candidate, ChosenFactor))
5329 ChosenFactor = Candidate;
5330 }
5331
5332 // Emit a report of VFs with invalid costs in the loop.
5333 if (!InvalidCosts.empty()) {
5334 // Group the remarks per instruction, keeping the instruction order from
5335 // InvalidCosts.
5336 std::map<Instruction *, unsigned> Numbering;
5337 unsigned I = 0;
5338 for (auto &Pair : InvalidCosts)
5339 if (!Numbering.count(Pair.first))
5340 Numbering[Pair.first] = I++;
5341
5342 // Sort the list, first on instruction(number) then on VF.
5343 llvm::sort(InvalidCosts,
5344 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5345 if (Numbering[A.first] != Numbering[B.first])
5346 return Numbering[A.first] < Numbering[B.first];
5347 ElementCountComparator ECC;
5348 return ECC(A.second, B.second);
5349 });
5350
5351 // For a list of ordered instruction-vf pairs:
5352 // [(load, vf1), (load, vf2), (store, vf1)]
5353 // Group the instructions together to emit separate remarks for:
5354 // load (vf1, vf2)
5355 // store (vf1)
5356 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5357 auto Subset = ArrayRef<InstructionVFPair>();
5358 do {
5359 if (Subset.empty())
5360 Subset = Tail.take_front(1);
5361
5362 Instruction *I = Subset.front().first;
5363
5364 // If the next instruction is different, or if there are no other pairs,
5365 // emit a remark for the collated subset. e.g.
5366 // [(load, vf1), (load, vf2)]
5367 // to emit:
5368 // remark: invalid costs for 'load' at VF=(vf1, vf2)
5369 if (Subset == Tail || Tail[Subset.size()].first != I) {
5370 std::string OutString;
5371 raw_string_ostream OS(OutString);
5372 assert(!Subset.empty() && "Unexpected empty range");
5373 OS << "Instruction with invalid costs prevented vectorization at VF=(";
5374 for (auto &Pair : Subset)
5375 OS << (Pair.second == Subset.front().second ? "" : ", ")
5376 << Pair.second;
5377 OS << "):";
5378 if (auto *CI = dyn_cast<CallInst>(I))
5379 OS << " call to " << CI->getCalledFunction()->getName();
5380 else
5381 OS << " " << I->getOpcodeName();
5382 OS.flush();
5383 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5384 Tail = Tail.drop_front(Subset.size());
5385 Subset = {};
5386 } else
5387 // Grow the subset by one element
5388 Subset = Tail.take_front(Subset.size() + 1);
5389 } while (!Tail.empty());
5390 }
5391
5392 if (!EnableCondStoresVectorization && NumPredStores) {
5393 reportVectorizationFailure("There are conditional stores.",
5394 "store that is conditionally executed prevents vectorization",
5395 "ConditionalStore", ORE, TheLoop);
5396 ChosenFactor = ScalarCost;
5397 }
5398
5399 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5400 ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
5401 << "LV: Vectorization seems to be not beneficial, "
5402 << "but was forced by a user.\n");
5403 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5404 return ChosenFactor;
5405 }
5406
5407 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5408 const Loop &L, ElementCount VF) const {
5409 // Cross-iteration phis such as reductions need special handling and are
5410 // currently unsupported.
5411 if (any_of(L.getHeader()->phis(),
5412 [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
5413 return false;
5414
5415 // Phis with uses outside of the loop require special handling and are
5416 // currently unsupported.
5417 for (auto &Entry : Legal->getInductionVars()) {
5418 // Look for uses of the value of the induction at the last iteration.
5419 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5420 for (User *U : PostInc->users())
5421 if (!L.contains(cast<Instruction>(U)))
5422 return false;
5423 // Look for uses of the penultimate value of the induction.
5424 for (User *U : Entry.first->users())
5425 if (!L.contains(cast<Instruction>(U)))
5426 return false;
5427 }
5428
5429 // Induction variables that are widened require special handling that is
5430 // currently not supported.
5431 if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5432 return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5433 this->isProfitableToScalarize(Entry.first, VF));
5434 }))
5435 return false;
5436
5437 // Epilogue vectorization code has not been audited to ensure it handles
5438 // non-latch exits properly. It may be fine, but it needs to be audited and
5439 // tested.
5440 if (L.getExitingBlock() != L.getLoopLatch())
5441 return false;
5442
5443 return true;
5444 }
5445
5446 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5447 const ElementCount VF) const {
5448 // FIXME: We need a much better cost-model to take different parameters such
5449 // as register pressure, code size increase and cost of extra branches into
5450 // account. For now we apply a very crude heuristic and only consider loops
5451 // with vectorization factors larger than a certain value.
5452 // We also consider epilogue vectorization unprofitable for targets that don't
5453 // consider interleaving beneficial (e.g. MVE).
5454 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5455 return false;
5456 // FIXME: We should consider changing the threshold for scalable
5457 // vectors to take VScaleForTuning into account.
5458 if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF) 5459 return true; 5460 return false; 5461 } 5462 5463 VectorizationFactor 5464 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5465 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5466 VectorizationFactor Result = VectorizationFactor::Disabled(); 5467 if (!EnableEpilogueVectorization) { 5468 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5469 return Result; 5470 } 5471 5472 if (!isScalarEpilogueAllowed()) { 5473 LLVM_DEBUG( 5474 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5475 "allowed.\n";); 5476 return Result; 5477 } 5478 5479 // Not really a cost consideration, but check for unsupported cases here to 5480 // simplify the logic. 5481 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5482 LLVM_DEBUG( 5483 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5484 "not a supported candidate.\n";); 5485 return Result; 5486 } 5487 5488 if (EpilogueVectorizationForceVF > 1) { 5489 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5490 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5491 if (LVP.hasPlanWithVF(ForcedEC)) 5492 return {ForcedEC, 0, 0}; 5493 else { 5494 LLVM_DEBUG( 5495 dbgs() 5496 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5497 return Result; 5498 } 5499 } 5500 5501 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5502 TheLoop->getHeader()->getParent()->hasMinSize()) { 5503 LLVM_DEBUG( 5504 dbgs() 5505 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5506 return Result; 5507 } 5508 5509 if (!isEpilogueVectorizationProfitable(MainLoopVF)) { 5510 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5511 "this loop\n"); 5512 return Result; 5513 } 5514 5515 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 5516 // the main loop handles 8 lanes per iteration. We could still benefit from 5517 // vectorizing the epilogue loop with VF=4. 5518 ElementCount EstimatedRuntimeVF = MainLoopVF; 5519 if (MainLoopVF.isScalable()) { 5520 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5521 if (Optional<unsigned> VScale = getVScaleForTuning()) 5522 EstimatedRuntimeVF *= *VScale; 5523 } 5524 5525 for (auto &NextVF : ProfitableVFs) 5526 if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5527 ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || 5528 ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && 5529 (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && 5530 LVP.hasPlanWithVF(NextVF.Width)) 5531 Result = NextVF; 5532 5533 if (Result != VectorizationFactor::Disabled()) 5534 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5535 << Result.Width << "\n";); 5536 return Result; 5537 } 5538 5539 std::pair<unsigned, unsigned> 5540 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5541 unsigned MinWidth = -1U; 5542 unsigned MaxWidth = 8; 5543 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5544 // For in-loop reductions, no element types are added to ElementTypesInLoop 5545 // if there are no loads/stores in the loop. In this case, check through the 5546 // reduction variables to determine the maximum width. 5547 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5548 // Reset MaxWidth so that we can find the smallest type used by recurrences 5549 // in the loop. 
5550 MaxWidth = -1U; 5551 for (auto &PhiDescriptorPair : Legal->getReductionVars()) { 5552 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5553 // When finding the min width used by the recurrence we need to account 5554 // for casts on the input operands of the recurrence. 5555 MaxWidth = std::min<unsigned>( 5556 MaxWidth, std::min<unsigned>( 5557 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5558 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5559 } 5560 } else { 5561 for (Type *T : ElementTypesInLoop) { 5562 MinWidth = std::min<unsigned>( 5563 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5564 MaxWidth = std::max<unsigned>( 5565 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5566 } 5567 } 5568 return {MinWidth, MaxWidth}; 5569 } 5570 5571 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5572 ElementTypesInLoop.clear(); 5573 // For each block. 5574 for (BasicBlock *BB : TheLoop->blocks()) { 5575 // For each instruction in the loop. 5576 for (Instruction &I : BB->instructionsWithoutDebug()) { 5577 Type *T = I.getType(); 5578 5579 // Skip ignored values. 5580 if (ValuesToIgnore.count(&I)) 5581 continue; 5582 5583 // Only examine Loads, Stores and PHINodes. 5584 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5585 continue; 5586 5587 // Examine PHI nodes that are reduction variables. Update the type to 5588 // account for the recurrence type. 5589 if (auto *PN = dyn_cast<PHINode>(&I)) { 5590 if (!Legal->isReductionVariable(PN)) 5591 continue; 5592 const RecurrenceDescriptor &RdxDesc = 5593 Legal->getReductionVars().find(PN)->second; 5594 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5595 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5596 RdxDesc.getRecurrenceType(), 5597 TargetTransformInfo::ReductionFlags())) 5598 continue; 5599 T = RdxDesc.getRecurrenceType(); 5600 } 5601 5602 // Examine the stored values. 5603 if (auto *ST = dyn_cast<StoreInst>(&I)) 5604 T = ST->getValueOperand()->getType(); 5605 5606 assert(T->isSized() && 5607 "Expected the load/store/recurrence type to be sized"); 5608 5609 ElementTypesInLoop.insert(T); 5610 } 5611 } 5612 } 5613 5614 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5615 unsigned LoopCost) { 5616 // -- The interleave heuristics -- 5617 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5618 // There are many micro-architectural considerations that we can't predict 5619 // at this level. For example, frontend pressure (on decode or fetch) due to 5620 // code size, or the number and capabilities of the execution ports. 5621 // 5622 // We use the following heuristics to select the interleave count: 5623 // 1. If the code has reductions, then we interleave to break the cross 5624 // iteration dependency. 5625 // 2. If the loop is really small, then we interleave to reduce the loop 5626 // overhead. 5627 // 3. We don't interleave if we think that we will spill registers to memory 5628 // due to the increased register pressure. 5629 5630 if (!isScalarEpilogueAllowed()) 5631 return 1; 5632 5633 // We used the distance for the interleave count. 5634 if (Legal->getMaxSafeDepDistBytes() != -1U) 5635 return 1; 5636 5637 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5638 const bool HasReductions = !Legal->getReductionVars().empty(); 5639 // Do not interleave loops with a relatively small known or estimated trip 5640 // count. 
But we will interleave when InterleaveSmallLoopScalarReduction is 5641 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5642 // because with the above conditions interleaving can expose ILP and break 5643 // cross iteration dependences for reductions. 5644 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5645 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5646 return 1; 5647 5648 // If we did not calculate the cost for VF (because the user selected the VF) 5649 // then we calculate the cost of VF here. 5650 if (LoopCost == 0) { 5651 InstructionCost C = expectedCost(VF).first; 5652 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 5653 LoopCost = *C.getValue(); 5654 5655 // Loop body is free and there is no need for interleaving. 5656 if (LoopCost == 0) 5657 return 1; 5658 } 5659 5660 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5661 // We divide by these constants so assume that we have at least one 5662 // instruction that uses at least one register. 5663 for (auto& pair : R.MaxLocalUsers) { 5664 pair.second = std::max(pair.second, 1U); 5665 } 5666 5667 // We calculate the interleave count using the following formula. 5668 // Subtract the number of loop invariants from the number of available 5669 // registers. These registers are used by all of the interleaved instances. 5670 // Next, divide the remaining registers by the number of registers that is 5671 // required by the loop, in order to estimate how many parallel instances 5672 // fit without causing spills. All of this is rounded down if necessary to be 5673 // a power of two. We want power of two interleave count to simplify any 5674 // addressing operations or alignment considerations. 5675 // We also want power of two interleave counts to ensure that the induction 5676 // variable of the vector loop wraps to zero, when tail is folded by masking; 5677 // this currently happens when OptForSize, in which case IC is set to 1 above. 5678 unsigned IC = UINT_MAX; 5679 5680 for (auto& pair : R.MaxLocalUsers) { 5681 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5682 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5683 << " registers of " 5684 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5685 if (VF.isScalar()) { 5686 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5687 TargetNumRegisters = ForceTargetNumScalarRegs; 5688 } else { 5689 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5690 TargetNumRegisters = ForceTargetNumVectorRegs; 5691 } 5692 unsigned MaxLocalUsers = pair.second; 5693 unsigned LoopInvariantRegs = 0; 5694 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5695 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5696 5697 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5698 // Don't count the induction variable as interleaved. 5699 if (EnableIndVarRegisterHeur) { 5700 TmpIC = 5701 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5702 std::max(1U, (MaxLocalUsers - 1))); 5703 } 5704 5705 IC = std::min(IC, TmpIC); 5706 } 5707 5708 // Clamp the interleave ranges to reasonable counts. 5709 unsigned MaxInterleaveCount = 5710 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 5711 5712 // Check if the user has overridden the max. 
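  // Worked example for the clamping further below (hypothetical numbers, for
  // illustration only): with a known trip count of 12 and VF = 4, the trip
  // count based clamp limits MaxInterleaveCount to at most 12 / 4 = 3,
  // regardless of what the target or a user-forced maximum would allow.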
5713 if (VF.isScalar()) { 5714 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5715 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5716 } else { 5717 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5718 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5719 } 5720 5721 // If trip count is known or estimated compile time constant, limit the 5722 // interleave count to be less than the trip count divided by VF, provided it 5723 // is at least 1. 5724 // 5725 // For scalable vectors we can't know if interleaving is beneficial. It may 5726 // not be beneficial for small loops if none of the lanes in the second vector 5727 // iterations is enabled. However, for larger loops, there is likely to be a 5728 // similar benefit as for fixed-width vectors. For now, we choose to leave 5729 // the InterleaveCount as if vscale is '1', although if some information about 5730 // the vector is known (e.g. min vector size), we can make a better decision. 5731 if (BestKnownTC) { 5732 MaxInterleaveCount = 5733 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5734 // Make sure MaxInterleaveCount is greater than 0. 5735 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 5736 } 5737 5738 assert(MaxInterleaveCount > 0 && 5739 "Maximum interleave count must be greater than 0"); 5740 5741 // Clamp the calculated IC to be between the 1 and the max interleave count 5742 // that the target and trip count allows. 5743 if (IC > MaxInterleaveCount) 5744 IC = MaxInterleaveCount; 5745 else 5746 // Make sure IC is greater than 0. 5747 IC = std::max(1u, IC); 5748 5749 assert(IC > 0 && "Interleave count must be greater than 0."); 5750 5751 // Interleave if we vectorized this loop and there is a reduction that could 5752 // benefit from interleaving. 5753 if (VF.isVector() && HasReductions) { 5754 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5755 return IC; 5756 } 5757 5758 // For any scalar loop that either requires runtime checks or predication we 5759 // are better off leaving this to the unroller. Note that if we've already 5760 // vectorized the loop we will have done the runtime check and so interleaving 5761 // won't require further checks. 5762 bool ScalarInterleavingRequiresPredication = 5763 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5764 return Legal->blockNeedsPredication(BB); 5765 })); 5766 bool ScalarInterleavingRequiresRuntimePointerCheck = 5767 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5768 5769 // We want to interleave small loops in order to reduce the loop overhead and 5770 // potentially expose ILP opportunities. 5771 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5772 << "LV: IC is " << IC << '\n' 5773 << "LV: VF is " << VF << '\n'); 5774 const bool AggressivelyInterleaveReductions = 5775 TTI.enableAggressiveInterleaving(HasReductions); 5776 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5777 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5778 // We assume that the cost overhead is 1 and we use the cost model 5779 // to estimate the cost of the loop and interleave until the cost of the 5780 // loop overhead is about 5% of the cost of the loop. 5781 unsigned SmallIC = 5782 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5783 5784 // Interleave until store/load ports (estimated by max interleave count) are 5785 // saturated. 
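  // Illustration of the port-saturation heuristic computed below (hypothetical
  // numbers): with IC = 8, four stores and two loads, StoresIC = 8 / 4 = 2 and
  // LoadsIC = 8 / 2 = 4; if load/store runtime interleaving is enabled and
  // max(StoresIC, LoadsIC) exceeds SmallIC, the heuristic would interleave by
  // max(2, 4) = 4 to saturate the load ports.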
5786 unsigned NumStores = Legal->getNumStores(); 5787 unsigned NumLoads = Legal->getNumLoads(); 5788 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5789 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5790 5791 // There is little point in interleaving for reductions containing selects 5792 // and compares when VF=1 since it may just create more overhead than it's 5793 // worth for loops with small trip counts. This is because we still have to 5794 // do the final reduction after the loop. 5795 bool HasSelectCmpReductions = 5796 HasReductions && 5797 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5798 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5799 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 5800 RdxDesc.getRecurrenceKind()); 5801 }); 5802 if (HasSelectCmpReductions) { 5803 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 5804 return 1; 5805 } 5806 5807 // If we have a scalar reduction (vector reductions are already dealt with 5808 // by this point), we can increase the critical path length if the loop 5809 // we're interleaving is inside another loop. For tree-wise reductions 5810 // set the limit to 2, and for ordered reductions it's best to disable 5811 // interleaving entirely. 5812 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5813 bool HasOrderedReductions = 5814 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5815 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5816 return RdxDesc.isOrdered(); 5817 }); 5818 if (HasOrderedReductions) { 5819 LLVM_DEBUG( 5820 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 5821 return 1; 5822 } 5823 5824 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5825 SmallIC = std::min(SmallIC, F); 5826 StoresIC = std::min(StoresIC, F); 5827 LoadsIC = std::min(LoadsIC, F); 5828 } 5829 5830 if (EnableLoadStoreRuntimeInterleave && 5831 std::max(StoresIC, LoadsIC) > SmallIC) { 5832 LLVM_DEBUG( 5833 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5834 return std::max(StoresIC, LoadsIC); 5835 } 5836 5837 // If there are scalar reductions and TTI has enabled aggressive 5838 // interleaving for reductions, we will interleave to expose ILP. 5839 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 5840 AggressivelyInterleaveReductions) { 5841 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5842 // Interleave no less than SmallIC but not as aggressive as the normal IC 5843 // to satisfy the rare situation when resources are too limited. 5844 return std::max(IC / 2, SmallIC); 5845 } else { 5846 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5847 return SmallIC; 5848 } 5849 } 5850 5851 // Interleave if this is a large loop (small loops are already dealt with by 5852 // this point) that could benefit from interleaving. 5853 if (AggressivelyInterleaveReductions) { 5854 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5855 return IC; 5856 } 5857 5858 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5859 return 1; 5860 } 5861 5862 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5863 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 5864 // This function calculates the register usage by measuring the highest number 5865 // of values that are alive at a single location. Obviously, this is a very 5866 // rough estimation. We scan the loop in a topological order in order and 5867 // assign a number to each instruction. 
We use RPO to ensure that defs are
5868 // met before their users. We assume that each instruction that has in-loop
5869 // users starts an interval. We record every time that an in-loop value is
5870 // used, so we have a list of the first and last occurrences of each
5871 // instruction. Next, we transpose this data structure into a multi map that
5872 // holds the list of intervals that *end* at a specific location. This multi
5873 // map allows us to perform a linear search. We scan the instructions linearly
5874 // and record each time that a new interval starts, by placing it in a set.
5875 // If we find this value in the multi-map then we remove it from the set.
5876 // The max register usage is the maximum size of the set.
5877 // We also search for instructions that are defined outside the loop, but are
5878 // used inside the loop. We need this number separately from the max-interval
5879 // usage number because when we unroll, loop-invariant values do not take
5880 // more registers.
5881 LoopBlocksDFS DFS(TheLoop);
5882 DFS.perform(LI);
5883
5884 RegisterUsage RU;
5885
5886 // Each 'key' in the map opens a new interval. The values
5887 // of the map are the index of the 'last seen' usage of the
5888 // instruction that is the key.
5889 using IntervalMap = DenseMap<Instruction *, unsigned>;
5890
5891 // Maps each index to the instruction at that index.
5892 SmallVector<Instruction *, 64> IdxToInstr;
5893 // Marks the end of each interval.
5894 IntervalMap EndPoint;
5895 // Saves the list of instructions that are used in the loop.
5896 SmallPtrSet<Instruction *, 8> Ends;
5897 // Saves the list of values that are used in the loop but are
5898 // defined outside the loop, such as arguments and constants.
5899 SmallPtrSet<Value *, 8> LoopInvariants;
5900
5901 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5902 for (Instruction &I : BB->instructionsWithoutDebug()) {
5903 IdxToInstr.push_back(&I);
5904
5905 // Save the end location of each USE.
5906 for (Value *U : I.operands()) {
5907 auto *Instr = dyn_cast<Instruction>(U);
5908
5909 // Ignore non-instruction values such as arguments, constants, etc.
5910 if (!Instr)
5911 continue;
5912
5913 // If this instruction is outside the loop then record it and continue.
5914 if (!TheLoop->contains(Instr)) {
5915 LoopInvariants.insert(Instr);
5916 continue;
5917 }
5918
5919 // Overwrite previous end points.
5920 EndPoint[Instr] = IdxToInstr.size();
5921 Ends.insert(Instr);
5922 }
5923 }
5924 }
5925
5926 // Saves the list of intervals that end with the index in 'key'.
5927 using InstrList = SmallVector<Instruction *, 2>;
5928 DenseMap<unsigned, InstrList> TransposeEnds;
5929
5930 // Transpose the EndPoints to a list of values that end at each index.
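  // For example, if EndPoint is {A -> 3, B -> 3, C -> 5}, the loop below
  // produces TransposeEnds = {3 -> [A, B], 5 -> [C]}, i.e. the live intervals
  // of A and B both end at instruction index 3.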
5931 for (auto &Interval : EndPoint) 5932 TransposeEnds[Interval.second].push_back(Interval.first); 5933 5934 SmallPtrSet<Instruction *, 8> OpenIntervals; 5935 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5936 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5937 5938 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5939 5940 const auto &TTICapture = TTI; 5941 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 5942 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 5943 return 0; 5944 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 5945 }; 5946 5947 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5948 Instruction *I = IdxToInstr[i]; 5949 5950 // Remove all of the instructions that end at this location. 5951 InstrList &List = TransposeEnds[i]; 5952 for (Instruction *ToRemove : List) 5953 OpenIntervals.erase(ToRemove); 5954 5955 // Ignore instructions that are never used within the loop. 5956 if (!Ends.count(I)) 5957 continue; 5958 5959 // Skip ignored values. 5960 if (ValuesToIgnore.count(I)) 5961 continue; 5962 5963 // For each VF find the maximum usage of registers. 5964 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5965 // Count the number of live intervals. 5966 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5967 5968 if (VFs[j].isScalar()) { 5969 for (auto Inst : OpenIntervals) { 5970 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5971 if (RegUsage.find(ClassID) == RegUsage.end()) 5972 RegUsage[ClassID] = 1; 5973 else 5974 RegUsage[ClassID] += 1; 5975 } 5976 } else { 5977 collectUniformsAndScalars(VFs[j]); 5978 for (auto Inst : OpenIntervals) { 5979 // Skip ignored values for VF > 1. 5980 if (VecValuesToIgnore.count(Inst)) 5981 continue; 5982 if (isScalarAfterVectorization(Inst, VFs[j])) { 5983 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5984 if (RegUsage.find(ClassID) == RegUsage.end()) 5985 RegUsage[ClassID] = 1; 5986 else 5987 RegUsage[ClassID] += 1; 5988 } else { 5989 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5990 if (RegUsage.find(ClassID) == RegUsage.end()) 5991 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5992 else 5993 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5994 } 5995 } 5996 } 5997 5998 for (auto& pair : RegUsage) { 5999 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6000 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6001 else 6002 MaxUsages[j][pair.first] = pair.second; 6003 } 6004 } 6005 6006 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6007 << OpenIntervals.size() << '\n'); 6008 6009 // Add the current instruction to the list of open intervals. 6010 OpenIntervals.insert(I); 6011 } 6012 6013 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6014 SmallMapVector<unsigned, unsigned, 4> Invariant; 6015 6016 for (auto Inst : LoopInvariants) { 6017 unsigned Usage = 6018 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]);
6019 unsigned ClassID =
6020 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6021 if (Invariant.find(ClassID) == Invariant.end())
6022 Invariant[ClassID] = Usage;
6023 else
6024 Invariant[ClassID] += Usage;
6025 }
6026
6027 LLVM_DEBUG({
6028 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6029 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6030 << " item\n";
6031 for (const auto &pair : MaxUsages[i]) {
6032 dbgs() << "LV(REG): RegisterClass: "
6033 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6034 << " registers\n";
6035 }
6036 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6037 << " item\n";
6038 for (const auto &pair : Invariant) {
6039 dbgs() << "LV(REG): RegisterClass: "
6040 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6041 << " registers\n";
6042 }
6043 });
6044
6045 RU.LoopInvariantRegs = Invariant;
6046 RU.MaxLocalUsers = MaxUsages[i];
6047 RUs[i] = RU;
6048 }
6049
6050 return RUs;
6051 }
6052
6053 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
6054 ElementCount VF) {
6055 // TODO: Cost model for emulated masked load/store is completely
6056 // broken. This hack guides the cost model to use an artificially
6057 // high enough value to practically disable vectorization with such
6058 // operations, except where the previously deployed legality hack allowed
6059 // using very low cost values. This is to avoid regressions coming simply
6060 // from moving the "masked load/store" check from legality to the cost model.
6061 // Masked Load/Gather emulation was previously never allowed.
6062 // Only a limited number of Masked Store/Scatter emulations were allowed.
6063 assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
6064 return isa<LoadInst>(I) ||
6065 (isa<StoreInst>(I) &&
6066 NumPredStores > NumberOfStoresToPredicate);
6067 }
6068
6069 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6070 // If we aren't vectorizing the loop, or if we've already collected the
6071 // instructions to scalarize, there's nothing to do. Collection may already
6072 // have occurred if we have a user-selected VF and are now computing the
6073 // expected cost for interleaving.
6074 if (VF.isScalar() || VF.isZero() ||
6075 InstsToScalarize.find(VF) != InstsToScalarize.end())
6076 return;
6077
6078 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6079 // not profitable to scalarize any instructions, the presence of VF in the
6080 // map will indicate that we've analyzed it already.
6081 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6082
6083 // Find all the instructions that are scalar with predication in the loop and
6084 // determine if it would be better to not if-convert the blocks they are in.
6085 // If so, we also record the instructions to scalarize.
6086 for (BasicBlock *BB : TheLoop->blocks()) {
6087 if (!blockNeedsPredicationForAnyReason(BB))
6088 continue;
6089 for (Instruction &I : *BB)
6090 if (isScalarWithPredication(&I, VF)) {
6091 ScalarCostsTy ScalarCosts;
6092 // Do not apply discount if scalable, because that would lead to
6093 // invalid scalarization costs.
6094 // Do not apply discount logic if hacked cost is needed
6095 // for emulated masked memrefs.
6096 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && 6097 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6098 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6099 // Remember that BB will remain after vectorization. 6100 PredicatedBBsAfterVectorization.insert(BB); 6101 } 6102 } 6103 } 6104 6105 int LoopVectorizationCostModel::computePredInstDiscount( 6106 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6107 assert(!isUniformAfterVectorization(PredInst, VF) && 6108 "Instruction marked uniform-after-vectorization will be predicated"); 6109 6110 // Initialize the discount to zero, meaning that the scalar version and the 6111 // vector version cost the same. 6112 InstructionCost Discount = 0; 6113 6114 // Holds instructions to analyze. The instructions we visit are mapped in 6115 // ScalarCosts. Those instructions are the ones that would be scalarized if 6116 // we find that the scalar version costs less. 6117 SmallVector<Instruction *, 8> Worklist; 6118 6119 // Returns true if the given instruction can be scalarized. 6120 auto canBeScalarized = [&](Instruction *I) -> bool { 6121 // We only attempt to scalarize instructions forming a single-use chain 6122 // from the original predicated block that would otherwise be vectorized. 6123 // Although not strictly necessary, we give up on instructions we know will 6124 // already be scalar to avoid traversing chains that are unlikely to be 6125 // beneficial. 6126 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6127 isScalarAfterVectorization(I, VF)) 6128 return false; 6129 6130 // If the instruction is scalar with predication, it will be analyzed 6131 // separately. We ignore it within the context of PredInst. 6132 if (isScalarWithPredication(I, VF)) 6133 return false; 6134 6135 // If any of the instruction's operands are uniform after vectorization, 6136 // the instruction cannot be scalarized. This prevents, for example, a 6137 // masked load from being scalarized. 6138 // 6139 // We assume we will only emit a value for lane zero of an instruction 6140 // marked uniform after vectorization, rather than VF identical values. 6141 // Thus, if we scalarize an instruction that uses a uniform, we would 6142 // create uses of values corresponding to the lanes we aren't emitting code 6143 // for. This behavior can be changed by allowing getScalarValue to clone 6144 // the lane zero values for uniforms rather than asserting. 6145 for (Use &U : I->operands()) 6146 if (auto *J = dyn_cast<Instruction>(U.get())) 6147 if (isUniformAfterVectorization(J, VF)) 6148 return false; 6149 6150 // Otherwise, we can scalarize the instruction. 6151 return true; 6152 }; 6153 6154 // Compute the expected cost discount from scalarizing the entire expression 6155 // feeding the predicated instruction. We currently only consider expressions 6156 // that are single-use instruction chains. 6157 Worklist.push_back(PredInst); 6158 while (!Worklist.empty()) { 6159 Instruction *I = Worklist.pop_back_val(); 6160 6161 // If we've already analyzed the instruction, there's nothing to do. 6162 if (ScalarCosts.find(I) != ScalarCosts.end()) 6163 continue; 6164 6165 // Compute the cost of the vector instruction. Note that this cost already 6166 // includes the scalarization overhead of the predicated instruction. 6167 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6168 6169 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6170 // the instruction as if it wasn't if-converted and instead remained in the 6171 // predicated block. We will scale this cost by block probability after 6172 // computing the scalarization overhead. 6173 InstructionCost ScalarCost = 6174 VF.getFixedValue() * 6175 getInstructionCost(I, ElementCount::getFixed(1)).first; 6176 6177 // Compute the scalarization overhead of needed insertelement instructions 6178 // and phi nodes. 6179 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6180 ScalarCost += TTI.getScalarizationOverhead( 6181 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6182 APInt::getAllOnes(VF.getFixedValue()), true, false); 6183 ScalarCost += 6184 VF.getFixedValue() * 6185 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6186 } 6187 6188 // Compute the scalarization overhead of needed extractelement 6189 // instructions. For each of the instruction's operands, if the operand can 6190 // be scalarized, add it to the worklist; otherwise, account for the 6191 // overhead. 6192 for (Use &U : I->operands()) 6193 if (auto *J = dyn_cast<Instruction>(U.get())) { 6194 assert(VectorType::isValidElementType(J->getType()) && 6195 "Instruction has non-scalar type"); 6196 if (canBeScalarized(J)) 6197 Worklist.push_back(J); 6198 else if (needsExtract(J, VF)) { 6199 ScalarCost += TTI.getScalarizationOverhead( 6200 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6201 APInt::getAllOnes(VF.getFixedValue()), false, true); 6202 } 6203 } 6204 6205 // Scale the total scalar cost by block probability. 6206 ScalarCost /= getReciprocalPredBlockProb(); 6207 6208 // Compute the discount. A non-negative discount means the vector version 6209 // of the instruction costs more, and scalarizing would be beneficial. 6210 Discount += VectorCost - ScalarCost; 6211 ScalarCosts[I] = ScalarCost; 6212 } 6213 6214 return *Discount.getValue(); 6215 } 6216 6217 LoopVectorizationCostModel::VectorizationCostTy 6218 LoopVectorizationCostModel::expectedCost( 6219 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6220 VectorizationCostTy Cost; 6221 6222 // For each block. 6223 for (BasicBlock *BB : TheLoop->blocks()) { 6224 VectorizationCostTy BlockCost; 6225 6226 // For each instruction in the old loop. 6227 for (Instruction &I : BB->instructionsWithoutDebug()) { 6228 // Skip ignored values. 6229 if (ValuesToIgnore.count(&I) || 6230 (VF.isVector() && VecValuesToIgnore.count(&I))) 6231 continue; 6232 6233 VectorizationCostTy C = getInstructionCost(&I, VF); 6234 6235 // Check if we should override the cost. 6236 if (C.first.isValid() && 6237 ForceTargetInstructionCost.getNumOccurrences() > 0) 6238 C.first = InstructionCost(ForceTargetInstructionCost); 6239 6240 // Keep a list of instructions with invalid costs. 6241 if (Invalid && !C.first.isValid()) 6242 Invalid->emplace_back(&I, VF); 6243 6244 BlockCost.first += C.first; 6245 BlockCost.second |= C.second; 6246 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6247 << " for VF " << VF << " For instruction: " << I 6248 << '\n'); 6249 } 6250 6251 // If we are vectorizing a predicated block, it will have been 6252 // if-converted. This means that the block's instructions (aside from 6253 // stores and instructions that may divide by zero) will now be 6254 // unconditionally executed. For the scalar case, we may not always execute 6255 // the predicated block, if it is an if-else block. Thus, scale the block's 6256 // cost by the probability of executing it. 
blockNeedsPredication from 6257 // Legal is used so as to not include all blocks in tail folded loops. 6258 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6259 BlockCost.first /= getReciprocalPredBlockProb(); 6260 6261 Cost.first += BlockCost.first; 6262 Cost.second |= BlockCost.second; 6263 } 6264 6265 return Cost; 6266 } 6267 6268 /// Gets Address Access SCEV after verifying that the access pattern 6269 /// is loop invariant except the induction variable dependence. 6270 /// 6271 /// This SCEV can be sent to the Target in order to estimate the address 6272 /// calculation cost. 6273 static const SCEV *getAddressAccessSCEV( 6274 Value *Ptr, 6275 LoopVectorizationLegality *Legal, 6276 PredicatedScalarEvolution &PSE, 6277 const Loop *TheLoop) { 6278 6279 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6280 if (!Gep) 6281 return nullptr; 6282 6283 // We are looking for a gep with all loop invariant indices except for one 6284 // which should be an induction variable. 6285 auto SE = PSE.getSE(); 6286 unsigned NumOperands = Gep->getNumOperands(); 6287 for (unsigned i = 1; i < NumOperands; ++i) { 6288 Value *Opd = Gep->getOperand(i); 6289 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6290 !Legal->isInductionVariable(Opd)) 6291 return nullptr; 6292 } 6293 6294 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6295 return PSE.getSCEV(Ptr); 6296 } 6297 6298 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6299 return Legal->hasStride(I->getOperand(0)) || 6300 Legal->hasStride(I->getOperand(1)); 6301 } 6302 6303 InstructionCost 6304 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6305 ElementCount VF) { 6306 assert(VF.isVector() && 6307 "Scalarization cost of instruction implies vectorization."); 6308 if (VF.isScalable()) 6309 return InstructionCost::getInvalid(); 6310 6311 Type *ValTy = getLoadStoreType(I); 6312 auto SE = PSE.getSE(); 6313 6314 unsigned AS = getLoadStoreAddressSpace(I); 6315 Value *Ptr = getLoadStorePointerOperand(I); 6316 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6317 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6318 // that it is being called from this specific place. 6319 6320 // Figure out whether the access is strided and get the stride value 6321 // if it's known in compile time 6322 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6323 6324 // Get the cost of the scalar memory instruction and address computation. 6325 InstructionCost Cost = 6326 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6327 6328 // Don't pass *I here, since it is scalar but will actually be part of a 6329 // vectorized loop where the user of it is a vectorized instruction. 6330 const Align Alignment = getLoadStoreAlignment(I); 6331 Cost += VF.getKnownMinValue() * 6332 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6333 AS, TTI::TCK_RecipThroughput); 6334 6335 // Get the overhead of the extractelement and insertelement instructions 6336 // we might create due to scalarization. 6337 Cost += getScalarizationOverhead(I, VF); 6338 6339 // If we have a predicated load/store, it will need extra i1 extracts and 6340 // conditional branches, but may not be executed for each vector lane. Scale 6341 // the cost by the probability of executing the predicated block. 
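  // Illustration: assuming the usual model that a predicated block executes
  // about half the time, the division below roughly halves the scalarized
  // memory cost accumulated above; the i1 mask extracts and the extra branch
  // cost are then added on top of that scaled value.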
6342 if (isPredicatedInst(I, VF)) { 6343 Cost /= getReciprocalPredBlockProb(); 6344 6345 // Add the cost of an i1 extract and a branch 6346 auto *Vec_i1Ty = 6347 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6348 Cost += TTI.getScalarizationOverhead( 6349 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6350 /*Insert=*/false, /*Extract=*/true); 6351 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6352 6353 if (useEmulatedMaskMemRefHack(I, VF)) 6354 // Artificially setting to a high enough value to practically disable 6355 // vectorization with such operations. 6356 Cost = 3000000; 6357 } 6358 6359 return Cost; 6360 } 6361 6362 InstructionCost 6363 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6364 ElementCount VF) { 6365 Type *ValTy = getLoadStoreType(I); 6366 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6367 Value *Ptr = getLoadStorePointerOperand(I); 6368 unsigned AS = getLoadStoreAddressSpace(I); 6369 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6370 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6371 6372 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6373 "Stride should be 1 or -1 for consecutive memory access"); 6374 const Align Alignment = getLoadStoreAlignment(I); 6375 InstructionCost Cost = 0; 6376 if (Legal->isMaskRequired(I)) 6377 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6378 CostKind); 6379 else 6380 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6381 CostKind, I); 6382 6383 bool Reverse = ConsecutiveStride < 0; 6384 if (Reverse) 6385 Cost += 6386 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6387 return Cost; 6388 } 6389 6390 InstructionCost 6391 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6392 ElementCount VF) { 6393 assert(Legal->isUniformMemOp(*I)); 6394 6395 Type *ValTy = getLoadStoreType(I); 6396 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6397 const Align Alignment = getLoadStoreAlignment(I); 6398 unsigned AS = getLoadStoreAddressSpace(I); 6399 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6400 if (isa<LoadInst>(I)) { 6401 return TTI.getAddressComputationCost(ValTy) + 6402 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6403 CostKind) + 6404 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6405 } 6406 StoreInst *SI = cast<StoreInst>(I); 6407 6408 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6409 return TTI.getAddressComputationCost(ValTy) + 6410 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6411 CostKind) + 6412 (isLoopInvariantStoreValue 6413 ? 
0
6414 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6415 VF.getKnownMinValue() - 1));
6416 }
6417
6418 InstructionCost
6419 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6420 ElementCount VF) {
6421 Type *ValTy = getLoadStoreType(I);
6422 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6423 const Align Alignment = getLoadStoreAlignment(I);
6424 const Value *Ptr = getLoadStorePointerOperand(I);
6425
6426 return TTI.getAddressComputationCost(VectorTy) +
6427 TTI.getGatherScatterOpCost(
6428 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6429 TargetTransformInfo::TCK_RecipThroughput, I);
6430 }
6431
6432 InstructionCost
6433 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6434 ElementCount VF) {
6435 // TODO: Once we have support for interleaving with scalable vectors
6436 // we can calculate the cost properly here.
6437 if (VF.isScalable())
6438 return InstructionCost::getInvalid();
6439
6440 Type *ValTy = getLoadStoreType(I);
6441 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6442 unsigned AS = getLoadStoreAddressSpace(I);
6443
6444 auto Group = getInterleavedAccessGroup(I);
6445 assert(Group && "Fail to get an interleaved access group.");
6446
6447 unsigned InterleaveFactor = Group->getFactor();
6448 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6449
6450 // Holds the indices of existing members in the interleaved group.
6451 SmallVector<unsigned, 4> Indices;
6452 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6453 if (Group->getMember(IF))
6454 Indices.push_back(IF);
6455
6456 // Calculate the cost of the whole interleaved group.
6457 bool UseMaskForGaps =
6458 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6459 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6460 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6461 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6462 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6463
6464 if (Group->isReverse()) {
6465 // TODO: Add support for reversed masked interleaved access.
6466 assert(!Legal->isMaskRequired(I) &&
6467 "Reverse masked interleaved access not supported.");
6468 Cost +=
6469 Group->getNumMembers() *
6470 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6471 }
6472 return Cost;
6473 }
6474
6475 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
6476 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6477 using namespace llvm::PatternMatch;
6478 // Early exit for no in-loop reductions.
6479 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6480 return None;
6481 auto *VectorTy = cast<VectorType>(Ty);
6482
6483 // We are looking for one of the following patterns, and for the minimal acceptable cost:
6484 // reduce(mul(ext(A), ext(B))) or
6485 // reduce(mul(A, B)) or
6486 // reduce(ext(A)) or
6487 // reduce(A).
6488 // The basic idea is that we walk down the tree to do that, finding the root
6489 // reduction instruction in InLoopReductionImmediateChains. From there we find
6490 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6491 // of the components. If the reduction cost is lower, we return it for the
6492 // reduction instruction and 0 for the other instructions in the pattern. If
6493 // it is not, we return an invalid cost specifying that the original cost method
6494 // should be used.
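  // For illustration, a hypothetical scalar input pattern that the walk below
  // recognizes as reduce(mul(ext(A), ext(B))):
  //   %a.ext = sext i8 %a to i32
  //   %b.ext = sext i8 %b to i32
  //   %mul   = mul i32 %a.ext, %b.ext
  //   %sum   = add i32 %sum.phi, %mul   ; root of the in-loop reduction chain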
6495 Instruction *RetI = I; 6496 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 6497 if (!RetI->hasOneUser()) 6498 return None; 6499 RetI = RetI->user_back(); 6500 } 6501 if (match(RetI, m_Mul(m_Value(), m_Value())) && 6502 RetI->user_back()->getOpcode() == Instruction::Add) { 6503 if (!RetI->hasOneUser()) 6504 return None; 6505 RetI = RetI->user_back(); 6506 } 6507 6508 // Test if the found instruction is a reduction, and if not return an invalid 6509 // cost specifying the parent to use the original cost modelling. 6510 if (!InLoopReductionImmediateChains.count(RetI)) 6511 return None; 6512 6513 // Find the reduction this chain is a part of and calculate the basic cost of 6514 // the reduction on its own. 6515 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6516 Instruction *ReductionPhi = LastChain; 6517 while (!isa<PHINode>(ReductionPhi)) 6518 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6519 6520 const RecurrenceDescriptor &RdxDesc = 6521 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6522 6523 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6524 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6525 6526 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6527 // normal fmul instruction to the cost of the fadd reduction. 6528 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6529 BaseCost += 6530 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6531 6532 // If we're using ordered reductions then we can just return the base cost 6533 // here, since getArithmeticReductionCost calculates the full ordered 6534 // reduction cost when FP reassociation is not allowed. 6535 if (useOrderedReductions(RdxDesc)) 6536 return BaseCost; 6537 6538 // Get the operand that was not the reduction chain and match it to one of the 6539 // patterns, returning the better cost if it is found. 6540 Instruction *RedOp = RetI->getOperand(1) == LastChain 6541 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6542 : dyn_cast<Instruction>(RetI->getOperand(1)); 6543 6544 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6545 6546 Instruction *Op0, *Op1; 6547 if (RedOp && 6548 match(RedOp, 6549 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6550 match(Op0, m_ZExtOrSExt(m_Value())) && 6551 Op0->getOpcode() == Op1->getOpcode() && 6552 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6553 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6554 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6555 6556 // Matched reduce(ext(mul(ext(A), ext(B))) 6557 // Note that the extend opcodes need to all match, or if A==B they will have 6558 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6559 // which is equally fine. 
6560 bool IsUnsigned = isa<ZExtInst>(Op0); 6561 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6562 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6563 6564 InstructionCost ExtCost = 6565 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6566 TTI::CastContextHint::None, CostKind, Op0); 6567 InstructionCost MulCost = 6568 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6569 InstructionCost Ext2Cost = 6570 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6571 TTI::CastContextHint::None, CostKind, RedOp); 6572 6573 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6574 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6575 CostKind); 6576 6577 if (RedCost.isValid() && 6578 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6579 return I == RetI ? RedCost : 0; 6580 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6581 !TheLoop->isLoopInvariant(RedOp)) { 6582 // Matched reduce(ext(A)) 6583 bool IsUnsigned = isa<ZExtInst>(RedOp); 6584 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6585 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6586 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6587 CostKind); 6588 6589 InstructionCost ExtCost = 6590 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6591 TTI::CastContextHint::None, CostKind, RedOp); 6592 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6593 return I == RetI ? RedCost : 0; 6594 } else if (RedOp && 6595 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6596 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6597 Op0->getOpcode() == Op1->getOpcode() && 6598 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6599 bool IsUnsigned = isa<ZExtInst>(Op0); 6600 Type *Op0Ty = Op0->getOperand(0)->getType(); 6601 Type *Op1Ty = Op1->getOperand(0)->getType(); 6602 Type *LargestOpTy = 6603 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6604 : Op0Ty; 6605 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6606 6607 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 6608 // different sizes. We take the largest type as the ext to reduce, and add 6609 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6610 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6611 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6612 TTI::CastContextHint::None, CostKind, Op0); 6613 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6614 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6615 TTI::CastContextHint::None, CostKind, Op1); 6616 InstructionCost MulCost = 6617 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6618 6619 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6620 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6621 CostKind); 6622 InstructionCost ExtraExtCost = 0; 6623 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6624 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6625 ExtraExtCost = TTI.getCastInstrCost( 6626 ExtraExtOp->getOpcode(), ExtType, 6627 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6628 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6629 } 6630 6631 if (RedCost.isValid() && 6632 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6633 return I == RetI ? 
RedCost : 0; 6634 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6635 // Matched reduce(mul()) 6636 InstructionCost MulCost = 6637 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6638 6639 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6640 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 6641 CostKind); 6642 6643 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6644 return I == RetI ? RedCost : 0; 6645 } 6646 } 6647 6648 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 6649 } 6650 6651 InstructionCost 6652 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6653 ElementCount VF) { 6654 // Calculate scalar cost only. Vectorization cost should be ready at this 6655 // moment. 6656 if (VF.isScalar()) { 6657 Type *ValTy = getLoadStoreType(I); 6658 const Align Alignment = getLoadStoreAlignment(I); 6659 unsigned AS = getLoadStoreAddressSpace(I); 6660 6661 return TTI.getAddressComputationCost(ValTy) + 6662 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6663 TTI::TCK_RecipThroughput, I); 6664 } 6665 return getWideningCost(I, VF); 6666 } 6667 6668 LoopVectorizationCostModel::VectorizationCostTy 6669 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6670 ElementCount VF) { 6671 // If we know that this instruction will remain uniform, check the cost of 6672 // the scalar version. 6673 if (isUniformAfterVectorization(I, VF)) 6674 VF = ElementCount::getFixed(1); 6675 6676 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6677 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6678 6679 // Forced scalars do not have any scalarization overhead. 6680 auto ForcedScalar = ForcedScalars.find(VF); 6681 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6682 auto InstSet = ForcedScalar->second; 6683 if (InstSet.count(I)) 6684 return VectorizationCostTy( 6685 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6686 VF.getKnownMinValue()), 6687 false); 6688 } 6689 6690 Type *VectorTy; 6691 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6692 6693 bool TypeNotScalarized = false; 6694 if (VF.isVector() && VectorTy->isVectorTy()) { 6695 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) { 6696 if (VF.isScalable()) 6697 // <vscale x 1 x iN> is assumed to be profitable over iN because 6698 // scalable registers are a distinct register class from scalar ones. 6699 // If we ever find a target which wants to lower scalable vectors 6700 // back to scalars, we'll need to update this code to explicitly 6701 // ask TTI about the register class uses for each part. 6702 TypeNotScalarized = NumParts <= VF.getKnownMinValue(); 6703 else 6704 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 6705 } else 6706 C = InstructionCost::getInvalid(); 6707 } 6708 return VectorizationCostTy(C, TypeNotScalarized); 6709 } 6710 6711 InstructionCost 6712 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6713 ElementCount VF) const { 6714 6715 // There is no mechanism yet to create a scalable scalarization loop, 6716 // so this is currently Invalid. 
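  // For fixed VFs the overhead is modeled below; e.g. for VF = 4, TTI is
  // charged for inserting the four scalar results back into a vector and,
  // further down, for extracting any operands that have to be scalarized.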
6717 if (VF.isScalable()) 6718 return InstructionCost::getInvalid(); 6719 6720 if (VF.isScalar()) 6721 return 0; 6722 6723 InstructionCost Cost = 0; 6724 Type *RetTy = ToVectorTy(I->getType(), VF); 6725 if (!RetTy->isVoidTy() && 6726 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6727 Cost += TTI.getScalarizationOverhead( 6728 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 6729 false); 6730 6731 // Some targets keep addresses scalar. 6732 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6733 return Cost; 6734 6735 // Some targets support efficient element stores. 6736 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6737 return Cost; 6738 6739 // Collect operands to consider. 6740 CallInst *CI = dyn_cast<CallInst>(I); 6741 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 6742 6743 // Skip operands that do not require extraction/scalarization and do not incur 6744 // any overhead. 6745 SmallVector<Type *> Tys; 6746 for (auto *V : filterExtractingOperands(Ops, VF)) 6747 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 6748 return Cost + TTI.getOperandsScalarizationOverhead( 6749 filterExtractingOperands(Ops, VF), Tys); 6750 } 6751 6752 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6753 if (VF.isScalar()) 6754 return; 6755 NumPredStores = 0; 6756 for (BasicBlock *BB : TheLoop->blocks()) { 6757 // For each instruction in the old loop. 6758 for (Instruction &I : *BB) { 6759 Value *Ptr = getLoadStorePointerOperand(&I); 6760 if (!Ptr) 6761 continue; 6762 6763 // TODO: We should generate better code and update the cost model for 6764 // predicated uniform stores. Today they are treated as any other 6765 // predicated store (see added test cases in 6766 // invariant-store-vectorization.ll). 6767 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 6768 NumPredStores++; 6769 6770 if (Legal->isUniformMemOp(I)) { 6771 // TODO: Avoid replicating loads and stores instead of 6772 // relying on instcombine to remove them. 6773 // Load: Scalar load + broadcast 6774 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6775 InstructionCost Cost; 6776 if (isa<StoreInst>(&I) && VF.isScalable() && 6777 isLegalGatherOrScatter(&I, VF)) { 6778 Cost = getGatherScatterCost(&I, VF); 6779 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 6780 } else { 6781 Cost = getUniformMemOpCost(&I, VF); 6782 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6783 } 6784 continue; 6785 } 6786 6787 // We assume that widening is the best solution when possible. 6788 if (memoryInstructionCanBeWidened(&I, VF)) { 6789 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6790 int ConsecutiveStride = Legal->isConsecutivePtr( 6791 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 6792 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6793 "Expected consecutive stride."); 6794 InstWidening Decision = 6795 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6796 setWideningDecision(&I, VF, Decision, Cost); 6797 continue; 6798 } 6799 6800 // Choose between Interleaving, Gather/Scatter or Scalarization. 6801 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 6802 unsigned NumAccesses = 1; 6803 if (isAccessInterleaved(&I)) { 6804 auto Group = getInterleavedAccessGroup(&I); 6805 assert(Group && "Fail to get an interleaved access group."); 6806 6807 // Make one decision for the whole group. 
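  // A decision may already have been recorded via another member of this
  // group (the whole group is costed and assigned a decision at once further
  // down), in which case there is nothing left to do for this instruction.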
6808 if (getWideningDecision(&I, VF) != CM_Unknown)
6809 continue;
6810
6811 NumAccesses = Group->getNumMembers();
6812 if (interleavedAccessCanBeWidened(&I, VF))
6813 InterleaveCost = getInterleaveGroupCost(&I, VF);
6814 }
6815
6816 InstructionCost GatherScatterCost =
6817 isLegalGatherOrScatter(&I, VF)
6818 ? getGatherScatterCost(&I, VF) * NumAccesses
6819 : InstructionCost::getInvalid();
6820
6821 InstructionCost ScalarizationCost =
6822 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6823
6824 // Choose the better solution for the current VF, record this decision and
6825 // use it during vectorization.
6826 InstructionCost Cost;
6827 InstWidening Decision;
6828 if (InterleaveCost <= GatherScatterCost &&
6829 InterleaveCost < ScalarizationCost) {
6830 Decision = CM_Interleave;
6831 Cost = InterleaveCost;
6832 } else if (GatherScatterCost < ScalarizationCost) {
6833 Decision = CM_GatherScatter;
6834 Cost = GatherScatterCost;
6835 } else {
6836 Decision = CM_Scalarize;
6837 Cost = ScalarizationCost;
6838 }
6839 // If the instruction belongs to an interleave group, the whole group
6840 // receives the same decision. The whole group also receives the cost, but
6841 // the cost will actually be assigned to one instruction.
6842 if (auto Group = getInterleavedAccessGroup(&I))
6843 setWideningDecision(Group, VF, Decision, Cost);
6844 else
6845 setWideningDecision(&I, VF, Decision, Cost);
6846 }
6847 }
6848
6849 // Make sure that any load of an address and any other address computation
6850 // remains scalar unless there is gather/scatter support. This avoids
6851 // inevitable extracts into address registers, and also has the benefit of
6852 // activating LSR more, since that pass can't optimize vectorized
6853 // addresses.
6854 if (TTI.prefersVectorizedAddressing())
6855 return;
6856
6857 // Start with all scalar pointer uses.
6858 SmallPtrSet<Instruction *, 8> AddrDefs;
6859 for (BasicBlock *BB : TheLoop->blocks())
6860 for (Instruction &I : *BB) {
6861 Instruction *PtrDef =
6862 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6863 if (PtrDef && TheLoop->contains(PtrDef) &&
6864 getWideningDecision(&I, VF) != CM_GatherScatter)
6865 AddrDefs.insert(PtrDef);
6866 }
6867
6868 // Add all instructions used to generate the addresses.
6869 SmallVector<Instruction *, 4> Worklist;
6870 append_range(Worklist, AddrDefs);
6871 while (!Worklist.empty()) {
6872 Instruction *I = Worklist.pop_back_val();
6873 for (auto &Op : I->operands())
6874 if (auto *InstOp = dyn_cast<Instruction>(Op))
6875 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6876 AddrDefs.insert(InstOp).second)
6877 Worklist.push_back(InstOp);
6878 }
6879
6880 for (auto *I : AddrDefs) {
6881 if (isa<LoadInst>(I)) {
6882 // Setting the desired widening decision should ideally be handled
6883 // by cost functions, but since this involves the task of finding out
6884 // if the loaded register is involved in an address computation, it is
6885 // instead changed here when we know this is the case.
6886 InstWidening Decision = getWideningDecision(I, VF);
6887 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6888 // Scalarize a widened load of an address.
6889 setWideningDecision(
6890 I, VF, CM_Scalarize,
6891 (VF.getKnownMinValue() *
6892 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6893 else if (auto Group = getInterleavedAccessGroup(I)) {
6894 // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(
                Member, VF, CM_Scalarize,
                (VF.getKnownMinValue() *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}

InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
                                               Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  auto SE = PSE.getSE();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
                                                ElementCount VF) -> bool {
    if (VF.isScalar())
      return true;

    auto Scalarized = InstsToScalarize.find(VF);
    assert(Scalarized != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return !Scalarized->second.count(I) &&
           llvm::all_of(I->users(), [&](User *U) {
             auto *UI = cast<Instruction>(U);
             return !Scalarized->second.count(UI);
           });
  };
  (void) hasSingleCopyAfterVectorization;

  if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // it means we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           hasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = ToVectorTy(RetTy, VF);

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF.isVector() && BI->isConditional() &&
        (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
         PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // It is not possible to scalarize a scalable vector with predicated
      // instructions.
      if (VF.isScalable())
        return InstructionCost::getInvalid();
      // Return cost for branches around scalarized and predicated blocks.
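      // Roughly, the vectorized code extracts the corresponding i1 mask
      // element for each lane and branches around the scalarized block, so
      // the cost below is the extraction overhead on the i1 vector plus VF
      // scalar branch costs.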
6974 auto *Vec_i1Ty = 6975 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6976 return ( 6977 TTI.getScalarizationOverhead( 6978 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + 6979 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 6980 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 6981 // The back-edge branch will remain, as will all scalar branches. 6982 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6983 else 6984 // This branch will be eliminated by if-conversion. 6985 return 0; 6986 // Note: We currently assume zero cost for an unconditional branch inside 6987 // a predicated block since it will become a fall-through, although we 6988 // may decide in the future to call TTI for all branches. 6989 } 6990 case Instruction::PHI: { 6991 auto *Phi = cast<PHINode>(I); 6992 6993 // First-order recurrences are replaced by vector shuffles inside the loop. 6994 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6995 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 6996 return TTI.getShuffleCost( 6997 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 6998 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 6999 7000 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7001 // converted into select instructions. We require N - 1 selects per phi 7002 // node, where N is the number of incoming values. 7003 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7004 return (Phi->getNumIncomingValues() - 1) * 7005 TTI.getCmpSelInstrCost( 7006 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7007 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7008 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7009 7010 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7011 } 7012 case Instruction::UDiv: 7013 case Instruction::SDiv: 7014 case Instruction::URem: 7015 case Instruction::SRem: 7016 // If we have a predicated instruction, it may not be executed for each 7017 // vector lane. Get the scalarization cost and scale this amount by the 7018 // probability of executing the predicated block. If the instruction is not 7019 // predicated, we fall through to the next case. 7020 if (VF.isVector() && isScalarWithPredication(I, VF)) { 7021 InstructionCost Cost = 0; 7022 7023 // These instructions have a non-void type, so account for the phi nodes 7024 // that we will create. This cost is likely to be zero. The phi node 7025 // cost, if any, should be scaled by the block probability because it 7026 // models a copy at the end of each predicated block. 7027 Cost += VF.getKnownMinValue() * 7028 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7029 7030 // The cost of the non-predicated instruction. 7031 Cost += VF.getKnownMinValue() * 7032 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7033 7034 // The cost of insertelement and extractelement instructions needed for 7035 // scalarization. 7036 Cost += getScalarizationOverhead(I, VF); 7037 7038 // Scale the cost by the probability of executing the predicated blocks. 7039 // This assumes the predicated block for each vector lane is equally 7040 // likely. 
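      // For instance, assuming the default reciprocal probability of 2 (each
      // predicated block is expected to execute half the time), a combined
      // phi + instruction + scalarization cost of 12 is accounted as 6.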
7041 return Cost / getReciprocalPredBlockProb(); 7042 } 7043 LLVM_FALLTHROUGH; 7044 case Instruction::Add: 7045 case Instruction::FAdd: 7046 case Instruction::Sub: 7047 case Instruction::FSub: 7048 case Instruction::Mul: 7049 case Instruction::FMul: 7050 case Instruction::FDiv: 7051 case Instruction::FRem: 7052 case Instruction::Shl: 7053 case Instruction::LShr: 7054 case Instruction::AShr: 7055 case Instruction::And: 7056 case Instruction::Or: 7057 case Instruction::Xor: { 7058 // Since we will replace the stride by 1 the multiplication should go away. 7059 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7060 return 0; 7061 7062 // Detect reduction patterns 7063 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7064 return *RedCost; 7065 7066 // Certain instructions can be cheaper to vectorize if they have a constant 7067 // second vector operand. One example of this are shifts on x86. 7068 Value *Op2 = I->getOperand(1); 7069 TargetTransformInfo::OperandValueProperties Op2VP; 7070 TargetTransformInfo::OperandValueKind Op2VK = 7071 TTI.getOperandInfo(Op2, Op2VP); 7072 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7073 Op2VK = TargetTransformInfo::OK_UniformValue; 7074 7075 SmallVector<const Value *, 4> Operands(I->operand_values()); 7076 return TTI.getArithmeticInstrCost( 7077 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7078 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7079 } 7080 case Instruction::FNeg: { 7081 return TTI.getArithmeticInstrCost( 7082 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7083 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7084 TargetTransformInfo::OP_None, I->getOperand(0), I); 7085 } 7086 case Instruction::Select: { 7087 SelectInst *SI = cast<SelectInst>(I); 7088 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7089 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7090 7091 const Value *Op0, *Op1; 7092 using namespace llvm::PatternMatch; 7093 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7094 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7095 // select x, y, false --> x & y 7096 // select x, true, y --> x | y 7097 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7098 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7099 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7100 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7101 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7102 Op1->getType()->getScalarSizeInBits() == 1); 7103 7104 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7105 return TTI.getArithmeticInstrCost( 7106 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7107 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7108 } 7109 7110 Type *CondTy = SI->getCondition()->getType(); 7111 if (!ScalarCond) 7112 CondTy = VectorType::get(CondTy, VF); 7113 7114 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7115 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7116 Pred = Cmp->getPredicate(); 7117 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7118 CostKind, I); 7119 } 7120 case Instruction::ICmp: 7121 case Instruction::FCmp: { 7122 Type *ValTy = I->getOperand(0)->getType(); 7123 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7124 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7125 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7126 VectorTy = ToVectorTy(ValTy, VF); 7127 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7128 cast<CmpInst>(I)->getPredicate(), CostKind, 7129 I); 7130 } 7131 case Instruction::Store: 7132 case Instruction::Load: { 7133 ElementCount Width = VF; 7134 if (Width.isVector()) { 7135 InstWidening Decision = getWideningDecision(I, Width); 7136 assert(Decision != CM_Unknown && 7137 "CM decision should be taken at this point"); 7138 if (Decision == CM_Scalarize) { 7139 if (VF.isScalable() && isa<StoreInst>(I)) 7140 // We can't scalarize a scalable vector store (even a uniform one 7141 // currently), return an invalid cost so as to prevent vectorization. 7142 return InstructionCost::getInvalid(); 7143 Width = ElementCount::getFixed(1); 7144 } 7145 } 7146 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7147 return getMemoryInstructionCost(I, VF); 7148 } 7149 case Instruction::BitCast: 7150 if (I->getType()->isPointerTy()) 7151 return 0; 7152 LLVM_FALLTHROUGH; 7153 case Instruction::ZExt: 7154 case Instruction::SExt: 7155 case Instruction::FPToUI: 7156 case Instruction::FPToSI: 7157 case Instruction::FPExt: 7158 case Instruction::PtrToInt: 7159 case Instruction::IntToPtr: 7160 case Instruction::SIToFP: 7161 case Instruction::UIToFP: 7162 case Instruction::Trunc: 7163 case Instruction::FPTrunc: { 7164 // Computes the CastContextHint from a Load/Store instruction. 7165 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7166 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7167 "Expected a load or a store!"); 7168 7169 if (VF.isScalar() || !TheLoop->contains(I)) 7170 return TTI::CastContextHint::Normal; 7171 7172 switch (getWideningDecision(I, VF)) { 7173 case LoopVectorizationCostModel::CM_GatherScatter: 7174 return TTI::CastContextHint::GatherScatter; 7175 case LoopVectorizationCostModel::CM_Interleave: 7176 return TTI::CastContextHint::Interleave; 7177 case LoopVectorizationCostModel::CM_Scalarize: 7178 case LoopVectorizationCostModel::CM_Widen: 7179 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7180 : TTI::CastContextHint::Normal; 7181 case LoopVectorizationCostModel::CM_Widen_Reverse: 7182 return TTI::CastContextHint::Reversed; 7183 case LoopVectorizationCostModel::CM_Unknown: 7184 llvm_unreachable("Instr did not go through cost modelling?"); 7185 } 7186 7187 llvm_unreachable("Unhandled case!"); 7188 }; 7189 7190 unsigned Opcode = I->getOpcode(); 7191 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7192 // For Trunc, the context is the only user, which must be a StoreInst. 
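    // As a schematic illustration (types picked arbitrarily), in IR like
    //   %t = trunc i32 %v to i8
    //   store i8 %t, i8* %p
    // the store is the trunc's only user, so the cast is costed in the
    // context of that memory access (normal, masked, reversed or
    // gather/scatter, depending on the widening decision made for it).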
7193 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7194 if (I->hasOneUse()) 7195 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7196 CCH = ComputeCCH(Store); 7197 } 7198 // For Z/Sext, the context is the operand, which must be a LoadInst. 7199 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7200 Opcode == Instruction::FPExt) { 7201 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7202 CCH = ComputeCCH(Load); 7203 } 7204 7205 // We optimize the truncation of induction variables having constant 7206 // integer steps. The cost of these truncations is the same as the scalar 7207 // operation. 7208 if (isOptimizableIVTruncate(I, VF)) { 7209 auto *Trunc = cast<TruncInst>(I); 7210 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7211 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7212 } 7213 7214 // Detect reduction patterns 7215 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7216 return *RedCost; 7217 7218 Type *SrcScalarTy = I->getOperand(0)->getType(); 7219 Type *SrcVecTy = 7220 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7221 if (canTruncateToMinimalBitwidth(I, VF)) { 7222 // This cast is going to be shrunk. This may remove the cast or it might 7223 // turn it into slightly different cast. For example, if MinBW == 16, 7224 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7225 // 7226 // Calculate the modified src and dest types. 7227 Type *MinVecTy = VectorTy; 7228 if (Opcode == Instruction::Trunc) { 7229 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7230 VectorTy = 7231 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7232 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7233 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7234 VectorTy = 7235 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7236 } 7237 } 7238 7239 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7240 } 7241 case Instruction::Call: { 7242 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7243 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7244 return *RedCost; 7245 bool NeedToScalarize; 7246 CallInst *CI = cast<CallInst>(I); 7247 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7248 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7249 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7250 return std::min(CallCost, IntrinsicCost); 7251 } 7252 return CallCost; 7253 } 7254 case Instruction::ExtractValue: 7255 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7256 case Instruction::Alloca: 7257 // We cannot easily widen alloca to a scalable alloca, as 7258 // the result would need to be a vector of pointers. 7259 if (VF.isScalable()) 7260 return InstructionCost::getInvalid(); 7261 LLVM_FALLTHROUGH; 7262 default: 7263 // This opcode is unknown. Assume that it is the same as 'mul'. 7264 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7265 } // end of switch. 
7266 } 7267 7268 char LoopVectorize::ID = 0; 7269 7270 static const char lv_name[] = "Loop Vectorization"; 7271 7272 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7273 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7274 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7275 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7276 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7277 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7278 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7279 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7280 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7281 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7282 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7283 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7284 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7285 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7286 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7287 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7288 7289 namespace llvm { 7290 7291 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7292 7293 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7294 bool VectorizeOnlyWhenForced) { 7295 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7296 } 7297 7298 } // end namespace llvm 7299 7300 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7301 // Check if the pointer operand of a load or store instruction is 7302 // consecutive. 7303 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7304 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7305 return false; 7306 } 7307 7308 void LoopVectorizationCostModel::collectValuesToIgnore() { 7309 // Ignore ephemeral values. 7310 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7311 7312 // Find all stores to invariant variables. Since they are going to sink 7313 // outside the loop we do not need calculate cost for them. 7314 for (BasicBlock *BB : TheLoop->blocks()) 7315 for (Instruction &I : *BB) { 7316 StoreInst *SI; 7317 if ((SI = dyn_cast<StoreInst>(&I)) && 7318 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 7319 ValuesToIgnore.insert(&I); 7320 } 7321 7322 // Ignore type-promoting instructions we identified during reduction 7323 // detection. 7324 for (auto &Reduction : Legal->getReductionVars()) { 7325 const RecurrenceDescriptor &RedDes = Reduction.second; 7326 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7327 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7328 } 7329 // Ignore type-casting instructions we identified during induction 7330 // detection. 7331 for (auto &Induction : Legal->getInductionVars()) { 7332 const InductionDescriptor &IndDes = Induction.second; 7333 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7334 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7335 } 7336 } 7337 7338 void LoopVectorizationCostModel::collectInLoopReductions() { 7339 for (auto &Reduction : Legal->getReductionVars()) { 7340 PHINode *Phi = Reduction.first; 7341 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7342 7343 // We don't collect reductions that are type promoted (yet). 7344 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7345 continue; 7346 7347 // If the target would prefer this reduction to happen "in-loop", then we 7348 // want to record it as such. 
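    // Conceptually, an ordinary (out-of-loop) vectorized sum keeps a vector
    // accumulator and reduces it once after the loop:
    //   vec.acc = vec.acc + wide.load   ; in the loop
    //   sum     = reduce.add(vec.acc)   ; after the loop
    // while an in-loop reduction folds every vector down immediately:
    //   sum = sum + reduce.add(wide.load)   ; in the loop
    // Targets with cheap reduction instructions may prefer the latter, and
    // ordered floating-point reductions require it.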
7349 unsigned Opcode = RdxDesc.getOpcode(); 7350 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7351 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7352 TargetTransformInfo::ReductionFlags())) 7353 continue; 7354 7355 // Check that we can correctly put the reductions into the loop, by 7356 // finding the chain of operations that leads from the phi to the loop 7357 // exit value. 7358 SmallVector<Instruction *, 4> ReductionOperations = 7359 RdxDesc.getReductionOpChain(Phi, TheLoop); 7360 bool InLoop = !ReductionOperations.empty(); 7361 if (InLoop) { 7362 InLoopReductionChains[Phi] = ReductionOperations; 7363 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7364 Instruction *LastChain = Phi; 7365 for (auto *I : ReductionOperations) { 7366 InLoopReductionImmediateChains[I] = LastChain; 7367 LastChain = I; 7368 } 7369 } 7370 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7371 << " reduction for phi: " << *Phi << "\n"); 7372 } 7373 } 7374 7375 // TODO: we could return a pair of values that specify the max VF and 7376 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7377 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7378 // doesn't have a cost model that can choose which plan to execute if 7379 // more than one is generated. 7380 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7381 LoopVectorizationCostModel &CM) { 7382 unsigned WidestType; 7383 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7384 return WidestVectorRegBits / WidestType; 7385 } 7386 7387 VectorizationFactor 7388 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7389 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7390 ElementCount VF = UserVF; 7391 // Outer loop handling: They may require CFG and instruction level 7392 // transformations before even evaluating whether vectorization is profitable. 7393 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7394 // the vectorization pipeline. 7395 if (!OrigLoop->isInnermost()) { 7396 // If the user doesn't provide a vectorization factor, determine a 7397 // reasonable one. 7398 if (UserVF.isZero()) { 7399 VF = ElementCount::getFixed(determineVPlanVF( 7400 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7401 .getFixedSize(), 7402 CM)); 7403 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7404 7405 // Make sure we have a VF > 1 for stress testing. 7406 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7407 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7408 << "overriding computed VF.\n"); 7409 VF = ElementCount::getFixed(4); 7410 } 7411 } 7412 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7413 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7414 "VF needs to be a power of two"); 7415 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7416 << "VF " << VF << " to build VPlans.\n"); 7417 buildVPlans(VF, VF); 7418 7419 // For VPlan build stress testing, we bail out after VPlan construction. 7420 if (VPlanBuildStressTest) 7421 return VectorizationFactor::Disabled(); 7422 7423 return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; 7424 } 7425 7426 LLVM_DEBUG( 7427 dbgs() << "LV: Not vectorizing. 
Inner loops aren't supported in the " 7428 "VPlan-native path.\n"); 7429 return VectorizationFactor::Disabled(); 7430 } 7431 7432 Optional<VectorizationFactor> 7433 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7434 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7435 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7436 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7437 return None; 7438 7439 // Invalidate interleave groups if all blocks of loop will be predicated. 7440 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7441 !useMaskedInterleavedAccesses(*TTI)) { 7442 LLVM_DEBUG( 7443 dbgs() 7444 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7445 "which requires masked-interleaved support.\n"); 7446 if (CM.InterleaveInfo.invalidateGroups()) 7447 // Invalidating interleave groups also requires invalidating all decisions 7448 // based on them, which includes widening decisions and uniform and scalar 7449 // values. 7450 CM.invalidateCostModelingDecisions(); 7451 } 7452 7453 ElementCount MaxUserVF = 7454 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7455 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7456 if (!UserVF.isZero() && UserVFIsLegal) { 7457 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7458 "VF needs to be a power of two"); 7459 // Collect the instructions (and their associated costs) that will be more 7460 // profitable to scalarize. 7461 if (CM.selectUserVectorizationFactor(UserVF)) { 7462 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7463 CM.collectInLoopReductions(); 7464 buildVPlansWithVPRecipes(UserVF, UserVF); 7465 LLVM_DEBUG(printPlans(dbgs())); 7466 return {{UserVF, 0, 0}}; 7467 } else 7468 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7469 "InvalidCost", ORE, OrigLoop); 7470 } 7471 7472 // Populate the set of Vectorization Factor Candidates. 7473 ElementCountSet VFCandidates; 7474 for (auto VF = ElementCount::getFixed(1); 7475 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7476 VFCandidates.insert(VF); 7477 for (auto VF = ElementCount::getScalable(1); 7478 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7479 VFCandidates.insert(VF); 7480 7481 for (const auto &VF : VFCandidates) { 7482 // Collect Uniform and Scalar instructions after vectorization with VF. 7483 CM.collectUniformsAndScalars(VF); 7484 7485 // Collect the instructions (and their associated costs) that will be more 7486 // profitable to scalarize. 7487 if (VF.isVector()) 7488 CM.collectInstsToScalarize(VF); 7489 } 7490 7491 CM.collectInLoopReductions(); 7492 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7493 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7494 7495 LLVM_DEBUG(printPlans(dbgs())); 7496 if (!MaxFactors.hasVector()) 7497 return VectorizationFactor::Disabled(); 7498 7499 // Select the optimal vectorization factor. 
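  // For example, with MaxFactors.FixedVF == 8 and MaxFactors.ScalableVF ==
  // ElementCount::getScalable(4), the candidate set built above is
  // {1, 2, 4, 8, vscale x 1, vscale x 2, vscale x 4}; the call below returns
  // the most profitable of these, which may well be the scalar VF of 1.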
7500 VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates); 7501 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); 7502 return VF; 7503 } 7504 7505 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7506 assert(count_if(VPlans, 7507 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7508 1 && 7509 "Best VF has not a single VPlan."); 7510 7511 for (const VPlanPtr &Plan : VPlans) { 7512 if (Plan->hasVF(VF)) 7513 return *Plan.get(); 7514 } 7515 llvm_unreachable("No plan found!"); 7516 } 7517 7518 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7519 SmallVector<Metadata *, 4> MDs; 7520 // Reserve first location for self reference to the LoopID metadata node. 7521 MDs.push_back(nullptr); 7522 bool IsUnrollMetadata = false; 7523 MDNode *LoopID = L->getLoopID(); 7524 if (LoopID) { 7525 // First find existing loop unrolling disable metadata. 7526 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7527 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7528 if (MD) { 7529 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7530 IsUnrollMetadata = 7531 S && S->getString().startswith("llvm.loop.unroll.disable"); 7532 } 7533 MDs.push_back(LoopID->getOperand(i)); 7534 } 7535 } 7536 7537 if (!IsUnrollMetadata) { 7538 // Add runtime unroll disable metadata. 7539 LLVMContext &Context = L->getHeader()->getContext(); 7540 SmallVector<Metadata *, 1> DisableOperands; 7541 DisableOperands.push_back( 7542 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7543 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7544 MDs.push_back(DisableNode); 7545 MDNode *NewLoopID = MDNode::get(Context, MDs); 7546 // Set operand 0 to refer to the loop id itself. 7547 NewLoopID->replaceOperandWith(0, NewLoopID); 7548 L->setLoopID(NewLoopID); 7549 } 7550 } 7551 7552 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7553 VPlan &BestVPlan, 7554 InnerLoopVectorizer &ILV, 7555 DominatorTree *DT, 7556 bool IsEpilogueVectorization) { 7557 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7558 << '\n'); 7559 7560 // Perform the actual loop transformation. 7561 7562 // 1. Set up the skeleton for vectorization, including vector pre-header and 7563 // middle block. The vector loop is created during VPlan execution. 7564 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7565 Value *CanonicalIVStartValue; 7566 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7567 ILV.createVectorizedLoopSkeleton(); 7568 7569 // Only use noalias metadata when using memory checks guaranteeing no overlap 7570 // across all iterations. 7571 const LoopAccessInfo *LAI = ILV.Legal->getLAI(); 7572 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && 7573 !LAI->getRuntimePointerChecking()->getDiffChecks()) { 7574 7575 // We currently don't use LoopVersioning for the actual loop cloning but we 7576 // still use it to add the noalias metadata. 7577 // TODO: Find a better way to re-use LoopVersioning functionality to add 7578 // metadata. 
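    // The versioning helper attaches !alias.scope / !noalias metadata to the
    // vector loop's memory accesses, so that later passes can, for example,
    // reorder a load of A[i] past a store to B[i] once the runtime checks
    // above have proven the two address ranges disjoint.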
7579 State.LVer = std::make_unique<LoopVersioning>( 7580 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, 7581 PSE.getSE()); 7582 State.LVer->prepareNoAliasMetadata(); 7583 } 7584 7585 ILV.collectPoisonGeneratingRecipes(State); 7586 7587 ILV.printDebugTracesAtStart(); 7588 7589 //===------------------------------------------------===// 7590 // 7591 // Notice: any optimization or new instruction that go 7592 // into the code below should also be implemented in 7593 // the cost-model. 7594 // 7595 //===------------------------------------------------===// 7596 7597 // 2. Copy and widen instructions from the old loop into the new loop. 7598 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), 7599 ILV.getOrCreateVectorTripCount(nullptr), 7600 CanonicalIVStartValue, State, 7601 IsEpilogueVectorization); 7602 7603 BestVPlan.execute(&State); 7604 7605 // Keep all loop hints from the original loop on the vector loop (we'll 7606 // replace the vectorizer-specific hints below). 7607 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7608 7609 Optional<MDNode *> VectorizedLoopID = 7610 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7611 LLVMLoopVectorizeFollowupVectorized}); 7612 7613 VPBasicBlock *HeaderVPBB = 7614 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); 7615 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); 7616 if (VectorizedLoopID) 7617 L->setLoopID(VectorizedLoopID.getValue()); 7618 else { 7619 // Keep all loop hints from the original loop on the vector loop (we'll 7620 // replace the vectorizer-specific hints below). 7621 if (MDNode *LID = OrigLoop->getLoopID()) 7622 L->setLoopID(LID); 7623 7624 LoopVectorizeHints Hints(L, true, *ORE); 7625 Hints.setAlreadyVectorized(); 7626 } 7627 // Disable runtime unrolling when vectorizing the epilogue loop. 7628 if (CanonicalIVStartValue) 7629 AddRuntimeUnrollDisableMetaData(L); 7630 7631 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7632 // predication, updating analyses. 7633 ILV.fixVectorizedLoop(State, BestVPlan); 7634 7635 ILV.printDebugTracesAtEnd(); 7636 } 7637 7638 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7639 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7640 for (const auto &Plan : VPlans) 7641 if (PrintVPlansInDotFormat) 7642 Plan->printDOT(O); 7643 else 7644 Plan->print(O); 7645 } 7646 #endif 7647 7648 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7649 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7650 7651 // We create new control-flow for the vectorized loop, so the original exit 7652 // conditions will be dead after vectorization if it's only used by the 7653 // terminator 7654 SmallVector<BasicBlock*> ExitingBlocks; 7655 OrigLoop->getExitingBlocks(ExitingBlocks); 7656 for (auto *BB : ExitingBlocks) { 7657 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 7658 if (!Cmp || !Cmp->hasOneUse()) 7659 continue; 7660 7661 // TODO: we should introduce a getUniqueExitingBlocks on Loop 7662 if (!DeadInstructions.insert(Cmp).second) 7663 continue; 7664 7665 // The operands of the icmp is often a dead trunc, used by IndUpdate. 7666 // TODO: can recurse through operands in general 7667 for (Value *Op : Cmp->operands()) { 7668 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7669 DeadInstructions.insert(cast<Instruction>(Op)); 7670 } 7671 } 7672 7673 // We create new "steps" for induction variable updates to which the original 7674 // induction variables map. 
An original update instruction will be dead if 7675 // all its users except the induction variable are dead. 7676 auto *Latch = OrigLoop->getLoopLatch(); 7677 for (auto &Induction : Legal->getInductionVars()) { 7678 PHINode *Ind = Induction.first; 7679 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7680 7681 // If the tail is to be folded by masking, the primary induction variable, 7682 // if exists, isn't dead: it will be used for masking. Don't kill it. 7683 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7684 continue; 7685 7686 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7687 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7688 })) 7689 DeadInstructions.insert(IndUpdate); 7690 } 7691 } 7692 7693 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7694 7695 //===--------------------------------------------------------------------===// 7696 // EpilogueVectorizerMainLoop 7697 //===--------------------------------------------------------------------===// 7698 7699 /// This function is partially responsible for generating the control flow 7700 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7701 std::pair<BasicBlock *, Value *> 7702 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7703 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7704 7705 // Workaround! Compute the trip count of the original loop and cache it 7706 // before we start modifying the CFG. This code has a systemic problem 7707 // wherein it tries to run analysis over partially constructed IR; this is 7708 // wrong, and not simply for SCEV. The trip count of the original loop 7709 // simply happens to be prone to hitting this in practice. In theory, we 7710 // can hit the same issue for any SCEV, or ValueTracking query done during 7711 // mutation. See PR49900. 7712 getOrCreateTripCount(OrigLoop->getLoopPreheader()); 7713 createVectorLoopSkeleton(""); 7714 7715 // Generate the code to check the minimum iteration count of the vector 7716 // epilogue (see below). 7717 EPI.EpilogueIterationCountCheck = 7718 emitIterationCountCheck(LoopScalarPreHeader, true); 7719 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7720 7721 // Generate the code to check any assumptions that we've made for SCEV 7722 // expressions. 7723 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7724 7725 // Generate the code that checks at runtime if arrays overlap. We put the 7726 // checks into a separate block to make the more common case of few elements 7727 // faster. 7728 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7729 7730 // Generate the iteration count check for the main loop, *after* the check 7731 // for the epilogue loop, so that the path-length is shorter for the case 7732 // that goes directly through the vector epilogue. The longer-path length for 7733 // the main loop is compensated for, by the gain from vectorizing the larger 7734 // trip count. Note: the branch will get updated later on when we vectorize 7735 // the epilogue. 7736 EPI.MainLoopIterationCountCheck = 7737 emitIterationCountCheck(LoopScalarPreHeader, false); 7738 7739 // Generate the induction variable. 7740 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7741 7742 // Skip induction resume value creation here because they will be created in 7743 // the second pass. 
If we created them here, they wouldn't be used anyway, 7744 // because the vplan in the second pass still contains the inductions from the 7745 // original loop. 7746 7747 return {completeLoopSkeleton(OrigLoopID), nullptr}; 7748 } 7749 7750 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7751 LLVM_DEBUG({ 7752 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7753 << "Main Loop VF:" << EPI.MainLoopVF 7754 << ", Main Loop UF:" << EPI.MainLoopUF 7755 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7756 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7757 }); 7758 } 7759 7760 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7761 DEBUG_WITH_TYPE(VerboseDebug, { 7762 dbgs() << "intermediate fn:\n" 7763 << *OrigLoop->getHeader()->getParent() << "\n"; 7764 }); 7765 } 7766 7767 BasicBlock * 7768 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7769 bool ForEpilogue) { 7770 assert(Bypass && "Expected valid bypass basic block."); 7771 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7772 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7773 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 7774 // Reuse existing vector loop preheader for TC checks. 7775 // Note that new preheader block is generated for vector loop. 7776 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7777 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7778 7779 // Generate code to check if the loop's trip count is less than VF * UF of the 7780 // main vector loop. 7781 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 7782 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7783 7784 Value *CheckMinIters = Builder.CreateICmp( 7785 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7786 "min.iters.check"); 7787 7788 if (!ForEpilogue) 7789 TCCheckBlock->setName("vector.main.loop.iter.check"); 7790 7791 // Create new preheader for vector loop. 7792 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7793 DT, LI, nullptr, "vector.ph"); 7794 7795 if (ForEpilogue) { 7796 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7797 DT->getNode(Bypass)->getIDom()) && 7798 "TC check is expected to dominate Bypass"); 7799 7800 // Update dominator for Bypass & LoopExit. 7801 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7802 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7803 // For loops with multiple exits, there's no edge from the middle block 7804 // to exit blocks (as the epilogue must run) and thus no need to update 7805 // the immediate dominator of the exit blocks. 7806 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7807 7808 LoopBypassBlocks.push_back(TCCheckBlock); 7809 7810 // Save the trip count so we don't have to regenerate it in the 7811 // vec.epilog.iter.check. This is safe to do because the trip count 7812 // generated here dominates the vector epilog iter check. 7813 EPI.TripCount = Count; 7814 } 7815 7816 ReplaceInstWithInst( 7817 TCCheckBlock->getTerminator(), 7818 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7819 7820 return TCCheckBlock; 7821 } 7822 7823 //===--------------------------------------------------------------------===// 7824 // EpilogueVectorizerEpilogueLoop 7825 //===--------------------------------------------------------------------===// 7826 7827 /// This function is partially responsible for generating the control flow 7828 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
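/// In rough outline: a first iteration-count check skips all vector code when
/// even the epilogue VF is not worth running; a second check guards the main
/// vector loop; and after the main vector loop the remaining iteration count
/// is compared against the epilogue VF to choose between the vectorized
/// epilogue built here and the scalar remainder loop.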
7829 std::pair<BasicBlock *, Value *> 7830 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7831 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7832 createVectorLoopSkeleton("vec.epilog."); 7833 7834 // Now, compare the remaining count and if there aren't enough iterations to 7835 // execute the vectorized epilogue skip to the scalar part. 7836 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7837 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7838 LoopVectorPreHeader = 7839 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7840 LI, nullptr, "vec.epilog.ph"); 7841 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, 7842 VecEpilogueIterationCountCheck); 7843 7844 // Adjust the control flow taking the state info from the main loop 7845 // vectorization into account. 7846 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7847 "expected this to be saved from the previous pass."); 7848 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7849 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7850 7851 DT->changeImmediateDominator(LoopVectorPreHeader, 7852 EPI.MainLoopIterationCountCheck); 7853 7854 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7855 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7856 7857 if (EPI.SCEVSafetyCheck) 7858 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7859 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7860 if (EPI.MemSafetyCheck) 7861 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7862 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7863 7864 DT->changeImmediateDominator( 7865 VecEpilogueIterationCountCheck, 7866 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7867 7868 DT->changeImmediateDominator(LoopScalarPreHeader, 7869 EPI.EpilogueIterationCountCheck); 7870 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7871 // If there is an epilogue which must run, there's no edge from the 7872 // middle block to exit blocks and thus no need to update the immediate 7873 // dominator of the exit blocks. 7874 DT->changeImmediateDominator(LoopExitBlock, 7875 EPI.EpilogueIterationCountCheck); 7876 7877 // Keep track of bypass blocks, as they feed start values to the induction 7878 // phis in the scalar loop preheader. 7879 if (EPI.SCEVSafetyCheck) 7880 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7881 if (EPI.MemSafetyCheck) 7882 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7883 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7884 7885 // The vec.epilog.iter.check block may contain Phi nodes from reductions which 7886 // merge control-flow from the latch block and the middle block. Update the 7887 // incoming values here and move the Phi into the preheader. 
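  // (A typical case is a phi carrying a reduction's partial result out of the
  // main vector loop; after the updates below it lives in vec.epilog.ph and
  // feeds the epilogue loop instead.)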
  SmallVector<PHINode *, 4> PhisInBlock;
  for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
    PhisInBlock.push_back(&Phi);

  for (PHINode *Phi : PhisInBlock) {
    Phi->replaceIncomingBlockWith(
        VecEpilogueIterationCountCheck->getSinglePredecessor(),
        VecEpilogueIterationCountCheck);
    Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
    if (EPI.SCEVSafetyCheck)
      Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
    if (EPI.MemSafetyCheck)
      Phi->removeIncomingValue(EPI.MemSafetyCheck);
    Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
  }

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to the iteration
  // count check, the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues({VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  return {completeLoopSkeleton(OrigLoopID), EPResumeVal};
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of
  // the vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
7943 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7944 7945 Value *CheckMinIters = 7946 Builder.CreateICmp(P, Count, 7947 createStepForVF(Builder, Count->getType(), 7948 EPI.EpilogueVF, EPI.EpilogueUF), 7949 "min.epilog.iters.check"); 7950 7951 ReplaceInstWithInst( 7952 Insert->getTerminator(), 7953 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7954 7955 LoopBypassBlocks.push_back(Insert); 7956 return Insert; 7957 } 7958 7959 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7960 LLVM_DEBUG({ 7961 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 7962 << "Epilogue Loop VF:" << EPI.EpilogueVF 7963 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7964 }); 7965 } 7966 7967 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 7968 DEBUG_WITH_TYPE(VerboseDebug, { 7969 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 7970 }); 7971 } 7972 7973 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7974 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7975 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 7976 bool PredicateAtRangeStart = Predicate(Range.Start); 7977 7978 for (ElementCount TmpVF = Range.Start * 2; 7979 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 7980 if (Predicate(TmpVF) != PredicateAtRangeStart) { 7981 Range.End = TmpVF; 7982 break; 7983 } 7984 7985 return PredicateAtRangeStart; 7986 } 7987 7988 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7989 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7990 /// of VF's starting at a given VF and extending it as much as possible. Each 7991 /// vectorization decision can potentially shorten this sub-range during 7992 /// buildVPlan(). 7993 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 7994 ElementCount MaxVF) { 7995 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 7996 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 7997 VFRange SubRange = {VF, MaxVFPlusOne}; 7998 VPlans.push_back(buildVPlan(SubRange)); 7999 VF = SubRange.End; 8000 } 8001 } 8002 8003 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8004 VPlanPtr &Plan) { 8005 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8006 8007 // Look for cached value. 8008 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8009 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8010 if (ECEntryIt != EdgeMaskCache.end()) 8011 return ECEntryIt->second; 8012 8013 VPValue *SrcMask = createBlockInMask(Src, Plan); 8014 8015 // The terminator has to be a branch inst! 8016 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8017 assert(BI && "Unexpected terminator found"); 8018 8019 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8020 return EdgeMaskCache[Edge] = SrcMask; 8021 8022 // If source is an exiting block, we know the exit edge is dynamically dead 8023 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8024 // adding uses of an otherwise potentially dead instruction. 
8025 if (OrigLoop->isLoopExiting(Src)) 8026 return EdgeMaskCache[Edge] = SrcMask; 8027 8028 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8029 assert(EdgeMask && "No Edge Mask found for condition"); 8030 8031 if (BI->getSuccessor(0) != Dst) 8032 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8033 8034 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8035 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8036 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8037 // The select version does not introduce new UB if SrcMask is false and 8038 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8039 VPValue *False = Plan->getOrAddVPValue( 8040 ConstantInt::getFalse(BI->getCondition()->getType())); 8041 EdgeMask = 8042 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8043 } 8044 8045 return EdgeMaskCache[Edge] = EdgeMask; 8046 } 8047 8048 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8049 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8050 8051 // Look for cached value. 8052 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8053 if (BCEntryIt != BlockMaskCache.end()) 8054 return BCEntryIt->second; 8055 8056 // All-one mask is modelled as no-mask following the convention for masked 8057 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8058 VPValue *BlockMask = nullptr; 8059 8060 if (OrigLoop->getHeader() == BB) { 8061 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8062 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8063 8064 // Introduce the early-exit compare IV <= BTC to form header block mask. 8065 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8066 // constructing the desired canonical IV in the header block as its first 8067 // non-phi instructions. 8068 assert(CM.foldTailByMasking() && "must fold the tail"); 8069 VPBasicBlock *HeaderVPBB = 8070 Plan->getVectorLoopRegion()->getEntryBasicBlock(); 8071 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8072 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); 8073 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); 8074 8075 VPBuilder::InsertPointGuard Guard(Builder); 8076 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8077 if (CM.TTI.emitGetActiveLaneMask()) { 8078 VPValue *TC = Plan->getOrCreateTripCount(); 8079 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); 8080 } else { 8081 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8082 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8083 } 8084 return BlockMaskCache[BB] = BlockMask; 8085 } 8086 8087 // This is the block mask. We OR all incoming edges. 8088 for (auto *Predecessor : predecessors(BB)) { 8089 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8090 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8091 return BlockMaskCache[BB] = EdgeMask; 8092 8093 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
  }

  return BlockMaskCache[BB] = BlockMask;
}

VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
                                                ArrayRef<VPValue *> Operands,
                                                VFRange &Range,
                                                VPlanPtr &Plan) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Must be called with either a load or store");

  auto willWiden = [&](ElementCount VF) -> bool {
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    if (Decision == LoopVectorizationCostModel::CM_Interleave)
      return true;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = createBlockInMask(I->getParent(), Plan);

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  LoopVectorizationCostModel::InstWidening Decision =
      CM.getWideningDecision(I, Range.Start);
  bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
  bool Consecutive =
      Reverse || Decision == LoopVectorizationCostModel::CM_Widen;

  if (LoadInst *Load = dyn_cast<LoadInst>(I))
    return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
                                              Consecutive, Reverse);

  StoreInst *Store = cast<StoreInst>(I);
  return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
                                            Mask, Consecutive, Reverse);
}

/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will
/// also insert a recipe to expand the step for the induction recipe.
static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
    PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start,
    const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM,
    VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) {
  // Returns true if an instruction \p I should be scalarized instead of
  // vectorized for the chosen vectorization factor.
8156 auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { 8157 return CM.isScalarAfterVectorization(I, VF) || 8158 CM.isProfitableToScalarize(I, VF); 8159 }; 8160 8161 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( 8162 [&](ElementCount VF) { 8163 return ShouldScalarizeInstruction(PhiOrTrunc, VF); 8164 }, 8165 Range); 8166 assert(IndDesc.getStartValue() == 8167 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8168 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && 8169 "step must be loop invariant"); 8170 8171 VPValue *Step = 8172 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); 8173 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8174 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI, 8175 !NeedsScalarIVOnly); 8176 } 8177 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8178 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, 8179 !NeedsScalarIVOnly); 8180 } 8181 8182 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( 8183 PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) { 8184 8185 // Check if this is an integer or fp induction. If so, build the recipe that 8186 // produces its scalar and vector values. 8187 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8188 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan, 8189 *PSE.getSE(), *OrigLoop, Range); 8190 8191 // Check if this is pointer induction. If so, build the recipe for it. 8192 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) 8193 return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II, 8194 *PSE.getSE()); 8195 return nullptr; 8196 } 8197 8198 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8199 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) { 8200 // Optimize the special case where the source is a constant integer 8201 // induction variable. Notice that we can only optimize the 'trunc' case 8202 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8203 // (c) other casts depend on pointer size. 8204 8205 // Determine whether \p K is a truncation based on an induction variable that 8206 // can be optimized. 8207 auto isOptimizableIVTruncate = 8208 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8209 return [=](ElementCount VF) -> bool { 8210 return CM.isOptimizableIVTruncate(K, VF); 8211 }; 8212 }; 8213 8214 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8215 isOptimizableIVTruncate(I), Range)) { 8216 8217 auto *Phi = cast<PHINode>(I->getOperand(0)); 8218 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8219 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8220 return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan, 8221 *PSE.getSE(), *OrigLoop, Range); 8222 } 8223 return nullptr; 8224 } 8225 8226 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8227 ArrayRef<VPValue *> Operands, 8228 VPlanPtr &Plan) { 8229 // If all incoming values are equal, the incoming VPValue can be used directly 8230 // instead of creating a new VPBlendRecipe. 8231 VPValue *FirstIncoming = Operands[0]; 8232 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8233 return FirstIncoming == Inc; 8234 })) { 8235 return Operands[0]; 8236 } 8237 8238 unsigned NumIncoming = Phi->getNumIncomingValues(); 8239 // For in-loop reductions, we do not need to create an additional select. 
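  // Conceptually, a two-way phi such as phi [ %a, %bb1 ], [ %b, %bb2 ] is
  // lowered to a select keyed on the incoming edge's mask, e.g.
  //   select(edge-mask(bb1 -> this), %a, %b).
  // The in-loop reduction case handled below can instead forward the
  // non-reduction incoming value directly, with no extra select.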
8240 VPValue *InLoopVal = nullptr; 8241 for (unsigned In = 0; In < NumIncoming; In++) { 8242 PHINode *PhiOp = 8243 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue()); 8244 if (PhiOp && CM.isInLoopReduction(PhiOp)) { 8245 assert(!InLoopVal && "Found more than one in-loop reduction!"); 8246 InLoopVal = Operands[In]; 8247 } 8248 } 8249 8250 assert((!InLoopVal || NumIncoming == 2) && 8251 "Found an in-loop reduction for PHI with unexpected number of " 8252 "incoming values"); 8253 if (InLoopVal) 8254 return Operands[Operands[0] == InLoopVal ? 1 : 0]; 8255 8256 // We know that all PHIs in non-header blocks are converted into selects, so 8257 // we don't have to worry about the insertion order and we can just use the 8258 // builder. At this point we generate the predication tree. There may be 8259 // duplications since this is a simple recursive scan, but future 8260 // optimizations will clean it up. 8261 SmallVector<VPValue *, 2> OperandsWithMask; 8262 8263 for (unsigned In = 0; In < NumIncoming; In++) { 8264 VPValue *EdgeMask = 8265 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8266 assert((EdgeMask || NumIncoming == 1) && 8267 "Multiple predecessors with one having a full mask"); 8268 OperandsWithMask.push_back(Operands[In]); 8269 if (EdgeMask) 8270 OperandsWithMask.push_back(EdgeMask); 8271 } 8272 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8273 } 8274 8275 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8276 ArrayRef<VPValue *> Operands, 8277 VFRange &Range) const { 8278 8279 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8280 [this, CI](ElementCount VF) { 8281 return CM.isScalarWithPredication(CI, VF); 8282 }, 8283 Range); 8284 8285 if (IsPredicated) 8286 return nullptr; 8287 8288 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8289 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8290 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8291 ID == Intrinsic::pseudoprobe || 8292 ID == Intrinsic::experimental_noalias_scope_decl)) 8293 return nullptr; 8294 8295 auto willWiden = [&](ElementCount VF) -> bool { 8296 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8297 // The following case may be scalarized depending on the VF. 8298 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8299 // version of the instruction. 8300 // Is it beneficial to perform intrinsic call compared to lib call? 8301 bool NeedToScalarize = false; 8302 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8303 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; 8304 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8305 return UseVectorIntrinsic || !NeedToScalarize; 8306 }; 8307 8308 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8309 return nullptr; 8310 8311 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8312 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8313 } 8314 8315 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8316 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8317 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8318 // Instruction should be widened, unless it is scalar after vectorization, 8319 // scalarization is profitable or it is predicated. 
8320 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8321 return CM.isScalarAfterVectorization(I, VF) || 8322 CM.isProfitableToScalarize(I, VF) || 8323 CM.isScalarWithPredication(I, VF); 8324 }; 8325 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8326 Range); 8327 } 8328 8329 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8330 ArrayRef<VPValue *> Operands) const { 8331 auto IsVectorizableOpcode = [](unsigned Opcode) { 8332 switch (Opcode) { 8333 case Instruction::Add: 8334 case Instruction::And: 8335 case Instruction::AShr: 8336 case Instruction::BitCast: 8337 case Instruction::FAdd: 8338 case Instruction::FCmp: 8339 case Instruction::FDiv: 8340 case Instruction::FMul: 8341 case Instruction::FNeg: 8342 case Instruction::FPExt: 8343 case Instruction::FPToSI: 8344 case Instruction::FPToUI: 8345 case Instruction::FPTrunc: 8346 case Instruction::FRem: 8347 case Instruction::FSub: 8348 case Instruction::ICmp: 8349 case Instruction::IntToPtr: 8350 case Instruction::LShr: 8351 case Instruction::Mul: 8352 case Instruction::Or: 8353 case Instruction::PtrToInt: 8354 case Instruction::SDiv: 8355 case Instruction::Select: 8356 case Instruction::SExt: 8357 case Instruction::Shl: 8358 case Instruction::SIToFP: 8359 case Instruction::SRem: 8360 case Instruction::Sub: 8361 case Instruction::Trunc: 8362 case Instruction::UDiv: 8363 case Instruction::UIToFP: 8364 case Instruction::URem: 8365 case Instruction::Xor: 8366 case Instruction::ZExt: 8367 case Instruction::Freeze: 8368 return true; 8369 } 8370 return false; 8371 }; 8372 8373 if (!IsVectorizableOpcode(I->getOpcode())) 8374 return nullptr; 8375 8376 // Success: widen this instruction. 8377 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8378 } 8379 8380 void VPRecipeBuilder::fixHeaderPhis() { 8381 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8382 for (VPHeaderPHIRecipe *R : PhisToFix) { 8383 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8384 VPRecipeBase *IncR = 8385 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8386 R->addOperand(IncR->getVPSingleValue()); 8387 } 8388 } 8389 8390 VPBasicBlock *VPRecipeBuilder::handleReplication( 8391 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8392 VPlanPtr &Plan) { 8393 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8394 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8395 Range); 8396 8397 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8398 [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, 8399 Range); 8400 8401 // Even if the instruction is not marked as uniform, there are certain 8402 // intrinsic calls that can be effectively treated as such, so we check for 8403 // them here. Conservatively, we only do this for scalable vectors, since 8404 // for fixed-width VFs we can always fall back on full scalarization. 8405 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8406 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8407 case Intrinsic::assume: 8408 case Intrinsic::lifetime_start: 8409 case Intrinsic::lifetime_end: 8410 // For scalable vectors if one of the operands is variant then we still 8411 // want to mark as uniform, which will generate one instruction for just 8412 // the first lane of the vector. We can't scalarize the call in the same 8413 // way as for fixed-width vectors because we don't know how many lanes 8414 // there are. 
8415 // 8416 // The reasons for doing it this way for scalable vectors are: 8417 // 1. For the assume intrinsic generating the instruction for the first 8418 // lane is still be better than not generating any at all. For 8419 // example, the input may be a splat across all lanes. 8420 // 2. For the lifetime start/end intrinsics the pointer operand only 8421 // does anything useful when the input comes from a stack object, 8422 // which suggests it should always be uniform. For non-stack objects 8423 // the effect is to poison the object, which still allows us to 8424 // remove the call. 8425 IsUniform = true; 8426 break; 8427 default: 8428 break; 8429 } 8430 } 8431 8432 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8433 IsUniform, IsPredicated); 8434 setRecipe(I, Recipe); 8435 Plan->addVPValue(I, Recipe); 8436 8437 // Find if I uses a predicated instruction. If so, it will use its scalar 8438 // value. Avoid hoisting the insert-element which packs the scalar value into 8439 // a vector value, as that happens iff all users use the vector value. 8440 for (VPValue *Op : Recipe->operands()) { 8441 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8442 if (!PredR) 8443 continue; 8444 auto *RepR = 8445 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8446 assert(RepR->isPredicated() && 8447 "expected Replicate recipe to be predicated"); 8448 RepR->setAlsoPack(false); 8449 } 8450 8451 // Finalize the recipe for Instr, first if it is not predicated. 8452 if (!IsPredicated) { 8453 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8454 VPBB->appendRecipe(Recipe); 8455 return VPBB; 8456 } 8457 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8458 8459 VPBlockBase *SingleSucc = VPBB->getSingleSuccessor(); 8460 assert(SingleSucc && "VPBB must have a single successor when handling " 8461 "predicated replication."); 8462 VPBlockUtils::disconnectBlocks(VPBB, SingleSucc); 8463 // Record predicated instructions for above packing optimizations. 8464 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8465 VPBlockUtils::insertBlockAfter(Region, VPBB); 8466 auto *RegSucc = new VPBasicBlock(); 8467 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8468 VPBlockUtils::connectBlocks(RegSucc, SingleSucc); 8469 return RegSucc; 8470 } 8471 8472 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8473 VPRecipeBase *PredRecipe, 8474 VPlanPtr &Plan) { 8475 // Instructions marked for predication are replicated and placed under an 8476 // if-then construct to prevent side-effects. 8477 8478 // Generate recipes to compute the block mask for this region. 8479 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8480 8481 // Build the triangular if-then region. 8482 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8483 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8484 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8485 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8486 auto *PHIRecipe = Instr->getType()->isVoidTy() 8487 ? 
nullptr 8488 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8489 if (PHIRecipe) { 8490 Plan->removeVPValueFor(Instr); 8491 Plan->addVPValue(Instr, PHIRecipe); 8492 } 8493 auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8494 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8495 VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true); 8496 8497 // Note: first set Entry as region entry and then connect successors starting 8498 // from it in order, to propagate the "parent" of each VPBasicBlock. 8499 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry); 8500 VPBlockUtils::connectBlocks(Pred, Exiting); 8501 8502 return Region; 8503 } 8504 8505 VPRecipeOrVPValueTy 8506 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8507 ArrayRef<VPValue *> Operands, 8508 VFRange &Range, VPlanPtr &Plan) { 8509 // First, check for specific widening recipes that deal with inductions, Phi 8510 // nodes, calls and memory operations. 8511 VPRecipeBase *Recipe; 8512 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8513 if (Phi->getParent() != OrigLoop->getHeader()) 8514 return tryToBlend(Phi, Operands, Plan); 8515 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) 8516 return toVPRecipeResult(Recipe); 8517 8518 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8519 assert((Legal->isReductionVariable(Phi) || 8520 Legal->isFirstOrderRecurrence(Phi)) && 8521 "can only widen reductions and first-order recurrences here"); 8522 VPValue *StartV = Operands[0]; 8523 if (Legal->isReductionVariable(Phi)) { 8524 const RecurrenceDescriptor &RdxDesc = 8525 Legal->getReductionVars().find(Phi)->second; 8526 assert(RdxDesc.getRecurrenceStartValue() == 8527 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8528 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8529 CM.isInLoopReduction(Phi), 8530 CM.useOrderedReductions(RdxDesc)); 8531 } else { 8532 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8533 } 8534 8535 // Record the incoming value from the backedge, so we can add the incoming 8536 // value from the backedge after all recipes have been created. 8537 recordRecipeOf(cast<Instruction>( 8538 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8539 PhisToFix.push_back(PhiRecipe); 8540 return toVPRecipeResult(PhiRecipe); 8541 } 8542 8543 if (isa<TruncInst>(Instr) && 8544 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8545 Range, *Plan))) 8546 return toVPRecipeResult(Recipe); 8547 8548 // All widen recipes below deal only with VF > 1. 
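  // Illustrative note (example range, not from the surrounding code): if Range
  // starts at VF=1, the check below is true for the scalar VF only, so
  // getDecisionAndClampRange clamps Range.End to 2 and we return nullptr for
  // that scalar-only sub-plan; the vector VFs are handled by the next
  // sub-range built in buildVPlansWithVPRecipes.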
  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          [&](ElementCount VF) { return VF.isScalar(); }, Range))
    return nullptr;

  if (auto *CI = dyn_cast<CallInst>(Instr))
    return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return toVPRecipeResult(new VPWidenGEPRecipe(
        GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return toVPRecipeResult(new VPWidenSelectRecipe(
        *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
  }

  return toVPRecipeResult(tryToWiden(Instr, Operands));
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Collect instructions from the original loop that will become trivially
  // dead in the vectorized loop. We don't need to vectorize these
  // instructions. For example, original induction update instructions can
  // become dead because we separately emit induction "steps" when generating
  // code for the new loop. Similarly, we create a new latch condition when
  // setting up the structure of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  // Cannot sink instructions after dead instructions (there won't be any
  // recipes for them). Instead, find the first non-dead previous instruction.
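  // For example, if a recorded sink target is an induction update that became
  // trivially dead, the loop below walks backwards from it to the closest
  // preceding live instruction and uses that as the new sink target.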
8603 for (auto &P : Legal->getSinkAfter()) { 8604 Instruction *SinkTarget = P.second; 8605 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 8606 (void)FirstInst; 8607 while (DeadInstructions.contains(SinkTarget)) { 8608 assert( 8609 SinkTarget != FirstInst && 8610 "Must find a live instruction (at least the one feeding the " 8611 "first-order recurrence PHI) before reaching beginning of the block"); 8612 SinkTarget = SinkTarget->getPrevNode(); 8613 assert(SinkTarget != P.first && 8614 "sink source equals target, no sinking required"); 8615 } 8616 P.second = SinkTarget; 8617 } 8618 8619 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8620 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8621 VFRange SubRange = {VF, MaxVFPlusOne}; 8622 VPlans.push_back( 8623 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8624 VF = SubRange.End; 8625 } 8626 } 8627 8628 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a 8629 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a 8630 // BranchOnCount VPInstruction to the latch. 8631 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, 8632 bool HasNUW) { 8633 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8634 auto *StartV = Plan.getOrAddVPValue(StartIdx); 8635 8636 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8637 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8638 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8639 Header->insert(CanonicalIVPHI, Header->begin()); 8640 8641 auto *CanonicalIVIncrement = 8642 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW 8643 : VPInstruction::CanonicalIVIncrement, 8644 {CanonicalIVPHI}, DL); 8645 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8646 8647 VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); 8648 EB->appendRecipe(CanonicalIVIncrement); 8649 8650 auto *BranchOnCount = 8651 new VPInstruction(VPInstruction::BranchOnCount, 8652 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8653 EB->appendRecipe(BranchOnCount); 8654 } 8655 8656 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the 8657 // original exit block. 8658 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, 8659 VPBasicBlock *MiddleVPBB, Loop *OrigLoop, 8660 VPlan &Plan) { 8661 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); 8662 BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); 8663 // Only handle single-exit loops with unique exit blocks for now. 8664 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) 8665 return; 8666 8667 // Introduce VPUsers modeling the exit values. 8668 for (PHINode &ExitPhi : ExitBB->phis()) { 8669 Value *IncomingValue = 8670 ExitPhi.getIncomingValueForBlock(ExitingBB); 8671 VPValue *V = Plan.getOrAddVPValue(IncomingValue, true); 8672 Plan.addLiveOut(&ExitPhi, V); 8673 } 8674 } 8675 8676 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8677 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8678 const MapVector<Instruction *, Instruction *> &SinkAfter) { 8679 8680 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8681 8682 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8683 8684 // --------------------------------------------------------------------------- 8685 // Pre-construction: record ingredients whose recipes we'll need to further 8686 // process after constructing the initial VPlan. 
8687 // --------------------------------------------------------------------------- 8688 8689 // Mark instructions we'll need to sink later and their targets as 8690 // ingredients whose recipe we'll need to record. 8691 for (auto &Entry : SinkAfter) { 8692 RecipeBuilder.recordRecipeOf(Entry.first); 8693 RecipeBuilder.recordRecipeOf(Entry.second); 8694 } 8695 for (auto &Reduction : CM.getInLoopReductionChains()) { 8696 PHINode *Phi = Reduction.first; 8697 RecurKind Kind = 8698 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); 8699 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8700 8701 RecipeBuilder.recordRecipeOf(Phi); 8702 for (auto &R : ReductionOperations) { 8703 RecipeBuilder.recordRecipeOf(R); 8704 // For min/max reductions, where we have a pair of icmp/select, we also 8705 // need to record the ICmp recipe, so it can be removed later. 8706 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 8707 "Only min/max recurrences allowed for inloop reductions"); 8708 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 8709 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8710 } 8711 } 8712 8713 // For each interleave group which is relevant for this (possibly trimmed) 8714 // Range, add it to the set of groups to be later applied to the VPlan and add 8715 // placeholders for its members' Recipes which we'll be replacing with a 8716 // single VPInterleaveRecipe. 8717 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8718 auto applyIG = [IG, this](ElementCount VF) -> bool { 8719 return (VF.isVector() && // Query is illegal for VF == 1 8720 CM.getWideningDecision(IG->getInsertPos(), VF) == 8721 LoopVectorizationCostModel::CM_Interleave); 8722 }; 8723 if (!getDecisionAndClampRange(applyIG, Range)) 8724 continue; 8725 InterleaveGroups.insert(IG); 8726 for (unsigned i = 0; i < IG->getFactor(); i++) 8727 if (Instruction *Member = IG->getMember(i)) 8728 RecipeBuilder.recordRecipeOf(Member); 8729 }; 8730 8731 // --------------------------------------------------------------------------- 8732 // Build initial VPlan: Scan the body of the loop in a topological order to 8733 // visit each basic block after having visited its predecessor basic blocks. 8734 // --------------------------------------------------------------------------- 8735 8736 // Create initial VPlan skeleton, starting with a block for the pre-header, 8737 // followed by a region for the vector loop, followed by the middle block. The 8738 // skeleton vector loop region contains a header and latch block. 8739 VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); 8740 auto Plan = std::make_unique<VPlan>(Preheader); 8741 8742 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); 8743 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 8744 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 8745 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); 8746 VPBlockUtils::insertBlockAfter(TopRegion, Preheader); 8747 VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); 8748 VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); 8749 8750 Instruction *DLInst = 8751 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 8752 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), 8753 DLInst ? 
DLInst->getDebugLoc() : DebugLoc(), 8754 !CM.foldTailByMasking()); 8755 8756 // Scan the body of the loop in a topological order to visit each basic block 8757 // after having visited its predecessor basic blocks. 8758 LoopBlocksDFS DFS(OrigLoop); 8759 DFS.perform(LI); 8760 8761 VPBasicBlock *VPBB = HeaderVPBB; 8762 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 8763 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8764 // Relevant instructions from basic block BB will be grouped into VPRecipe 8765 // ingredients and fill a new VPBasicBlock. 8766 unsigned VPBBsForBB = 0; 8767 if (VPBB != HeaderVPBB) 8768 VPBB->setName(BB->getName()); 8769 Builder.setInsertPoint(VPBB); 8770 8771 // Introduce each ingredient into VPlan. 8772 // TODO: Model and preserve debug intrinsics in VPlan. 8773 for (Instruction &I : BB->instructionsWithoutDebug()) { 8774 Instruction *Instr = &I; 8775 8776 // First filter out irrelevant instructions, to ensure no recipes are 8777 // built for them. 8778 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 8779 continue; 8780 8781 SmallVector<VPValue *, 4> Operands; 8782 auto *Phi = dyn_cast<PHINode>(Instr); 8783 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 8784 Operands.push_back(Plan->getOrAddVPValue( 8785 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 8786 } else { 8787 auto OpRange = Plan->mapToVPValues(Instr->operands()); 8788 Operands = {OpRange.begin(), OpRange.end()}; 8789 } 8790 8791 // Invariant stores inside loop will be deleted and a single store 8792 // with the final reduction value will be added to the exit block 8793 StoreInst *SI; 8794 if ((SI = dyn_cast<StoreInst>(&I)) && 8795 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 8796 continue; 8797 8798 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 8799 Instr, Operands, Range, Plan)) { 8800 // If Instr can be simplified to an existing VPValue, use it. 8801 if (RecipeOrValue.is<VPValue *>()) { 8802 auto *VPV = RecipeOrValue.get<VPValue *>(); 8803 Plan->addVPValue(Instr, VPV); 8804 // If the re-used value is a recipe, register the recipe for the 8805 // instruction, in case the recipe for Instr needs to be recorded. 8806 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 8807 RecipeBuilder.setRecipe(Instr, R); 8808 continue; 8809 } 8810 // Otherwise, add the new recipe. 8811 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 8812 for (auto *Def : Recipe->definedValues()) { 8813 auto *UV = Def->getUnderlyingValue(); 8814 Plan->addVPValue(UV, Def); 8815 } 8816 8817 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 8818 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 8819 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 8820 // of the header block. That can happen for truncates of induction 8821 // variables. Those recipes are moved to the phi section of the header 8822 // block after applying SinkAfter, which relies on the original 8823 // position of the trunc. 8824 assert(isa<TruncInst>(Instr)); 8825 InductionsToMove.push_back( 8826 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 8827 } 8828 RecipeBuilder.setRecipe(Instr, Recipe); 8829 VPBB->appendRecipe(Recipe); 8830 continue; 8831 } 8832 8833 // Otherwise, if all widening options failed, Instruction is to be 8834 // replicated. This may create a successor for VPBB. 
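        // For example, a store that must be predicated is wrapped by
        // handleReplication in a triangular "pred.store" region
        // (pred.store.entry / pred.store.if / pred.store.continue, cf.
        // createReplicateRegion above), and subsequent recipes are emitted
        // into a fresh successor VPBasicBlock.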
8835 VPBasicBlock *NextVPBB = 8836 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 8837 if (NextVPBB != VPBB) { 8838 VPBB = NextVPBB; 8839 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 8840 : ""); 8841 } 8842 } 8843 8844 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 8845 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 8846 } 8847 8848 HeaderVPBB->setName("vector.body"); 8849 8850 // Fold the last, empty block into its predecessor. 8851 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 8852 assert(VPBB && "expected to fold last (empty) block"); 8853 // After here, VPBB should not be used. 8854 VPBB = nullptr; 8855 8856 addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); 8857 8858 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 8859 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 8860 "entry block must be set to a VPRegionBlock having a non-empty entry " 8861 "VPBasicBlock"); 8862 RecipeBuilder.fixHeaderPhis(); 8863 8864 // --------------------------------------------------------------------------- 8865 // Transform initial VPlan: Apply previously taken decisions, in order, to 8866 // bring the VPlan to its final state. 8867 // --------------------------------------------------------------------------- 8868 8869 // Apply Sink-After legal constraints. 8870 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 8871 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 8872 if (Region && Region->isReplicator()) { 8873 assert(Region->getNumSuccessors() == 1 && 8874 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 8875 assert(R->getParent()->size() == 1 && 8876 "A recipe in an original replicator region must be the only " 8877 "recipe in its block"); 8878 return Region; 8879 } 8880 return nullptr; 8881 }; 8882 for (auto &Entry : SinkAfter) { 8883 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 8884 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 8885 8886 auto *TargetRegion = GetReplicateRegion(Target); 8887 auto *SinkRegion = GetReplicateRegion(Sink); 8888 if (!SinkRegion) { 8889 // If the sink source is not a replicate region, sink the recipe directly. 8890 if (TargetRegion) { 8891 // The target is in a replication region, make sure to move Sink to 8892 // the block after it, not into the replication region itself. 8893 VPBasicBlock *NextBlock = 8894 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 8895 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8896 } else 8897 Sink->moveAfter(Target); 8898 continue; 8899 } 8900 8901 // The sink source is in a replicate region. Unhook the region from the CFG. 8902 auto *SinkPred = SinkRegion->getSinglePredecessor(); 8903 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 8904 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 8905 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 8906 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 8907 8908 if (TargetRegion) { 8909 // The target recipe is also in a replicate region, move the sink region 8910 // after the target region. 
8911 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 8912 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 8913 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 8914 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 8915 } else { 8916 // The sink source is in a replicate region, we need to move the whole 8917 // replicate region, which should only contain a single recipe in the 8918 // main block. 8919 auto *SplitBlock = 8920 Target->getParent()->splitAt(std::next(Target->getIterator())); 8921 8922 auto *SplitPred = SplitBlock->getSinglePredecessor(); 8923 8924 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 8925 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 8926 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 8927 } 8928 } 8929 8930 VPlanTransforms::removeRedundantCanonicalIVs(*Plan); 8931 VPlanTransforms::removeRedundantInductionCasts(*Plan); 8932 8933 // Now that sink-after is done, move induction recipes for optimized truncates 8934 // to the phi section of the header block. 8935 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 8936 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 8937 8938 // Adjust the recipes for any inloop reductions. 8939 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan, 8940 RecipeBuilder, Range.Start); 8941 8942 // Introduce a recipe to combine the incoming and previous values of a 8943 // first-order recurrence. 8944 for (VPRecipeBase &R : 8945 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 8946 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 8947 if (!RecurPhi) 8948 continue; 8949 8950 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 8951 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 8952 auto *Region = GetReplicateRegion(PrevRecipe); 8953 if (Region) 8954 InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor()); 8955 if (!InsertBlock) { 8956 InsertBlock = new VPBasicBlock(Region->getName() + ".succ"); 8957 VPBlockUtils::insertBlockAfter(InsertBlock, Region); 8958 } 8959 if (Region || PrevRecipe->isPhi()) 8960 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 8961 else 8962 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 8963 8964 auto *RecurSplice = cast<VPInstruction>( 8965 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 8966 {RecurPhi, RecurPhi->getBackedgeValue()})); 8967 8968 RecurPhi->replaceAllUsesWith(RecurSplice); 8969 // Set the first operand of RecurSplice to RecurPhi again, after replacing 8970 // all users. 8971 RecurSplice->setOperand(0, RecurPhi); 8972 } 8973 8974 // Interleave memory: for each Interleave Group we marked earlier as relevant 8975 // for this VPlan, replace the Recipes widening its memory instructions with a 8976 // single VPInterleaveRecipe at its insertion point. 
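  // Illustrative example (access pattern assumed, not taken from this code):
  // loads of a[2*i] and a[2*i+1] form one interleave group with factor 2; the
  // two VPWidenMemoryInstructionRecipes are erased below and replaced by a
  // single VPInterleaveRecipe at the group's insert position, which later
  // emits one wide load plus shuffles.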
8977 for (auto IG : InterleaveGroups) { 8978 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8979 RecipeBuilder.getRecipe(IG->getInsertPos())); 8980 SmallVector<VPValue *, 4> StoredValues; 8981 for (unsigned i = 0; i < IG->getFactor(); ++i) 8982 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 8983 auto *StoreR = 8984 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 8985 StoredValues.push_back(StoreR->getStoredValue()); 8986 } 8987 8988 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8989 Recipe->getMask()); 8990 VPIG->insertBefore(Recipe); 8991 unsigned J = 0; 8992 for (unsigned i = 0; i < IG->getFactor(); ++i) 8993 if (Instruction *Member = IG->getMember(i)) { 8994 if (!Member->getType()->isVoidTy()) { 8995 VPValue *OriginalV = Plan->getVPValue(Member); 8996 Plan->removeVPValueFor(Member); 8997 Plan->addVPValue(Member, VPIG->getVPValue(J)); 8998 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8999 J++; 9000 } 9001 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9002 } 9003 } 9004 9005 std::string PlanName; 9006 raw_string_ostream RSO(PlanName); 9007 ElementCount VF = Range.Start; 9008 Plan->addVF(VF); 9009 RSO << "Initial VPlan for VF={" << VF; 9010 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9011 Plan->addVF(VF); 9012 RSO << "," << VF; 9013 } 9014 RSO << "},UF>=1"; 9015 RSO.flush(); 9016 Plan->setName(PlanName); 9017 9018 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9019 // in ways that accessing values using original IR values is incorrect. 9020 Plan->disableValue2VPValue(); 9021 9022 VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); 9023 VPlanTransforms::sinkScalarOperands(*Plan); 9024 VPlanTransforms::mergeReplicateRegions(*Plan); 9025 VPlanTransforms::removeDeadRecipes(*Plan); 9026 VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); 9027 9028 // Fold Exit block into its predecessor if possible. 9029 // TODO: Fold block earlier once all VPlan transforms properly maintain a 9030 // VPBasicBlock as exit. 9031 VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting()); 9032 9033 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 9034 return Plan; 9035 } 9036 9037 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9038 // Outer loop handling: They may require CFG and instruction level 9039 // transformations before even evaluating whether vectorization is profitable. 9040 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9041 // the vectorization pipeline. 9042 assert(!OrigLoop->isInnermost()); 9043 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9044 9045 // Create new empty VPlan 9046 auto Plan = std::make_unique<VPlan>(); 9047 9048 // Build hierarchical CFG 9049 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9050 HCFGBuilder.buildHierarchicalCFG(); 9051 9052 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9053 VF *= 2) 9054 Plan->addVF(VF); 9055 9056 SmallPtrSet<Instruction *, 1> DeadInstructions; 9057 VPlanTransforms::VPInstructionsToVPRecipes( 9058 OrigLoop, Plan, 9059 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 9060 DeadInstructions, *PSE.getSE()); 9061 9062 // Remove the existing terminator of the exiting block of the top-most region. 9063 // A BranchOnCount will be added instead when adding the canonical IV recipes. 
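  // After addCanonicalIVRecipes below, the exiting block of the region ends
  // with the canonical IV increment followed by a BranchOnCount against the
  // vector trip count, mirroring the inner-loop path above.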
  auto *Term =
      Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
  Term->eraseFromParent();

  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
                        true);
  return Plan;
}

// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi needs to be
// converted to reductions, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and live-out recipes when folding the tail.
void LoopVectorizationPlanner::adjustRecipesForReductions(
    VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
    ElementCount MinVF) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    const RecurrenceDescriptor &RdxDesc =
        Legal->getReductionVars().find(Phi)->second;
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
      continue;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
             "Only min/max recurrences allowed for inloop reductions");
      // Recognize a call to the llvm.fmuladd intrinsic.
      bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
      assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
             "Expected instruction to be a call to the llvm.fmuladd intrinsic");
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSelectSC");
        FirstOpId = 1;
      } else {
        assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
                (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
               "Expected to replace a VPWidenSC");
        FirstOpId = 0;
      }
      unsigned VecOpId =
          R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
      VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));

      auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
                         ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
                         : nullptr;

      if (IsFMulAdd) {
        // If the instruction is a call to the llvm.fmuladd intrinsic then we
        // need to create an fmul recipe to use as the vector operand for the
        // fadd reduction.
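        // For example, a call such as
        //   %s = call fast float @llvm.fmuladd.f32(float %a, float %b, float %sum)
        // becomes an FMul VPInstruction computing %a * %b, which is then used
        // as the VecOp of an fadd reduction chained through %sum.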
9129 VPInstruction *FMulRecipe = new VPInstruction( 9130 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9131 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9132 WidenRecipe->getParent()->insert(FMulRecipe, 9133 WidenRecipe->getIterator()); 9134 VecOp = FMulRecipe; 9135 } 9136 VPReductionRecipe *RedRecipe = 9137 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9138 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9139 Plan->removeVPValueFor(R); 9140 Plan->addVPValue(R, RedRecipe); 9141 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9142 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9143 WidenRecipe->eraseFromParent(); 9144 9145 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9146 VPRecipeBase *CompareRecipe = 9147 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9148 assert(isa<VPWidenRecipe>(CompareRecipe) && 9149 "Expected to replace a VPWidenSC"); 9150 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9151 "Expected no remaining users"); 9152 CompareRecipe->eraseFromParent(); 9153 } 9154 Chain = R; 9155 } 9156 } 9157 9158 // If tail is folded by masking, introduce selects between the phi 9159 // and the live-out instruction of each reduction, at the beginning of the 9160 // dedicated latch block. 9161 if (CM.foldTailByMasking()) { 9162 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9163 for (VPRecipeBase &R : 9164 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9165 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9166 if (!PhiR || PhiR->isInLoop()) 9167 continue; 9168 VPValue *Cond = 9169 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9170 VPValue *Red = PhiR->getBackedgeValue(); 9171 assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB && 9172 "reduction recipe must be defined before latch"); 9173 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9174 } 9175 } 9176 } 9177 9178 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9179 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9180 VPSlotTracker &SlotTracker) const { 9181 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9182 IG->getInsertPos()->printAsOperand(O, false); 9183 O << ", "; 9184 getAddr()->printAsOperand(O, SlotTracker); 9185 VPValue *Mask = getMask(); 9186 if (Mask) { 9187 O << ", "; 9188 Mask->printAsOperand(O, SlotTracker); 9189 } 9190 9191 unsigned OpIdx = 0; 9192 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9193 if (!IG->getMember(i)) 9194 continue; 9195 if (getNumStoreOperands() > 0) { 9196 O << "\n" << Indent << " store "; 9197 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9198 O << " to index " << i; 9199 } else { 9200 O << "\n" << Indent << " "; 9201 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9202 O << " = load from index " << i; 9203 } 9204 ++OpIdx; 9205 } 9206 } 9207 #endif 9208 9209 void VPWidenCallRecipe::execute(VPTransformState &State) { 9210 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9211 *this, State); 9212 } 9213 9214 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9215 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9216 State.setDebugLocFromInst(&I); 9217 9218 // The condition can be loop invariant but still defined inside the 9219 // loop. This means that we can't just use the original 'cond' value. 9220 // We have to take the 'vectorized' value and pick the first lane. 
9221 // Instcombine will make this a no-op. 9222 auto *InvarCond = 9223 InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; 9224 9225 for (unsigned Part = 0; Part < State.UF; ++Part) { 9226 Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part); 9227 Value *Op0 = State.get(getOperand(1), Part); 9228 Value *Op1 = State.get(getOperand(2), Part); 9229 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 9230 State.set(this, Sel, Part); 9231 State.addMetadata(Sel, &I); 9232 } 9233 } 9234 9235 void VPWidenRecipe::execute(VPTransformState &State) { 9236 auto &I = *cast<Instruction>(getUnderlyingValue()); 9237 auto &Builder = State.Builder; 9238 switch (I.getOpcode()) { 9239 case Instruction::Call: 9240 case Instruction::Br: 9241 case Instruction::PHI: 9242 case Instruction::GetElementPtr: 9243 case Instruction::Select: 9244 llvm_unreachable("This instruction is handled by a different recipe."); 9245 case Instruction::UDiv: 9246 case Instruction::SDiv: 9247 case Instruction::SRem: 9248 case Instruction::URem: 9249 case Instruction::Add: 9250 case Instruction::FAdd: 9251 case Instruction::Sub: 9252 case Instruction::FSub: 9253 case Instruction::FNeg: 9254 case Instruction::Mul: 9255 case Instruction::FMul: 9256 case Instruction::FDiv: 9257 case Instruction::FRem: 9258 case Instruction::Shl: 9259 case Instruction::LShr: 9260 case Instruction::AShr: 9261 case Instruction::And: 9262 case Instruction::Or: 9263 case Instruction::Xor: { 9264 // Just widen unops and binops. 9265 State.setDebugLocFromInst(&I); 9266 9267 for (unsigned Part = 0; Part < State.UF; ++Part) { 9268 SmallVector<Value *, 2> Ops; 9269 for (VPValue *VPOp : operands()) 9270 Ops.push_back(State.get(VPOp, Part)); 9271 9272 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 9273 9274 if (auto *VecOp = dyn_cast<Instruction>(V)) { 9275 VecOp->copyIRFlags(&I); 9276 9277 // If the instruction is vectorized and was in a basic block that needed 9278 // predication, we can't propagate poison-generating flags (nuw/nsw, 9279 // exact, etc.). The control flow has been linearized and the 9280 // instruction is no longer guarded by the predicate, which could make 9281 // the flag properties to no longer hold. 9282 if (State.MayGeneratePoisonRecipes.contains(this)) 9283 VecOp->dropPoisonGeneratingFlags(); 9284 } 9285 9286 // Use this vector value for all users of the original instruction. 9287 State.set(this, V, Part); 9288 State.addMetadata(V, &I); 9289 } 9290 9291 break; 9292 } 9293 case Instruction::Freeze: { 9294 State.setDebugLocFromInst(&I); 9295 9296 for (unsigned Part = 0; Part < State.UF; ++Part) { 9297 Value *Op = State.get(getOperand(0), Part); 9298 9299 Value *Freeze = Builder.CreateFreeze(Op); 9300 State.set(this, Freeze, Part); 9301 } 9302 break; 9303 } 9304 case Instruction::ICmp: 9305 case Instruction::FCmp: { 9306 // Widen compares. Generate vector compares. 9307 bool FCmp = (I.getOpcode() == Instruction::FCmp); 9308 auto *Cmp = cast<CmpInst>(&I); 9309 State.setDebugLocFromInst(Cmp); 9310 for (unsigned Part = 0; Part < State.UF; ++Part) { 9311 Value *A = State.get(getOperand(0), Part); 9312 Value *B = State.get(getOperand(1), Part); 9313 Value *C = nullptr; 9314 if (FCmp) { 9315 // Propagate fast math flags. 
9316 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9317 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9318 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9319 } else { 9320 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9321 } 9322 State.set(this, C, Part); 9323 State.addMetadata(C, &I); 9324 } 9325 9326 break; 9327 } 9328 9329 case Instruction::ZExt: 9330 case Instruction::SExt: 9331 case Instruction::FPToUI: 9332 case Instruction::FPToSI: 9333 case Instruction::FPExt: 9334 case Instruction::PtrToInt: 9335 case Instruction::IntToPtr: 9336 case Instruction::SIToFP: 9337 case Instruction::UIToFP: 9338 case Instruction::Trunc: 9339 case Instruction::FPTrunc: 9340 case Instruction::BitCast: { 9341 auto *CI = cast<CastInst>(&I); 9342 State.setDebugLocFromInst(CI); 9343 9344 /// Vectorize casts. 9345 Type *DestTy = (State.VF.isScalar()) 9346 ? CI->getType() 9347 : VectorType::get(CI->getType(), State.VF); 9348 9349 for (unsigned Part = 0; Part < State.UF; ++Part) { 9350 Value *A = State.get(getOperand(0), Part); 9351 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9352 State.set(this, Cast, Part); 9353 State.addMetadata(Cast, &I); 9354 } 9355 break; 9356 } 9357 default: 9358 // This instruction is not vectorized by simple widening. 9359 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9360 llvm_unreachable("Unhandled instruction!"); 9361 } // end of switch. 9362 } 9363 9364 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9365 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9366 // Construct a vector GEP by widening the operands of the scalar GEP as 9367 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9368 // results in a vector of pointers when at least one operand of the GEP 9369 // is vector-typed. Thus, to keep the representation compact, we only use 9370 // vector-typed operands for loop-varying values. 9371 9372 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9373 // If we are vectorizing, but the GEP has only loop-invariant operands, 9374 // the GEP we build (by only using vector-typed operands for 9375 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9376 // produce a vector of pointers, we need to either arbitrarily pick an 9377 // operand to broadcast, or broadcast a clone of the original GEP. 9378 // Here, we broadcast a clone of the original. 9379 // 9380 // TODO: If at some point we decide to scalarize instructions having 9381 // loop-invariant operands, this special case will no longer be 9382 // required. We would add the scalarization decision to 9383 // collectLoopScalars() and teach getVectorValue() to broadcast 9384 // the lane-zero scalar value. 9385 auto *Clone = State.Builder.Insert(GEP->clone()); 9386 for (unsigned Part = 0; Part < State.UF; ++Part) { 9387 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9388 State.set(this, EntryPart, Part); 9389 State.addMetadata(EntryPart, GEP); 9390 } 9391 } else { 9392 // If the GEP has at least one loop-varying operand, we are sure to 9393 // produce a vector of pointers. But if we are only unrolling, we want 9394 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9395 // produce with the code below will be scalar (if VF == 1) or vector 9396 // (otherwise). Note that for the unroll-only case, we still maintain 9397 // values in the vector mapping with initVector, as we do for other 9398 // instructions. 
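    // Illustrative, hypothetical IR: for
    //   %p = getelementptr inbounds float, float* %base, i64 %iv
    // with a loop-invariant %base, only the index is widened, so each part
    // below builds a <VF x float*> vector of addresses from the scalar base
    // and a vector of indices.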
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = IsPtrLoopInvariant
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (IsIndexLoopInvariant[I - 1])
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // If the GEP instruction is vectorized and was in a basic block that
      // needed predication, we can't propagate the poison-generating
      // 'inbounds' flag. The control flow has been linearized and the GEP is
      // no longer guarded by the predicate, which could cause the 'inbounds'
      // property to no longer hold.
      bool IsInBounds =
          GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector otherwise.
      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                             Indices, "", IsInBounds);
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.addMetadata(NewGEP, GEP);
    }
  }
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step value.
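  // For example (assuming start 0, step 1 and VF = 4): SteppedStart below is
  // <0, 1, 2, 3>, and each further unroll part adds a splat of VF * Step
  // (<4, 4, 4, 4>) to obtain that part's vector induction value.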
9457 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9458 9459 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 9460 "Expected either an induction phi-node or a truncate of it!"); 9461 9462 // Construct the initial value of the vector IV in the vector loop preheader 9463 auto CurrIP = Builder.saveIP(); 9464 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9465 Builder.SetInsertPoint(VectorPH->getTerminator()); 9466 if (isa<TruncInst>(EntryVal)) { 9467 assert(Start->getType()->isIntegerTy() && 9468 "Truncation requires an integer type"); 9469 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 9470 Step = Builder.CreateTrunc(Step, TruncType); 9471 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 9472 } 9473 9474 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 9475 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 9476 Value *SteppedStart = getStepVector( 9477 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); 9478 9479 // We create vector phi nodes for both integer and floating-point induction 9480 // variables. Here, we determine the kind of arithmetic we will perform. 9481 Instruction::BinaryOps AddOp; 9482 Instruction::BinaryOps MulOp; 9483 if (Step->getType()->isIntegerTy()) { 9484 AddOp = Instruction::Add; 9485 MulOp = Instruction::Mul; 9486 } else { 9487 AddOp = ID.getInductionOpcode(); 9488 MulOp = Instruction::FMul; 9489 } 9490 9491 // Multiply the vectorization factor by the step using integer or 9492 // floating-point arithmetic as appropriate. 9493 Type *StepType = Step->getType(); 9494 Value *RuntimeVF; 9495 if (Step->getType()->isFloatingPointTy()) 9496 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 9497 else 9498 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 9499 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 9500 9501 // Create a vector splat to use in the induction update. 9502 // 9503 // FIXME: If the step is non-constant, we create the vector splat with 9504 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 9505 // handle a constant vector splat. 9506 Value *SplatVF = isa<Constant>(Mul) 9507 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 9508 : Builder.CreateVectorSplat(State.VF, Mul); 9509 Builder.restoreIP(CurrIP); 9510 9511 // We may need to add the step a number of times, depending on the unroll 9512 // factor. The last of those goes into the PHI. 9513 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 9514 &*State.CFG.PrevBB->getFirstInsertionPt()); 9515 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 9516 Instruction *LastInduction = VecInd; 9517 for (unsigned Part = 0; Part < State.UF; ++Part) { 9518 State.set(this, LastInduction, Part); 9519 9520 if (isa<TruncInst>(EntryVal)) 9521 State.addMetadata(LastInduction, EntryVal); 9522 9523 LastInduction = cast<Instruction>( 9524 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 9525 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 9526 } 9527 9528 LastInduction->setName("vec.ind.next"); 9529 VecInd->addIncoming(SteppedStart, VectorPH); 9530 // Add induction update using an incorrect block temporarily. The phi node 9531 // will be fixed after VPlan execution. Note that at this point the latch 9532 // block cannot be used, as it does not exist yet. 9533 // TODO: Model increment value in VPlan, by turning the recipe into a 9534 // multi-def and a subclass of VPHeaderPHIRecipe. 
9535 VecInd->addIncoming(LastInduction, VectorPH); 9536 } 9537 9538 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 9539 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 9540 "Not a pointer induction according to InductionDescriptor!"); 9541 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 9542 "Unexpected type."); 9543 9544 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 9545 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 9546 9547 if (onlyScalarsGenerated(State.VF)) { 9548 // This is the normalized GEP that starts counting at zero. 9549 Value *PtrInd = State.Builder.CreateSExtOrTrunc( 9550 CanonicalIV, IndDesc.getStep()->getType()); 9551 // Determine the number of scalars we need to generate for each unroll 9552 // iteration. If the instruction is uniform, we only need to generate the 9553 // first lane. Otherwise, we generate all VF values. 9554 bool IsUniform = vputils::onlyFirstLaneUsed(this); 9555 assert((IsUniform || !State.VF.isScalable()) && 9556 "Cannot scalarize a scalable VF"); 9557 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 9558 9559 for (unsigned Part = 0; Part < State.UF; ++Part) { 9560 Value *PartStart = 9561 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); 9562 9563 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 9564 Value *Idx = State.Builder.CreateAdd( 9565 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 9566 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); 9567 9568 Value *Step = CreateStepValue(IndDesc.getStep(), SE, 9569 State.CFG.PrevBB->getTerminator()); 9570 Value *SclrGep = emitTransformedIndex( 9571 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); 9572 SclrGep->setName("next.gep"); 9573 State.set(this, SclrGep, VPIteration(Part, Lane)); 9574 } 9575 } 9576 return; 9577 } 9578 9579 assert(isa<SCEVConstant>(IndDesc.getStep()) && 9580 "Induction step not a SCEV constant!"); 9581 Type *PhiType = IndDesc.getStep()->getType(); 9582 9583 // Build a pointer phi 9584 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 9585 Type *ScStValueType = ScalarStartValue->getType(); 9586 PHINode *NewPointerPhi = 9587 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 9588 9589 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9590 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 9591 9592 // A pointer induction, performed by using a gep 9593 const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout(); 9594 Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); 9595 9596 const SCEV *ScalarStep = IndDesc.getStep(); 9597 SCEVExpander Exp(SE, DL, "induction"); 9598 Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 9599 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 9600 Value *NumUnrolledElems = 9601 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 9602 Value *InductionGEP = GetElementPtrInst::Create( 9603 IndDesc.getElementType(), NewPointerPhi, 9604 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 9605 InductionLoc); 9606 // Add induction update using an incorrect block temporarily. The phi node 9607 // will be fixed after VPlan execution. Note that at this point the latch 9608 // block cannot be used, as it does not exist yet. 9609 // TODO: Model increment value in VPlan, by turning the recipe into a 9610 // multi-def and a subclass of VPHeaderPHIRecipe. 
9611 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 9612 9613 // Create UF many actual address geps that use the pointer 9614 // phi as base and a vectorized version of the step value 9615 // (<step*0, ..., step*N>) as offset. 9616 for (unsigned Part = 0; Part < State.UF; ++Part) { 9617 Type *VecPhiType = VectorType::get(PhiType, State.VF); 9618 Value *StartOffsetScalar = 9619 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 9620 Value *StartOffset = 9621 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 9622 // Create a vector of consecutive numbers from zero to VF. 9623 StartOffset = State.Builder.CreateAdd( 9624 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 9625 9626 Value *GEP = State.Builder.CreateGEP( 9627 IndDesc.getElementType(), NewPointerPhi, 9628 State.Builder.CreateMul( 9629 StartOffset, 9630 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), 9631 "vector.gep")); 9632 State.set(this, GEP, Part); 9633 } 9634 } 9635 9636 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 9637 assert(!State.Instance && "VPScalarIVStepsRecipe being replicated."); 9638 9639 // Fast-math-flags propagate from the original induction instruction. 9640 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9641 if (IndDesc.getInductionBinOp() && 9642 isa<FPMathOperator>(IndDesc.getInductionBinOp())) 9643 State.Builder.setFastMathFlags( 9644 IndDesc.getInductionBinOp()->getFastMathFlags()); 9645 9646 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9647 auto CreateScalarIV = [&](Value *&Step) -> Value * { 9648 Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0)); 9649 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); 9650 if (!isCanonical() || CanonicalIV->getType() != Ty) { 9651 ScalarIV = 9652 Ty->isIntegerTy() 9653 ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty) 9654 : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty); 9655 ScalarIV = emitTransformedIndex(State.Builder, ScalarIV, 9656 getStartValue()->getLiveInIRValue(), Step, 9657 IndDesc); 9658 ScalarIV->setName("offset.idx"); 9659 } 9660 if (TruncToTy) { 9661 assert(Step->getType()->isIntegerTy() && 9662 "Truncation requires an integer step"); 9663 ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy); 9664 Step = State.Builder.CreateTrunc(Step, TruncToTy); 9665 } 9666 return ScalarIV; 9667 }; 9668 9669 Value *ScalarIV = CreateScalarIV(Step); 9670 if (State.VF.isVector()) { 9671 buildScalarSteps(ScalarIV, Step, IndDesc, this, State); 9672 return; 9673 } 9674 9675 for (unsigned Part = 0; Part < State.UF; ++Part) { 9676 assert(!State.VF.isScalable() && "scalable vectors not yet supported."); 9677 Value *EntryPart; 9678 if (Step->getType()->isFloatingPointTy()) { 9679 Value *StartIdx = 9680 getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part); 9681 // Floating-point operations inherit FMF via the builder's flags. 
9682 Value *MulOp = State.Builder.CreateFMul(StartIdx, Step); 9683 EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(), 9684 ScalarIV, MulOp); 9685 } else { 9686 Value *StartIdx = 9687 getRuntimeVF(State.Builder, Step->getType(), State.VF * Part); 9688 EntryPart = State.Builder.CreateAdd( 9689 ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction"); 9690 } 9691 State.set(this, EntryPart, Part); 9692 } 9693 } 9694 9695 void VPBlendRecipe::execute(VPTransformState &State) { 9696 State.setDebugLocFromInst(Phi); 9697 // We know that all PHIs in non-header blocks are converted into 9698 // selects, so we don't have to worry about the insertion order and we 9699 // can just use the builder. 9700 // At this point we generate the predication tree. There may be 9701 // duplications since this is a simple recursive scan, but future 9702 // optimizations will clean it up. 9703 9704 unsigned NumIncoming = getNumIncomingValues(); 9705 9706 // Generate a sequence of selects of the form: 9707 // SELECT(Mask3, In3, 9708 // SELECT(Mask2, In2, 9709 // SELECT(Mask1, In1, 9710 // In0))) 9711 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9712 // are essentially undef are taken from In0. 9713 InnerLoopVectorizer::VectorParts Entry(State.UF); 9714 for (unsigned In = 0; In < NumIncoming; ++In) { 9715 for (unsigned Part = 0; Part < State.UF; ++Part) { 9716 // We might have single edge PHIs (blocks) - use an identity 9717 // 'select' for the first PHI operand. 9718 Value *In0 = State.get(getIncomingValue(In), Part); 9719 if (In == 0) 9720 Entry[Part] = In0; // Initialize with the first incoming value. 9721 else { 9722 // Select between the current value and the previous incoming edge 9723 // based on the incoming mask. 9724 Value *Cond = State.get(getMask(In), Part); 9725 Entry[Part] = 9726 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9727 } 9728 } 9729 } 9730 for (unsigned Part = 0; Part < State.UF; ++Part) 9731 State.set(this, Entry[Part], Part); 9732 } 9733 9734 void VPInterleaveRecipe::execute(VPTransformState &State) { 9735 assert(!State.Instance && "Interleave group being replicated."); 9736 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9737 getStoredValues(), getMask()); 9738 } 9739 9740 void VPReductionRecipe::execute(VPTransformState &State) { 9741 assert(!State.Instance && "Reduction being replicated."); 9742 Value *PrevInChain = State.get(getChainOp(), 0); 9743 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9744 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9745 // Propagate the fast-math flags carried by the underlying instruction. 
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewVecOp = State.get(getVecOp(), Part);
    if (VPValue *Cond = getCondOp()) {
      Value *NewCond = State.get(Cond, Part);
      VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
      Value *Iden = RdxDesc->getRecurrenceIdentity(
          Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
      Value *IdenVec =
          State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
      NewVecOp = Select;
    }
    Value *NewRed;
    Value *NextInChain;
    if (IsOrdered) {
      if (State.VF.isVector())
        NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
                                        PrevInChain);
      else
        NewRed = State.Builder.CreateBinOp(
            (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
            NewVecOp);
      PrevInChain = NewRed;
    } else {
      PrevInChain = State.get(getChainOp(), Part);
      NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      NextInChain = createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
                                   NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
          PrevInChain);
    State.set(this, NextInChain, Part);
  }
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
    State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
                                    IsPredicated, State);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF.isVector()) {
      // If we're constructing lane 0, initialize to start from poison.
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.set(this, Poison, State.Instance->Part);
      }
      State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ?
1 : State.VF.getKnownMinValue(); 9812 assert((!State.VF.isScalable() || IsUniform) && 9813 "Can't scalarize a scalable vector"); 9814 for (unsigned Part = 0; Part < State.UF; ++Part) 9815 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9816 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, 9817 VPIteration(Part, Lane), IsPredicated, 9818 State); 9819 } 9820 9821 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 9822 assert(State.Instance && "Branch on Mask works only on single instance."); 9823 9824 unsigned Part = State.Instance->Part; 9825 unsigned Lane = State.Instance->Lane.getKnownLane(); 9826 9827 Value *ConditionBit = nullptr; 9828 VPValue *BlockInMask = getMask(); 9829 if (BlockInMask) { 9830 ConditionBit = State.get(BlockInMask, Part); 9831 if (ConditionBit->getType()->isVectorTy()) 9832 ConditionBit = State.Builder.CreateExtractElement( 9833 ConditionBit, State.Builder.getInt32(Lane)); 9834 } else // Block in mask is all-one. 9835 ConditionBit = State.Builder.getTrue(); 9836 9837 // Replace the temporary unreachable terminator with a new conditional branch, 9838 // whose two destinations will be set later when they are created. 9839 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 9840 assert(isa<UnreachableInst>(CurrentTerminator) && 9841 "Expected to replace unreachable terminator with conditional branch."); 9842 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 9843 CondBr->setSuccessor(0, nullptr); 9844 ReplaceInstWithInst(CurrentTerminator, CondBr); 9845 } 9846 9847 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 9848 assert(State.Instance && "Predicated instruction PHI works per instance."); 9849 Instruction *ScalarPredInst = 9850 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 9851 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 9852 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 9853 assert(PredicatingBB && "Predicated block has no single predecessor."); 9854 assert(isa<VPReplicateRecipe>(getOperand(0)) && 9855 "operand must be VPReplicateRecipe"); 9856 9857 // By current pack/unpack logic we need to generate only a single phi node: if 9858 // a vector value for the predicated instruction exists at this point it means 9859 // the instruction has vector users only, and a phi for the vector value is 9860 // needed. In this case the recipe of the predicated instruction is marked to 9861 // also do that packing, thereby "hoisting" the insert-element sequence. 9862 // Otherwise, a phi node for the scalar value is needed. 9863 unsigned Part = State.Instance->Part; 9864 if (State.hasVectorValue(getOperand(0), Part)) { 9865 Value *VectorValue = State.get(getOperand(0), Part); 9866 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9867 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9868 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9869 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9870 if (State.hasVectorValue(this, Part)) 9871 State.reset(this, VPhi, Part); 9872 else 9873 State.set(this, VPhi, Part); 9874 // NOTE: Currently we need to update the value of the operand, so the next 9875 // predicated iteration inserts its generated value in the correct vector. 
9876 State.reset(getOperand(0), VPhi, Part); 9877 } else { 9878 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9879 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9880 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9881 PredicatingBB); 9882 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9883 if (State.hasScalarValue(this, *State.Instance)) 9884 State.reset(this, Phi, *State.Instance); 9885 else 9886 State.set(this, Phi, *State.Instance); 9887 // NOTE: Currently we need to update the value of the operand, so the next 9888 // predicated iteration inserts its generated value in the correct vector. 9889 State.reset(getOperand(0), Phi, *State.Instance); 9890 } 9891 } 9892 9893 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9894 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9895 9896 // Attempt to issue a wide load. 9897 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9898 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9899 9900 assert((LI || SI) && "Invalid Load/Store instruction"); 9901 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9902 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9903 9904 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9905 9906 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9907 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9908 bool CreateGatherScatter = !Consecutive; 9909 9910 auto &Builder = State.Builder; 9911 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9912 bool isMaskRequired = getMask(); 9913 if (isMaskRequired) 9914 for (unsigned Part = 0; Part < State.UF; ++Part) 9915 BlockInMaskParts[Part] = State.get(getMask(), Part); 9916 9917 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9918 // Calculate the pointer for the specific unroll-part. 9919 GetElementPtrInst *PartPtr = nullptr; 9920 9921 bool InBounds = false; 9922 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9923 InBounds = gep->isInBounds(); 9924 if (Reverse) { 9925 // If the address is consecutive but reversed, then the 9926 // wide store needs to start at the last vector element. 9927 // RunTimeVF = VScale * VF.getKnownMinValue() 9928 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9929 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 9930 // NumElt = -Part * RunTimeVF 9931 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 9932 // LastLane = 1 - RunTimeVF 9933 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 9934 PartPtr = 9935 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 9936 PartPtr->setIsInBounds(InBounds); 9937 PartPtr = cast<GetElementPtrInst>( 9938 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 9939 PartPtr->setIsInBounds(InBounds); 9940 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
9941 BlockInMaskParts[Part] = 9942 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9943 } else { 9944 Value *Increment = 9945 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 9946 PartPtr = cast<GetElementPtrInst>( 9947 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 9948 PartPtr->setIsInBounds(InBounds); 9949 } 9950 9951 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 9952 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 9953 }; 9954 9955 // Handle Stores: 9956 if (SI) { 9957 State.setDebugLocFromInst(SI); 9958 9959 for (unsigned Part = 0; Part < State.UF; ++Part) { 9960 Instruction *NewSI = nullptr; 9961 Value *StoredVal = State.get(StoredValue, Part); 9962 if (CreateGatherScatter) { 9963 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9964 Value *VectorGep = State.get(getAddr(), Part); 9965 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9966 MaskPart); 9967 } else { 9968 if (Reverse) { 9969 // If we store to reverse consecutive memory locations, then we need 9970 // to reverse the order of elements in the stored value. 9971 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9972 // We don't want to update the value in the map as it might be used in 9973 // another expression. So don't call resetVectorValue(StoredVal). 9974 } 9975 auto *VecPtr = 9976 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9977 if (isMaskRequired) 9978 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9979 BlockInMaskParts[Part]); 9980 else 9981 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9982 } 9983 State.addMetadata(NewSI, SI); 9984 } 9985 return; 9986 } 9987 9988 // Handle loads. 9989 assert(LI && "Must have a load instruction"); 9990 State.setDebugLocFromInst(LI); 9991 for (unsigned Part = 0; Part < State.UF; ++Part) { 9992 Value *NewLI; 9993 if (CreateGatherScatter) { 9994 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9995 Value *VectorGep = State.get(getAddr(), Part); 9996 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 9997 nullptr, "wide.masked.gather"); 9998 State.addMetadata(NewLI, LI); 9999 } else { 10000 auto *VecPtr = 10001 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10002 if (isMaskRequired) 10003 NewLI = Builder.CreateMaskedLoad( 10004 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 10005 PoisonValue::get(DataTy), "wide.masked.load"); 10006 else 10007 NewLI = 10008 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 10009 10010 // Add metadata to the load, but setVectorValue to the reverse shuffle. 10011 State.addMetadata(NewLI, LI); 10012 if (Reverse) 10013 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 10014 } 10015 10016 State.set(getVPSingleValue(), NewLI, Part); 10017 } 10018 } 10019 10020 // Determine how to lower the scalar epilogue, which depends on 1) optimising 10021 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 10022 // predication, and 4) a TTI hook that analyses whether the loop is suitable 10023 // for predication. 
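// For example (illustrative): a function carrying the optsize attribute takes
// branch 1) below and gets CM_ScalarEpilogueNotAllowedOptSize even if a
// vectorization pragma forces the loop, whereas explicitly setting
// PreferPredicateOverEpilogue to PredicateElseScalarEpilogue takes branch 2)
// and returns CM_ScalarEpilogueNotNeededUsePredicate before the loop hints or
// the TTI hook are consulted.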
10024 static ScalarEpilogueLowering getScalarEpilogueLowering( 10025 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 10026 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10027 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 10028 LoopVectorizationLegality &LVL) { 10029 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10030 // don't look at hints or options, and don't request a scalar epilogue. 10031 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10032 // LoopAccessInfo (due to code dependency and not being able to reliably get 10033 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10034 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10035 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10036 // back to the old way and vectorize with versioning when forced. See D81345.) 10037 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10038 PGSOQueryType::IRPass) && 10039 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10040 return CM_ScalarEpilogueNotAllowedOptSize; 10041 10042 // 2) If set, obey the directives 10043 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10044 switch (PreferPredicateOverEpilogue) { 10045 case PreferPredicateTy::ScalarEpilogue: 10046 return CM_ScalarEpilogueAllowed; 10047 case PreferPredicateTy::PredicateElseScalarEpilogue: 10048 return CM_ScalarEpilogueNotNeededUsePredicate; 10049 case PreferPredicateTy::PredicateOrDontVectorize: 10050 return CM_ScalarEpilogueNotAllowedUsePredicate; 10051 }; 10052 } 10053 10054 // 3) If set, obey the hints 10055 switch (Hints.getPredicate()) { 10056 case LoopVectorizeHints::FK_Enabled: 10057 return CM_ScalarEpilogueNotNeededUsePredicate; 10058 case LoopVectorizeHints::FK_Disabled: 10059 return CM_ScalarEpilogueAllowed; 10060 }; 10061 10062 // 4) if the TTI hook indicates this is profitable, request predication. 10063 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10064 LVL.getLAI())) 10065 return CM_ScalarEpilogueNotNeededUsePredicate; 10066 10067 return CM_ScalarEpilogueAllowed; 10068 } 10069 10070 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10071 // If Values have been set for this Def return the one relevant for \p Part. 10072 if (hasVectorValue(Def, Part)) 10073 return Data.PerPartOutput[Def][Part]; 10074 10075 if (!hasScalarValue(Def, {Part, 0})) { 10076 Value *IRV = Def->getLiveInIRValue(); 10077 Value *B = ILV->getBroadcastInstrs(IRV); 10078 set(Def, B, Part); 10079 return B; 10080 } 10081 10082 Value *ScalarValue = get(Def, {Part, 0}); 10083 // If we aren't vectorizing, we can just copy the scalar map values over 10084 // to the vector map. 10085 if (VF.isScalar()) { 10086 set(Def, ScalarValue, Part); 10087 return ScalarValue; 10088 } 10089 10090 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10091 bool IsUniform = RepR && RepR->isUniform(); 10092 10093 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 10094 // Check if there is a scalar value for the selected lane. 10095 if (!hasScalarValue(Def, {Part, LastLane})) { 10096 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 
10097 assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) || 10098 isa<VPScalarIVStepsRecipe>(Def->getDef())) && 10099 "unexpected recipe found to be invariant"); 10100 IsUniform = true; 10101 LastLane = 0; 10102 } 10103 10104 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10105 // Set the insert point after the last scalarized instruction or after the 10106 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10107 // will directly follow the scalar definitions. 10108 auto OldIP = Builder.saveIP(); 10109 auto NewIP = 10110 isa<PHINode>(LastInst) 10111 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10112 : std::next(BasicBlock::iterator(LastInst)); 10113 Builder.SetInsertPoint(&*NewIP); 10114 10115 // However, if we are vectorizing, we need to construct the vector values. 10116 // If the value is known to be uniform after vectorization, we can just 10117 // broadcast the scalar value corresponding to lane zero for each unroll 10118 // iteration. Otherwise, we construct the vector values using 10119 // insertelement instructions. Since the resulting vectors are stored in 10120 // State, we will only generate the insertelements once. 10121 Value *VectorValue = nullptr; 10122 if (IsUniform) { 10123 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10124 set(Def, VectorValue, Part); 10125 } else { 10126 // Initialize packing with insertelements to start from undef. 10127 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10128 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10129 set(Def, Undef, Part); 10130 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10131 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10132 VectorValue = get(Def, Part); 10133 } 10134 Builder.restoreIP(OldIP); 10135 return VectorValue; 10136 } 10137 10138 // Process the loop in the VPlan-native vectorization path. This path builds 10139 // VPlan upfront in the vectorization pipeline, which allows to apply 10140 // VPlan-to-VPlan transformations from the very beginning without modifying the 10141 // input LLVM IR. 10142 static bool processLoopInVPlanNativePath( 10143 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10144 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10145 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10146 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10147 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10148 LoopVectorizationRequirements &Requirements) { 10149 10150 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10151 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10152 return false; 10153 } 10154 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10155 Function *F = L->getHeader()->getParent(); 10156 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10157 10158 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10159 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10160 10161 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10162 &Hints, IAI); 10163 // Use the planner for outer loop vectorization. 10164 // TODO: CM is not used at this point inside the planner. Turn CM into an 10165 // optional argument if we don't need it in the future. 10166 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE); 10167 10168 // Get user vectorization factor. 
10169 ElementCount UserVF = Hints.getWidth(); 10170 10171 CM.collectElementTypesForWidening(); 10172 10173 // Plan how to best vectorize, return the best VF and its cost. 10174 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10175 10176 // If we are stress testing VPlan builds, do not attempt to generate vector 10177 // code. Masked vector code generation support will follow soon. 10178 // Also, do not attempt to vectorize if no vector code will be produced. 10179 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) 10180 return false; 10181 10182 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10183 10184 { 10185 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, 10186 F->getParent()->getDataLayout()); 10187 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 10188 VF.Width, 1, LVL, &CM, BFI, PSI, Checks); 10189 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10190 << L->getHeader()->getParent()->getName() << "\"\n"); 10191 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); 10192 } 10193 10194 // Mark the loop as already vectorized to avoid vectorizing again. 10195 Hints.setAlreadyVectorized(); 10196 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10197 return true; 10198 } 10199 10200 // Emit a remark if there are stores to floats that required a floating point 10201 // extension. If the vectorized loop was generated with floating point there 10202 // will be a performance penalty from the conversion overhead and the change in 10203 // the vector width. 10204 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10205 SmallVector<Instruction *, 4> Worklist; 10206 for (BasicBlock *BB : L->getBlocks()) { 10207 for (Instruction &Inst : *BB) { 10208 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10209 if (S->getValueOperand()->getType()->isFloatTy()) 10210 Worklist.push_back(S); 10211 } 10212 } 10213 } 10214 10215 // Traverse the floating point stores upwards searching, for floating point 10216 // conversions. 10217 SmallPtrSet<const Instruction *, 4> Visited; 10218 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10219 while (!Worklist.empty()) { 10220 auto *I = Worklist.pop_back_val(); 10221 if (!L->contains(I)) 10222 continue; 10223 if (!Visited.insert(I).second) 10224 continue; 10225 10226 // Emit a remark if the floating point store required a floating 10227 // point conversion. 10228 // TODO: More work could be done to identify the root cause such as a 10229 // constant or a function return type and point the user to it. 10230 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10231 ORE->emit([&]() { 10232 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10233 I->getDebugLoc(), L->getHeader()) 10234 << "floating point conversion changes vector width. " 10235 << "Mixed floating point precision requires an up/down " 10236 << "cast that will negatively impact performance."; 10237 }); 10238 10239 for (Use &Op : I->operands()) 10240 if (auto *OpI = dyn_cast<Instruction>(Op)) 10241 Worklist.push_back(OpI); 10242 } 10243 } 10244 10245 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, 10246 VectorizationFactor &VF, 10247 Optional<unsigned> VScale, Loop *L, 10248 ScalarEvolution &SE) { 10249 InstructionCost CheckCost = Checks.getCost(); 10250 if (!CheckCost.isValid()) 10251 return false; 10252 10253 // When interleaving only scalar and vector cost will be equal, which in turn 10254 // would lead to a divide by 0. Fall back to hard threshold. 
10255 if (VF.Width.isScalar()) { 10256 if (CheckCost > VectorizeMemoryCheckThreshold) { 10257 LLVM_DEBUG( 10258 dbgs() 10259 << "LV: Interleaving only is not profitable due to runtime checks\n"); 10260 return false; 10261 } 10262 return true; 10263 } 10264 10265 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated. 10266 double ScalarC = *VF.ScalarCost.getValue(); 10267 if (ScalarC == 0) 10268 return true; 10269 10270 // First, compute the minimum iteration count required so that the vector 10271 // loop outperforms the scalar loop. 10272 // The total cost of the scalar loop is 10273 // ScalarC * TC 10274 // where 10275 // * TC is the actual trip count of the loop. 10276 // * ScalarC is the cost of a single scalar iteration. 10277 // 10278 // The total cost of the vector loop is 10279 // RtC + VecC * (TC / VF) + EpiC 10280 // where 10281 // * RtC is the cost of the generated runtime checks 10282 // * VecC is the cost of a single vector iteration. 10283 // * TC is the actual trip count of the loop 10284 // * VF is the vectorization factor 10285 // * EpiCost is the cost of the generated epilogue, including the cost 10286 // of the remaining scalar operations. 10287 // 10288 // Vectorization is profitable once the total vector cost is less than the 10289 // total scalar cost: 10290 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC 10291 // 10292 // Now we can compute the minimum required trip count TC as 10293 // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC 10294 // 10295 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that 10296 // the computations are performed on doubles, not integers and the result 10297 // is rounded up, hence we get an upper estimate of the TC. 10298 unsigned IntVF = VF.Width.getKnownMinValue(); 10299 if (VF.Width.isScalable()) { 10300 unsigned AssumedMinimumVscale = 1; 10301 if (VScale) 10302 AssumedMinimumVscale = *VScale; 10303 IntVF *= AssumedMinimumVscale; 10304 } 10305 double VecCOverVF = double(*VF.Cost.getValue()) / IntVF; 10306 double RtC = *CheckCost.getValue(); 10307 double MinTC1 = RtC / (ScalarC - VecCOverVF); 10308 10309 // Second, compute a minimum iteration count so that the cost of the 10310 // runtime checks is only a fraction of the total scalar loop cost. This 10311 // adds a loop-dependent bound on the overhead incurred if the runtime 10312 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC 10313 // * TC. To bound the runtime check to be a fraction 1/X of the scalar 10314 // cost, compute 10315 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC 10316 double MinTC2 = RtC * 10 / ScalarC; 10317 10318 // Now pick the larger minimum. If it is not a multiple of VF, choose the 10319 // next closest multiple of VF. This should partly compensate for ignoring 10320 // the epilogue cost. 10321 uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2)); 10322 VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF)); 10323 10324 LLVM_DEBUG( 10325 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:" 10326 << VF.MinProfitableTripCount << "\n"); 10327 10328 // Skip vectorization if the expected trip count is less than the minimum 10329 // required trip count. 
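  // Worked example (illustrative numbers only): with ScalarC = 4, VecC = 10,
  // a fixed VF of 4 (so VecC / VF = 2.5) and RtC = 30, the first bound gives
  //   MinTC1 = 30 / (4 - 2.5) = 20
  // and the second bound gives
  //   MinTC2 = 30 * 10 / 4 = 75,
  // so the minimum profitable trip count becomes 75 rounded up to the next
  // multiple of VF, i.e. 76.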
10330 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) { 10331 if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC), 10332 VF.MinProfitableTripCount)) { 10333 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " 10334 "trip count < minimum profitable VF (" 10335 << *ExpectedTC << " < " << VF.MinProfitableTripCount 10336 << ")\n"); 10337 10338 return false; 10339 } 10340 } 10341 return true; 10342 } 10343 10344 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10345 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10346 !EnableLoopInterleaving), 10347 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10348 !EnableLoopVectorization) {} 10349 10350 bool LoopVectorizePass::processLoop(Loop *L) { 10351 assert((EnableVPlanNativePath || L->isInnermost()) && 10352 "VPlan-native path is not enabled. Only process inner loops."); 10353 10354 #ifndef NDEBUG 10355 const std::string DebugLocStr = getDebugLocString(L); 10356 #endif /* NDEBUG */ 10357 10358 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 10359 << L->getHeader()->getParent()->getName() << "' from " 10360 << DebugLocStr << "\n"); 10361 10362 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10363 10364 LLVM_DEBUG( 10365 dbgs() << "LV: Loop hints:" 10366 << " force=" 10367 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10368 ? "disabled" 10369 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10370 ? "enabled" 10371 : "?")) 10372 << " width=" << Hints.getWidth() 10373 << " interleave=" << Hints.getInterleave() << "\n"); 10374 10375 // Function containing loop 10376 Function *F = L->getHeader()->getParent(); 10377 10378 // Looking at the diagnostic output is the only way to determine if a loop 10379 // was vectorized (other than looking at the IR or machine code), so it 10380 // is important to generate an optimization remark for each loop. Most of 10381 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10382 // generated as OptimizationRemark and OptimizationRemarkMissed are 10383 // less verbose reporting vectorized loops and unvectorized loops that may 10384 // benefit from vectorization, respectively. 10385 10386 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10387 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10388 return false; 10389 } 10390 10391 PredicatedScalarEvolution PSE(*SE, *L); 10392 10393 // Check if it is legal to vectorize the loop. 10394 LoopVectorizationRequirements Requirements; 10395 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10396 &Requirements, &Hints, DB, AC, BFI, PSI); 10397 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10398 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10399 Hints.emitRemarkWithHints(); 10400 return false; 10401 } 10402 10403 // Check the function attributes and profiles to find out if this function 10404 // should be optimized for size. 10405 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10406 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10407 10408 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10409 // here. They may require CFG and instruction level transformations before 10410 // even evaluating whether vectorization is profitable. Since we cannot modify 10411 // the incoming IR, we need to build VPlan upfront in the vectorization 10412 // pipeline. 
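  // Illustrative example (assuming the VPlan-native path has been enabled via
  // -enable-vplan-native-path): in a nest such as
  //   for (i = 0; i < N; i++)   // outer loop explicitly marked for
  //     for (j = 0; j < M; j++) // vectorization
  //       A[i * M + j] += B[j];
  // the annotated i-loop is not innermost, so it is handed off to
  // processLoopInVPlanNativePath below rather than the inner-loop pipeline.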
10413 if (!L->isInnermost()) 10414 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10415 ORE, BFI, PSI, Hints, Requirements); 10416 10417 assert(L->isInnermost() && "Inner loop expected."); 10418 10419 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10420 // count by optimizing for size, to minimize overheads. 10421 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10422 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10423 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10424 << "This loop is worth vectorizing only if no scalar " 10425 << "iteration overheads are incurred."); 10426 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10427 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10428 else { 10429 LLVM_DEBUG(dbgs() << "\n"); 10430 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10431 } 10432 } 10433 10434 // Check the function attributes to see if implicit floats are allowed. 10435 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10436 // an integer loop and the vector instructions selected are purely integer 10437 // vector instructions? 10438 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10439 reportVectorizationFailure( 10440 "Can't vectorize when the NoImplicitFloat attribute is used", 10441 "loop not vectorized due to NoImplicitFloat attribute", 10442 "NoImplicitFloat", ORE, L); 10443 Hints.emitRemarkWithHints(); 10444 return false; 10445 } 10446 10447 // Check if the target supports potentially unsafe FP vectorization. 10448 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10449 // for the target we're vectorizing for, to make sure none of the 10450 // additional fp-math flags can help. 10451 if (Hints.isPotentiallyUnsafe() && 10452 TTI->isFPVectorizationPotentiallyUnsafe()) { 10453 reportVectorizationFailure( 10454 "Potentially unsafe FP op prevents vectorization", 10455 "loop not vectorized due to unsafe FP support.", 10456 "UnsafeFP", ORE, L); 10457 Hints.emitRemarkWithHints(); 10458 return false; 10459 } 10460 10461 bool AllowOrderedReductions; 10462 // If the flag is set, use that instead and override the TTI behaviour. 10463 if (ForceOrderedReductions.getNumOccurrences() > 0) 10464 AllowOrderedReductions = ForceOrderedReductions; 10465 else 10466 AllowOrderedReductions = TTI->enableOrderedReductions(); 10467 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10468 ORE->emit([&]() { 10469 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10470 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10471 ExactFPMathInst->getDebugLoc(), 10472 ExactFPMathInst->getParent()) 10473 << "loop not vectorized: cannot prove it is safe to reorder " 10474 "floating-point operations"; 10475 }); 10476 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10477 "reorder floating-point operations\n"); 10478 Hints.emitRemarkWithHints(); 10479 return false; 10480 } 10481 10482 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10483 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10484 10485 // If an override option has been passed in for interleaved accesses, use it. 10486 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10487 UseInterleaved = EnableInterleavedMemAccesses; 10488 10489 // Analyze interleaved memory accesses. 
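  // For example (illustrative): a loop reading both fields of
  //   struct { float x, y; } *P;
  // as P[i].x and P[i].y performs two stride-2 accesses that the analysis
  // below can place in one interleave group, so for VF=4 a single wide load of
  // <8 x float> plus shuffles can replace two separate gathers.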
10490 if (UseInterleaved) { 10491 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10492 } 10493 10494 // Use the cost model. 10495 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10496 F, &Hints, IAI); 10497 CM.collectValuesToIgnore(); 10498 CM.collectElementTypesForWidening(); 10499 10500 // Use the planner for vectorization. 10501 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE); 10502 10503 // Get user vectorization factor and interleave count. 10504 ElementCount UserVF = Hints.getWidth(); 10505 unsigned UserIC = Hints.getInterleave(); 10506 10507 // Plan how to best vectorize, return the best VF and its cost. 10508 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10509 10510 VectorizationFactor VF = VectorizationFactor::Disabled(); 10511 unsigned IC = 1; 10512 10513 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, 10514 F->getParent()->getDataLayout()); 10515 if (MaybeVF) { 10516 VF = *MaybeVF; 10517 // Select the interleave count. 10518 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); 10519 10520 unsigned SelectedIC = std::max(IC, UserIC); 10521 // Optimistically generate runtime checks if they are needed. Drop them if 10522 // they turn out to not be profitable. 10523 if (VF.Width.isVector() || SelectedIC > 1) 10524 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); 10525 10526 // Check if it is profitable to vectorize with runtime checks. 10527 bool ForceVectorization = 10528 Hints.getForce() == LoopVectorizeHints::FK_Enabled; 10529 if (!ForceVectorization && 10530 !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L, 10531 *PSE.getSE())) { 10532 ORE->emit([&]() { 10533 return OptimizationRemarkAnalysisAliasing( 10534 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), 10535 L->getHeader()) 10536 << "loop not vectorized: cannot prove it is safe to reorder " 10537 "memory operations"; 10538 }); 10539 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 10540 Hints.emitRemarkWithHints(); 10541 return false; 10542 } 10543 } 10544 10545 // Identify the diagnostic messages that should be produced. 10546 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10547 bool VectorizeLoop = true, InterleaveLoop = true; 10548 if (VF.Width.isScalar()) { 10549 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10550 VecDiagMsg = std::make_pair( 10551 "VectorizationNotBeneficial", 10552 "the cost-model indicates that vectorization is not beneficial"); 10553 VectorizeLoop = false; 10554 } 10555 10556 if (!MaybeVF && UserIC > 1) { 10557 // Tell the user interleaving was avoided up-front, despite being explicitly 10558 // requested. 10559 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 10560 "interleaving should be avoided up front\n"); 10561 IntDiagMsg = std::make_pair( 10562 "InterleavingAvoided", 10563 "Ignoring UserIC, because interleaving was avoided up front"); 10564 InterleaveLoop = false; 10565 } else if (IC == 1 && UserIC <= 1) { 10566 // Tell the user interleaving is not beneficial. 
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not profitable to vectorize the loop, then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);

      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
      LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is both legal and profitable to vectorize the
      // loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
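      // Illustrative shape of the result (not generated verbatim): when the
      // epilogue is vectorized as well, the original loop becomes
      //   main vector loop      (VF = VF.Width, UF = IC)
      //   epilogue vector loop  (VF = EpilogueVF.Width, UF = 1)
      //   scalar remainder loop
      // with the two vector loops produced by the two executePlan calls below
      // and stitched together via EpilogueLoopVectorizationInfo.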
10651 VectorizationFactor EpilogueVF = 10652 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10653 if (EpilogueVF.Width.isVector()) { 10654 10655 // The first pass vectorizes the main loop and creates a scalar epilogue 10656 // to be vectorized by executing the plan (potentially with a different 10657 // factor) again shortly afterwards. 10658 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10659 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10660 EPI, &LVL, &CM, BFI, PSI, Checks); 10661 10662 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10663 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10664 DT, true); 10665 ++LoopsVectorized; 10666 10667 // Second pass vectorizes the epilogue and adjusts the control flow 10668 // edges from the first pass. 10669 EPI.MainLoopVF = EPI.EpilogueVF; 10670 EPI.MainLoopUF = EPI.EpilogueUF; 10671 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10672 ORE, EPI, &LVL, &CM, BFI, PSI, 10673 Checks); 10674 10675 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10676 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); 10677 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); 10678 Header->setName("vec.epilog.vector.body"); 10679 10680 // Ensure that the start values for any VPReductionPHIRecipes are 10681 // updated before vectorising the epilogue loop. 10682 for (VPRecipeBase &R : Header->phis()) { 10683 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10684 if (auto *Resume = MainILV.getReductionResumeValue( 10685 ReductionPhi->getRecurrenceDescriptor())) { 10686 VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume); 10687 ReductionPhi->setOperand(0, StartVal); 10688 } 10689 } 10690 } 10691 10692 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10693 DT, true); 10694 ++LoopsEpilogueVectorized; 10695 10696 if (!MainILV.areSafetyChecksAdded()) 10697 DisableRuntimeUnroll = true; 10698 } else { 10699 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 10700 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, 10701 PSI, Checks); 10702 10703 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10704 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); 10705 ++LoopsVectorized; 10706 10707 // Add metadata to disable runtime unrolling a scalar loop when there 10708 // are no runtime checks about strides and memory. A scalar loop that is 10709 // rarely used is not worth unrolling. 10710 if (!LB.areSafetyChecksAdded()) 10711 DisableRuntimeUnroll = true; 10712 } 10713 // Report the vectorization decision. 10714 ORE->emit([&]() { 10715 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10716 L->getHeader()) 10717 << "vectorized loop (vectorization width: " 10718 << NV("VectorizationFactor", VF.Width) 10719 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10720 }); 10721 } 10722 10723 if (ORE->allowExtraAnalysis(LV_NAME)) 10724 checkMixedPrecision(L, ORE); 10725 } 10726 10727 Optional<MDNode *> RemainderLoopID = 10728 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10729 LLVMLoopVectorizeFollowupEpilogue}); 10730 if (RemainderLoopID) { 10731 L->setLoopID(RemainderLoopID.getValue()); 10732 } else { 10733 if (DisableRuntimeUnroll) 10734 AddRuntimeUnrollDisableMetaData(L); 10735 10736 // Mark the loop as already vectorized to avoid vectorizing again. 
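    // Typically this attaches loop metadata along the lines of
    //   !llvm.loop !0
    //   !0 = distinct !{!0, !1}
    //   !1 = !{!"llvm.loop.isvectorized", i32 1}
    // (illustrative; the exact nodes come from LoopVectorizeHints), which a
    // later run of the vectorizer recognizes and skips.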
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  // There are no loops in the function. Return before computing other
  // expensive analyses.
  if (LI.empty())
    return PreservedAnalyses::all();
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
                                      TLI, TTI, nullptr, nullptr, nullptr};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve LoopInfo/DominatorTree analyses with outer
  // loop vectorization. Until this is addressed, mark these analyses as
  // preserved only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
}

void LoopVectorizePass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
      OS, MapClassName2PassName);

  OS << "<";
  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
  OS << ">";
}
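// Illustrative output of the printPipeline hook above, assuming both options
// are left at their defaults: the pass would print roughly
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>
// with the "no-" prefix dropped for an option that was requested.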