//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
// Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
// Data for SIMD
//
// Other ideas/concepts are from:
// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
// Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
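// Illustrative sketch (not part of the original sources, simplified): for a
// loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 42;
//
// the widened body conceptually operates on whole vectors, e.g. with VF = 4
// the induction variable advances by 4 per iteration:
//
//   %wide.b    = load <4 x i32>, ptr %b.addr
//   %wide.add  = add <4 x i32> %wide.b, <i32 42, i32 42, i32 42, i32 42>
//   store <4 x i32> %wide.add, ptr %a.addr
//   %index.next = add i64 %index, 4
//
// The exact IR produced depends on the target, the chosen VF/UF and the cost
// model decisions described below.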

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
    "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks"));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector
// body and predicate the instructions accordingly.
// If tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if "
                          "tail-folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));
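// Illustrative sketch of tail folding (assumed IR shapes, for exposition
// only): instead of emitting a scalar epilogue for the final iterations, the
// loop body executes under a per-lane mask, e.g. for VF = 4 and trip count %n:
//
//   %mask = icmp ult <4 x i64> %vec.iv, %broadcast.n
//   %val  = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %p, i32 4,
//                                                     <4 x i1> %mask,
//                                                     <4 x i32> poison)
//   ...
//   call void @llvm.masked.store.v4i32.p0(<4 x i32> %res, ptr %q, i32 4,
//                                         <4 x i1> %mask)
//
// No scalar remainder loop is needed; the last vector iteration simply has
// some lanes disabled.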
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));
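// Illustrative sketch (assumed IR, for exposition only): an in-order (strict)
// FP reduction keeps the original evaluation order by folding each vector of
// inputs into the running scalar accumulator inside the loop, e.g.
//
//   %sum.next = call float @llvm.vector.reduce.fadd.v4f32(float %sum,
//                                                         <4 x float> %wide)
//
// whereas a reassociated (unordered) reduction keeps a vector accumulator in
// the loop and reduces it to a scalar only once, after the loop. The former is
// what the ordered-reduction options above enable.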
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}
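// Illustrative example (not from the original sources): for
//
//   for (int i = 0; i < 100; ++i) ...
//
// getSmallBestKnownTC returns the exact count 100. If the exact count is
// unknown but profiling recorded roughly 8 iterations on average, the
// estimate 8 is returned instead; callers can then treat that estimate the
// same way as a constant trip count, e.g. when applying the tiny-trip-count
// thresholds controlled by the options above.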
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      ElementCount MinProfitableTripCount,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);

    if (MinProfitableTripCount.isZero())
      this->MinProfitableTripCount = VecWidth;
    else
      this->MinProfitableTripCount = MinProfitableTripCount;
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops.
  virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;
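  // Illustrative example (not from the original sources): with VF = 4 and
  // UF = 2, a single scalar
  //
  //   %add = add i32 %a, %b
  //
  // is represented by two vector values in the unrolled vector loop:
  //
  //   %add.0 = add <4 x i32> %a.0, %b.0
  //   %add.1 = add <4 x i32> %a.1, %b.1
  //
  // i.e. VectorParts holds one Value* per unrolled part.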
  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p RepRecipe instead
  /// of \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Fix the non-induction PHIs in \p Plan.
  void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);
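  // Illustrative example (not from the original sources): broadcasting a loop
  // invariant %x for VF = 4 produces a splat such as
  //
  //   %ins = insertelement <4 x i32> poison, i32 %x, i64 0
  //   %bc  = shufflevector <4 x i32> %ins, <4 x i32> poison,
  //                        <4 x i32> zeroinitializer
  //
  // while the induction value %i is extended to the per-lane values
  // <%i, %i+1, %i+2, %i+3>, since each vector lane corresponds to one
  // original scalar iteration.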
  // Returns the resume value (bc.merge.rdx) for a reduction as
  // generated by fixReduction.
  PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *VectorTripCount, Value *EndValue,
                    BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
                    VPlan &Plan);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
                               VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(BasicBlock *InsertBlock);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (eg. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID);

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  ///  * Contribute to the address computation of a recipe generating a widen
  ///    memory load/store (VPWidenMemoryInstructionRecipe or
  ///    VPInterleaveRecipe).
  ///  * Such a widen memory load/store has at least one underlying Instruction
  ///    that is in a basic block that needs predication and after
  ///    vectorization the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  ElementCount MinProfitableTripCount;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;
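  // Illustrative arithmetic (assuming a fixed VF and no tail folding): with
  // TripCount = 1003, VF = 4 and UF = 2 the widened loop covers
  //   VectorTripCount = 1003 - (1003 % 8) = 1000
  // iterations, and the remaining 3 iterations are executed by the scalar
  // epilogue loop.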
  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1),
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
                            CM, BFI, PSI, Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *>
  createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }
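  // Illustrative control-flow sketch (simplified; the real skeleton also
  // contains the runtime-check and bypass blocks):
  //
  //   iteration-count checks
  //           |
  //           v
  //   main vector loop (MainLoopVF x MainLoopUF)
  //           |
  //           v
  //   epilogue vector loop (EpilogueVF x EpilogueUF)
  //           |
  //           v
  //   scalar remainder loop
  //
  // with bypass edges that skip a loop whose work is already done. The first
  // pass builds everything up to and including the main vector loop; the
  // second pass reuses that skeleton and fills in the epilogue.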
  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}
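// Illustrative example (not from the original sources): for a fixed VF = 4
// and Step = 2, createStepForVF simply returns the constant 8; for a scalable
// VF = <vscale x 4> it returns "8 * vscale" via IRBuilderBase::CreateVScale,
// so the value is only known at runtime.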
/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec) ||
          isa<VPActiveLaneMaskPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction &UnderlyingInstr = WidenRec->getIngredient();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        if (AddrDef && WidenRec->isConsecutive() &&
            Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}

PHINode *InnerLoopVectorizer::getReductionResumeValue(
    const RecurrenceDescriptor &RdxDesc) {
  auto It = ReductionResumeValues.find(&RdxDesc);
  assert(It != ReductionResumeValues.end() &&
         "Expected to find a resume value for the reduction.");
  return It->second;
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form it takes after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }
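  // Illustrative example for the MinBWs map above (not from the original
  // sources): in
  //
  //   %l = load i8, ptr %p
  //   %e = zext i8 %l to i32
  //   %a = add i32 %e, 7
  //   %t = trunc i32 %a to i8
  //   store i8 %t, ptr %q
  //
  // the add only ever feeds an i8 result, so MinBWs can record a minimal
  // bitwidth of 8 for it and the vectorized add can be emitted on <VF x i8>
  // instead of <VF x i32>.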
  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }
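  // Illustrative examples for the InstWidening decisions above (not from the
  // original sources):
  //   a[i]     - consecutive, stride +1 -> CM_Widen (a single wide load/store)
  //   a[n - i] - consecutive, stride -1 -> CM_Widen_Reverse (wide access plus
  //                                        a reverse shuffle)
  //   a[b[i]]  - non-consecutive        -> CM_GatherScatter or CM_Scalarize,
  //                                        whichever the cost model prefers
  //   members of an interleave group    -> CM_Interleave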
1351 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1352 assert(VF.isVector() && "Expected VF >=2"); 1353 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1354 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1355 "The cost is not calculated"); 1356 return WideningDecisions[InstOnVF].second; 1357 } 1358 1359 /// Return True if instruction \p I is an optimizable truncate whose operand 1360 /// is an induction variable. Such a truncate will be removed by adding a new 1361 /// induction variable with the destination type. 1362 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1363 // If the instruction is not a truncate, return false. 1364 auto *Trunc = dyn_cast<TruncInst>(I); 1365 if (!Trunc) 1366 return false; 1367 1368 // Get the source and destination types of the truncate. 1369 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1370 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1371 1372 // If the truncate is free for the given types, return false. Replacing a 1373 // free truncate with an induction variable would add an induction variable 1374 // update instruction to each iteration of the loop. We exclude from this 1375 // check the primary induction variable since it will need an update 1376 // instruction regardless. 1377 Value *Op = Trunc->getOperand(0); 1378 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1379 return false; 1380 1381 // If the truncated value is not an induction variable, return false. 1382 return Legal->isInductionPhi(Op); 1383 } 1384 1385 /// Collects the instructions to scalarize for each predicated instruction in 1386 /// the loop. 1387 void collectInstsToScalarize(ElementCount VF); 1388 1389 /// Collect Uniform and Scalar values for the given \p VF. 1390 /// The sets depend on CM decision for Load/Store instructions 1391 /// that may be vectorized as interleave, gather-scatter or scalarized. 1392 void collectUniformsAndScalars(ElementCount VF) { 1393 // Do the analysis once. 1394 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1395 return; 1396 setCostBasedWideningDecision(VF); 1397 collectLoopUniforms(VF); 1398 collectLoopScalars(VF); 1399 } 1400 1401 /// Returns true if the target machine supports masked store operation 1402 /// for the given \p DataType and kind of access to \p Ptr. 1403 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1404 return Legal->isConsecutivePtr(DataType, Ptr) && 1405 TTI.isLegalMaskedStore(DataType, Alignment); 1406 } 1407 1408 /// Returns true if the target machine supports masked load operation 1409 /// for the given \p DataType and kind of access to \p Ptr. 1410 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1411 return Legal->isConsecutivePtr(DataType, Ptr) && 1412 TTI.isLegalMaskedLoad(DataType, Alignment); 1413 } 1414 1415 /// Returns true if the target machine can represent \p V as a masked gather 1416 /// or scatter operation. 
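  /// For example (illustrative), for an i32 load with VF = 4 this queries
  /// TTI::isLegalMaskedGather on <4 x i32> with the access's alignment, and
  /// for a store it queries TTI::isLegalMaskedScatter; values that are not
  /// loads or stores trivially return false.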
1417 bool isLegalGatherOrScatter(Value *V, 1418 ElementCount VF = ElementCount::getFixed(1)) { 1419 bool LI = isa<LoadInst>(V); 1420 bool SI = isa<StoreInst>(V); 1421 if (!LI && !SI) 1422 return false; 1423 auto *Ty = getLoadStoreType(V); 1424 Align Align = getLoadStoreAlignment(V); 1425 if (VF.isVector()) 1426 Ty = VectorType::get(Ty, VF); 1427 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1428 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1429 } 1430 1431 /// Returns true if the target machine supports all of the reduction 1432 /// variables found for the given VF. 1433 bool canVectorizeReductions(ElementCount VF) const { 1434 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1435 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1436 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1437 })); 1438 } 1439 1440 /// Returns true if \p I is an instruction that will be scalarized with 1441 /// predication when vectorizing \p I with vectorization factor \p VF. Such 1442 /// instructions include conditional stores and instructions that may divide 1443 /// by zero. 1444 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1445 1446 // Returns true if \p I is an instruction that will be predicated either 1447 // through scalar predication or masked load/store or masked gather/scatter. 1448 // \p VF is the vectorization factor that will be used to vectorize \p I. 1449 // Superset of instructions that return true for isScalarWithPredication. 1450 bool isPredicatedInst(Instruction *I, ElementCount VF, 1451 bool IsKnownUniform = false) { 1452 // When we know the load is uniform and the original scalar loop was not 1453 // predicated we don't need to mark it as a predicated instruction. Any 1454 // vectorised blocks created when tail-folding are something artificial we 1455 // have introduced and we know there is always at least one active lane. 1456 // That's why we call Legal->blockNeedsPredication here because it doesn't 1457 // query tail-folding. 1458 if (IsKnownUniform && isa<LoadInst>(I) && 1459 !Legal->blockNeedsPredication(I->getParent())) 1460 return false; 1461 if (!blockNeedsPredicationForAnyReason(I->getParent())) 1462 return false; 1463 // Loads and stores that need some form of masked operation are predicated 1464 // instructions. 1465 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1466 return Legal->isMaskRequired(I); 1467 return isScalarWithPredication(I, VF); 1468 } 1469 1470 /// Returns true if \p I is a memory instruction with consecutive memory 1471 /// access that can be widened. 1472 bool 1473 memoryInstructionCanBeWidened(Instruction *I, 1474 ElementCount VF = ElementCount::getFixed(1)); 1475 1476 /// Returns true if \p I is a memory instruction in an interleaved-group 1477 /// of memory accesses that can be vectorized with wide vector loads/stores 1478 /// and shuffles. 1479 bool 1480 interleavedAccessCanBeWidened(Instruction *I, 1481 ElementCount VF = ElementCount::getFixed(1)); 1482 1483 /// Check if \p Instr belongs to any interleaved access group. 1484 bool isAccessInterleaved(Instruction *Instr) { 1485 return InterleaveInfo.isInterleaved(Instr); 1486 } 1487 1488 /// Get the interleaved access group that \p Instr belongs to. 1489 const InterleaveGroup<Instruction> * 1490 getInterleavedAccessGroup(Instruction *Instr) { 1491 return InterleaveInfo.getInterleaveGroup(Instr); 1492 } 1493 1494 /// Returns true if we're required to use a scalar epilogue for at least 1495 /// the final iteration of the original loop. 
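  /// This is the case, for example, when the loop may exit from a block other
  /// than the latch, or (illustrative) when a factor-2 interleave group only
  /// accesses A[2*i]: the gap at A[2*i+1] means the last wide access could
  /// otherwise touch memory beyond what the original scalar loop accesses.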
  bool requiresScalarEpilogue(ElementCount VF) const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. it has not been ruled
  /// out by optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  /// Returns true if we're tail-folding and want to use the active lane mask
  /// for vector loop control flow.
  bool useActiveLaneMaskForControlFlow() const {
    return FoldTailByMasking &&
           TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
  }

  /// Returns true if the instructions in this block require predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
  bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

  /// Convenience function that returns the value of vscale_range if
  /// vscale_range.min == vscale_range.max, and otherwise returns the value
  /// returned by the corresponding TTI method.
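  /// For example (illustrative), under a vscale_range(2,2) function attribute
  /// this returns 2, while under vscale_range(1,16) it falls back to
  /// TTI::getVScaleForTuning().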
1573 Optional<unsigned> getVScaleForTuning() const; 1574 1575 private: 1576 unsigned NumPredStores = 0; 1577 1578 /// \return An upper bound for the vectorization factors for both 1579 /// fixed and scalable vectorization, where the minimum-known number of 1580 /// elements is a power-of-2 larger than zero. If scalable vectorization is 1581 /// disabled or unsupported, then the scalable part will be equal to 1582 /// ElementCount::getScalable(0). 1583 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, 1584 ElementCount UserVF, 1585 bool FoldTailByMasking); 1586 1587 /// \return the maximized element count based on the targets vector 1588 /// registers and the loop trip-count, but limited to a maximum safe VF. 1589 /// This is a helper function of computeFeasibleMaxVF. 1590 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, 1591 unsigned SmallestType, 1592 unsigned WidestType, 1593 ElementCount MaxSafeVF, 1594 bool FoldTailByMasking); 1595 1596 /// \return the maximum legal scalable VF, based on the safe max number 1597 /// of elements. 1598 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1599 1600 /// The vectorization cost is a combination of the cost itself and a boolean 1601 /// indicating whether any of the contributing operations will actually 1602 /// operate on vector values after type legalization in the backend. If this 1603 /// latter value is false, then all operations will be scalarized (i.e. no 1604 /// vectorization has actually taken place). 1605 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1606 1607 /// Returns the expected execution cost. The unit of the cost does 1608 /// not matter because we use the 'cost' units to compare different 1609 /// vector widths. The cost that is returned is *not* normalized by 1610 /// the factor width. If \p Invalid is not nullptr, this function 1611 /// will add a pair(Instruction*, ElementCount) to \p Invalid for 1612 /// each instruction that has an Invalid cost for the given VF. 1613 using InstructionVFPair = std::pair<Instruction *, ElementCount>; 1614 VectorizationCostTy 1615 expectedCost(ElementCount VF, 1616 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); 1617 1618 /// Returns the execution time cost of an instruction for a given vector 1619 /// width. Vector width of one means scalar. 1620 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1621 1622 /// The cost-computation logic from getInstructionCost which provides 1623 /// the vector type as an output parameter. 1624 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1625 Type *&VectorTy); 1626 1627 /// Return the cost of instructions in an inloop reduction pattern, if I is 1628 /// part of that pattern. 1629 Optional<InstructionCost> 1630 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, 1631 TTI::TargetCostKind CostKind); 1632 1633 /// Calculate vectorization cost of memory instruction \p I. 1634 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1635 1636 /// The cost computation for scalarized memory instruction. 1637 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1638 1639 /// The cost computation for interleaving group of memory instructions. 1640 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1641 1642 /// The cost computation for Gather/Scatter instruction. 
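  /// Illustrative example: for an access like x = A[B[i]] this models a gather
  /// of VF elements through a vector of pointers derived from the widened
  /// values of B[i].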
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with a uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
1710 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1711 1712 /// PHINodes of the reductions that should be expanded in-loop along with 1713 /// their associated chains of reduction operations, in program order from top 1714 /// (PHI) to bottom 1715 ReductionChainMap InLoopReductionChains; 1716 1717 /// A Map of inloop reduction operations and their immediate chain operand. 1718 /// FIXME: This can be removed once reductions can be costed correctly in 1719 /// vplan. This was added to allow quick lookup to the inloop operations, 1720 /// without having to loop through InLoopReductionChains. 1721 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1722 1723 /// Returns the expected difference in cost from scalarizing the expression 1724 /// feeding a predicated instruction \p PredInst. The instructions to 1725 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1726 /// non-negative return value implies the expression will be scalarized. 1727 /// Currently, only single-use chains are considered for scalarization. 1728 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1729 ElementCount VF); 1730 1731 /// Collect the instructions that are uniform after vectorization. An 1732 /// instruction is uniform if we represent it with a single scalar value in 1733 /// the vectorized loop corresponding to each vector iteration. Examples of 1734 /// uniform instructions include pointer operands of consecutive or 1735 /// interleaved memory accesses. Note that although uniformity implies an 1736 /// instruction will be scalar, the reverse is not true. In general, a 1737 /// scalarized instruction will be represented by VF scalar values in the 1738 /// vectorized loop, each corresponding to an iteration of the original 1739 /// scalar loop. 1740 void collectLoopUniforms(ElementCount VF); 1741 1742 /// Collect the instructions that are scalar after vectorization. An 1743 /// instruction is scalar if it is known to be uniform or will be scalarized 1744 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1745 /// to the list if they are used by a load/store instruction that is marked as 1746 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1747 /// VF values in the vectorized loop, each corresponding to an iteration of 1748 /// the original scalar loop. 1749 void collectLoopScalars(ElementCount VF); 1750 1751 /// Keeps cost model vectorization decision and cost for instructions. 1752 /// Right now it is used for memory instructions only. 1753 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1754 std::pair<InstWidening, InstructionCost>>; 1755 1756 DecisionList WideningDecisions; 1757 1758 /// Returns true if \p V is expected to be vectorized and it needs to be 1759 /// extracted. 1760 bool needsExtract(Value *V, ElementCount VF) const { 1761 Instruction *I = dyn_cast<Instruction>(V); 1762 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1763 TheLoop->isLoopInvariant(I)) 1764 return false; 1765 1766 // Assume we can vectorize V (and hence we need extraction) if the 1767 // scalars are not computed yet. This can happen, because it is called 1768 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1769 // the scalars are collected. That should be a safe assumption in most 1770 // cases, because we check if the operands have vectorizable types 1771 // beforehand in LoopVectorizationLegality. 
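    // Illustrative example: if a call is scalarized while its operand is
    // widened, every scalarized lane needs an extractelement from the widened
    // operand; this predicate is what feeds that extraction overhead into
    // getScalarizationOverhead.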
1772 return Scalars.find(VF) == Scalars.end() || 1773 !isScalarAfterVectorization(I, VF); 1774 }; 1775 1776 /// Returns a range containing only operands needing to be extracted. 1777 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1778 ElementCount VF) const { 1779 return SmallVector<Value *, 4>(make_filter_range( 1780 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1781 } 1782 1783 /// Determines if we have the infrastructure to vectorize loop \p L and its 1784 /// epilogue, assuming the main loop is vectorized by \p VF. 1785 bool isCandidateForEpilogueVectorization(const Loop &L, 1786 const ElementCount VF) const; 1787 1788 /// Returns true if epilogue vectorization is considered profitable, and 1789 /// false otherwise. 1790 /// \p VF is the vectorization factor chosen for the original loop. 1791 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1792 1793 public: 1794 /// The loop that we evaluate. 1795 Loop *TheLoop; 1796 1797 /// Predicated scalar evolution analysis. 1798 PredicatedScalarEvolution &PSE; 1799 1800 /// Loop Info analysis. 1801 LoopInfo *LI; 1802 1803 /// Vectorization legality. 1804 LoopVectorizationLegality *Legal; 1805 1806 /// Vector target information. 1807 const TargetTransformInfo &TTI; 1808 1809 /// Target Library Info. 1810 const TargetLibraryInfo *TLI; 1811 1812 /// Demanded bits analysis. 1813 DemandedBits *DB; 1814 1815 /// Assumption cache. 1816 AssumptionCache *AC; 1817 1818 /// Interface to emit optimization remarks. 1819 OptimizationRemarkEmitter *ORE; 1820 1821 const Function *TheFunction; 1822 1823 /// Loop Vectorize Hint. 1824 const LoopVectorizeHints *Hints; 1825 1826 /// The interleave access information contains groups of interleaved accesses 1827 /// with the same stride and close to each other. 1828 InterleavedAccessInfo &InterleaveInfo; 1829 1830 /// Values to ignore in the cost model. 1831 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1832 1833 /// Values to ignore in the cost model when VF > 1. 1834 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1835 1836 /// All element types found in the loop. 1837 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1838 1839 /// Profitable vector factors. 1840 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1841 }; 1842 } // end namespace llvm 1843 1844 /// Helper struct to manage generating runtime checks for vectorization. 1845 /// 1846 /// The runtime checks are created up-front in temporary blocks to allow better 1847 /// estimating the cost and un-linked from the existing IR. After deciding to 1848 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1849 /// temporary blocks are completely removed. 1850 class GeneratedRTChecks { 1851 /// Basic block which contains the generated SCEV checks, if any. 1852 BasicBlock *SCEVCheckBlock = nullptr; 1853 1854 /// The value representing the result of the generated SCEV checks. If it is 1855 /// nullptr, either no SCEV checks have been generated or they have been used. 1856 Value *SCEVCheckCond = nullptr; 1857 1858 /// Basic block which contains the generated memory runtime checks, if any. 1859 BasicBlock *MemCheckBlock = nullptr; 1860 1861 /// The value representing the result of the generated memory runtime checks. 1862 /// If it is nullptr, either no memory runtime checks have been generated or 1863 /// they have been used. 
1864 Value *MemRuntimeCheckCond = nullptr; 1865 1866 DominatorTree *DT; 1867 LoopInfo *LI; 1868 TargetTransformInfo *TTI; 1869 1870 SCEVExpander SCEVExp; 1871 SCEVExpander MemCheckExp; 1872 1873 bool CostTooHigh = false; 1874 1875 public: 1876 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1877 TargetTransformInfo *TTI, const DataLayout &DL) 1878 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"), 1879 MemCheckExp(SE, DL, "scev.check") {} 1880 1881 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1882 /// accurately estimate the cost of the runtime checks. The blocks are 1883 /// un-linked from the IR and is added back during vector code generation. If 1884 /// there is no vector code generation, the check blocks are removed 1885 /// completely. 1886 void Create(Loop *L, const LoopAccessInfo &LAI, 1887 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { 1888 1889 // Hard cutoff to limit compile-time increase in case a very large number of 1890 // runtime checks needs to be generated. 1891 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to 1892 // profile info. 1893 CostTooHigh = 1894 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; 1895 if (CostTooHigh) 1896 return; 1897 1898 BasicBlock *LoopHeader = L->getHeader(); 1899 BasicBlock *Preheader = L->getLoopPreheader(); 1900 1901 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1902 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1903 // may be used by SCEVExpander. The blocks will be un-linked from their 1904 // predecessors and removed from LI & DT at the end of the function. 1905 if (!UnionPred.isAlwaysTrue()) { 1906 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1907 nullptr, "vector.scevcheck"); 1908 1909 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1910 &UnionPred, SCEVCheckBlock->getTerminator()); 1911 } 1912 1913 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1914 if (RtPtrChecking.Need) { 1915 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1916 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1917 "vector.memcheck"); 1918 1919 auto DiffChecks = RtPtrChecking.getDiffChecks(); 1920 if (DiffChecks) { 1921 MemRuntimeCheckCond = addDiffRuntimeChecks( 1922 MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp, 1923 [VF](IRBuilderBase &B, unsigned Bits) { 1924 return getRuntimeVF(B, B.getIntNTy(Bits), VF); 1925 }, 1926 IC); 1927 } else { 1928 MemRuntimeCheckCond = 1929 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1930 RtPtrChecking.getChecks(), MemCheckExp); 1931 } 1932 assert(MemRuntimeCheckCond && 1933 "no RT checks generated although RtPtrChecking " 1934 "claimed checks are required"); 1935 } 1936 1937 if (!MemCheckBlock && !SCEVCheckBlock) 1938 return; 1939 1940 // Unhook the temporary block with the checks, update various places 1941 // accordingly. 
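    // After unhooking, the CFG is (roughly, for illustration):
    //   preheader -> loop header            (as before the SplitBlock calls)
    //   vector.scevcheck -> unreachable     (parked, removed from LI/DT)
    //   vector.memcheck  -> unreachable     (parked, removed from LI/DT)
    // The parked blocks are only wired back in by emitSCEVChecks() /
    // emitMemRuntimeChecks() if we decide to vectorize; otherwise the
    // destructor deletes them.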
1942 if (SCEVCheckBlock) 1943 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1944 if (MemCheckBlock) 1945 MemCheckBlock->replaceAllUsesWith(Preheader); 1946 1947 if (SCEVCheckBlock) { 1948 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1949 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1950 Preheader->getTerminator()->eraseFromParent(); 1951 } 1952 if (MemCheckBlock) { 1953 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1954 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1955 Preheader->getTerminator()->eraseFromParent(); 1956 } 1957 1958 DT->changeImmediateDominator(LoopHeader, Preheader); 1959 if (MemCheckBlock) { 1960 DT->eraseNode(MemCheckBlock); 1961 LI->removeBlock(MemCheckBlock); 1962 } 1963 if (SCEVCheckBlock) { 1964 DT->eraseNode(SCEVCheckBlock); 1965 LI->removeBlock(SCEVCheckBlock); 1966 } 1967 } 1968 1969 InstructionCost getCost() { 1970 if (SCEVCheckBlock || MemCheckBlock) 1971 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n"); 1972 1973 if (CostTooHigh) { 1974 InstructionCost Cost; 1975 Cost.setInvalid(); 1976 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n"); 1977 return Cost; 1978 } 1979 1980 InstructionCost RTCheckCost = 0; 1981 if (SCEVCheckBlock) 1982 for (Instruction &I : *SCEVCheckBlock) { 1983 if (SCEVCheckBlock->getTerminator() == &I) 1984 continue; 1985 InstructionCost C = 1986 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 1987 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 1988 RTCheckCost += C; 1989 } 1990 if (MemCheckBlock) 1991 for (Instruction &I : *MemCheckBlock) { 1992 if (MemCheckBlock->getTerminator() == &I) 1993 continue; 1994 InstructionCost C = 1995 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 1996 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 1997 RTCheckCost += C; 1998 } 1999 2000 if (SCEVCheckBlock || MemCheckBlock) 2001 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost 2002 << "\n"); 2003 2004 return RTCheckCost; 2005 } 2006 2007 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2008 /// unused. 2009 ~GeneratedRTChecks() { 2010 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2011 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2012 if (!SCEVCheckCond) 2013 SCEVCleaner.markResultUsed(); 2014 2015 if (!MemRuntimeCheckCond) 2016 MemCheckCleaner.markResultUsed(); 2017 2018 if (MemRuntimeCheckCond) { 2019 auto &SE = *MemCheckExp.getSE(); 2020 // Memory runtime check generation creates compares that use expanded 2021 // values. Remove them before running the SCEVExpanderCleaners. 2022 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2023 if (MemCheckExp.isInsertedInstruction(&I)) 2024 continue; 2025 SE.forgetValue(&I); 2026 I.eraseFromParent(); 2027 } 2028 } 2029 MemCheckCleaner.cleanup(); 2030 SCEVCleaner.cleanup(); 2031 2032 if (SCEVCheckCond) 2033 SCEVCheckBlock->eraseFromParent(); 2034 if (MemRuntimeCheckCond) 2035 MemCheckBlock->eraseFromParent(); 2036 } 2037 2038 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2039 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2040 /// depending on the generated condition. 2041 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2042 BasicBlock *LoopVectorPreHeader, 2043 BasicBlock *LoopExitBlock) { 2044 if (!SCEVCheckCond) 2045 return nullptr; 2046 2047 Value *Cond = SCEVCheckCond; 2048 // Mark the check as used, to prevent it from being removed during cleanup. 
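    // (Clearing SCEVCheckCond is what marks it as used: the destructor only
    // erases SCEVCheckBlock, and lets the SCEVExpanderCleaner undo the
    // expansion, when the condition is still set.)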
2049 SCEVCheckCond = nullptr; 2050 if (auto *C = dyn_cast<ConstantInt>(Cond)) 2051 if (C->isZero()) 2052 return nullptr; 2053 2054 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2055 2056 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2057 // Create new preheader for vector loop. 2058 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2059 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2060 2061 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2062 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2063 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2064 SCEVCheckBlock); 2065 2066 DT->addNewBlock(SCEVCheckBlock, Pred); 2067 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2068 2069 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), 2070 BranchInst::Create(Bypass, LoopVectorPreHeader, Cond)); 2071 return SCEVCheckBlock; 2072 } 2073 2074 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2075 /// the branches to branch to the vector preheader or \p Bypass, depending on 2076 /// the generated condition. 2077 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2078 BasicBlock *LoopVectorPreHeader) { 2079 // Check if we generated code that checks in runtime if arrays overlap. 2080 if (!MemRuntimeCheckCond) 2081 return nullptr; 2082 2083 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2084 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2085 MemCheckBlock); 2086 2087 DT->addNewBlock(MemCheckBlock, Pred); 2088 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2089 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2090 2091 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2092 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2093 2094 ReplaceInstWithInst( 2095 MemCheckBlock->getTerminator(), 2096 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2097 MemCheckBlock->getTerminator()->setDebugLoc( 2098 Pred->getTerminator()->getDebugLoc()); 2099 2100 // Mark the check as used, to prevent it from being removed during cleanup. 2101 MemRuntimeCheckCond = nullptr; 2102 return MemCheckBlock; 2103 } 2104 }; 2105 2106 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2107 // vectorization. The loop needs to be annotated with #pragma omp simd 2108 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2109 // vector length information is not provided, vectorization is not considered 2110 // explicit. Interleave hints are not allowed either. These limitations will be 2111 // relaxed in the future. 2112 // Please, note that we are currently forced to abuse the pragma 'clang 2113 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2114 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2115 // provides *explicit vectorization hints* (LV can bypass legal checks and 2116 // assume that vectorization is legal). However, both hints are implemented 2117 // using the same metadata (llvm.loop.vectorize, processed by 2118 // LoopVectorizeHints). This will be fixed in the future when the native IR 2119 // representation for pragma 'omp simd' is introduced. 2120 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2121 OptimizationRemarkEmitter *ORE) { 2122 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2123 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2124 2125 // Only outer loops with an explicit vectorization hint are supported. 
2126 // Unannotated outer loops are ignored. 2127 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2128 return false; 2129 2130 Function *Fn = OuterLp->getHeader()->getParent(); 2131 if (!Hints.allowVectorization(Fn, OuterLp, 2132 true /*VectorizeOnlyWhenForced*/)) { 2133 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2134 return false; 2135 } 2136 2137 if (Hints.getInterleave() > 1) { 2138 // TODO: Interleave support is future work. 2139 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2140 "outer loops.\n"); 2141 Hints.emitRemarkWithHints(); 2142 return false; 2143 } 2144 2145 return true; 2146 } 2147 2148 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2149 OptimizationRemarkEmitter *ORE, 2150 SmallVectorImpl<Loop *> &V) { 2151 // Collect inner loops and outer loops without irreducible control flow. For 2152 // now, only collect outer loops that have explicit vectorization hints. If we 2153 // are stress testing the VPlan H-CFG construction, we collect the outermost 2154 // loop of every loop nest. 2155 if (L.isInnermost() || VPlanBuildStressTest || 2156 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2157 LoopBlocksRPO RPOT(&L); 2158 RPOT.perform(LI); 2159 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2160 V.push_back(&L); 2161 // TODO: Collect inner loops inside marked outer loops in case 2162 // vectorization fails for the outer loop. Do not invoke 2163 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2164 // already known to be reducible. We can use an inherited attribute for 2165 // that. 2166 return; 2167 } 2168 } 2169 for (Loop *InnerL : L) 2170 collectSupportedLoops(*InnerL, LI, ORE, V); 2171 } 2172 2173 namespace { 2174 2175 /// The LoopVectorize Pass. 2176 struct LoopVectorize : public FunctionPass { 2177 /// Pass identification, replacement for typeid 2178 static char ID; 2179 2180 LoopVectorizePass Impl; 2181 2182 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2183 bool VectorizeOnlyWhenForced = false) 2184 : FunctionPass(ID), 2185 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2186 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2187 } 2188 2189 bool runOnFunction(Function &F) override { 2190 if (skipFunction(F)) 2191 return false; 2192 2193 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2194 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2195 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2196 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2197 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2198 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2199 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2200 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2201 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2202 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2203 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2204 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2205 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2206 2207 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2208 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2209 2210 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2211 GetLAA, *ORE, PSI).MadeAnyChange; 2212 } 2213 2214 void getAnalysisUsage(AnalysisUsage &AU) const override { 2215 AU.addRequired<AssumptionCacheTracker>(); 2216 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2217 AU.addRequired<DominatorTreeWrapperPass>(); 2218 AU.addRequired<LoopInfoWrapperPass>(); 2219 AU.addRequired<ScalarEvolutionWrapperPass>(); 2220 AU.addRequired<TargetTransformInfoWrapperPass>(); 2221 AU.addRequired<AAResultsWrapperPass>(); 2222 AU.addRequired<LoopAccessLegacyAnalysis>(); 2223 AU.addRequired<DemandedBitsWrapperPass>(); 2224 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2225 AU.addRequired<InjectTLIMappingsLegacy>(); 2226 2227 // We currently do not preserve loopinfo/dominator analyses with outer loop 2228 // vectorization. Until this is addressed, mark these analyses as preserved 2229 // only for non-VPlan-native path. 2230 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2231 if (!EnableVPlanNativePath) { 2232 AU.addPreserved<LoopInfoWrapperPass>(); 2233 AU.addPreserved<DominatorTreeWrapperPass>(); 2234 } 2235 2236 AU.addPreserved<BasicAAWrapperPass>(); 2237 AU.addPreserved<GlobalsAAWrapperPass>(); 2238 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2239 } 2240 }; 2241 2242 } // end anonymous namespace 2243 2244 //===----------------------------------------------------------------------===// 2245 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2246 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2247 //===----------------------------------------------------------------------===// 2248 2249 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2250 // We need to place the broadcast of invariant variables outside the loop, 2251 // but only if it's proven safe to do so. Else, broadcast will be inside 2252 // vector loop body. 2253 Instruction *Instr = dyn_cast<Instruction>(V); 2254 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2255 (!Instr || 2256 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2257 // Place the code for broadcasting invariant variables in the new preheader. 2258 IRBuilder<>::InsertPointGuard Guard(Builder); 2259 if (SafeToHoist) 2260 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2261 2262 // Broadcast the scalar into all locations in the vector. 2263 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2264 2265 return Shuf; 2266 } 2267 2268 /// This function adds 2269 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2270 /// to each vector element of Val. The sequence starts at StartIndex. 2271 /// \p Opcode is relevant for FP induction variable. 
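/// For example (illustrative), with a fixed VF of 4, StartIdx = 0 and
/// Step = 2, the vector <0, 2, 4, 6> is added to Val; for scalable VFs the
/// lane indices come from a stepvector rather than a constant sequence.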
2272 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2273 Instruction::BinaryOps BinOp, ElementCount VF, 2274 IRBuilderBase &Builder) { 2275 assert(VF.isVector() && "only vector VFs are supported"); 2276 2277 // Create and check the types. 2278 auto *ValVTy = cast<VectorType>(Val->getType()); 2279 ElementCount VLen = ValVTy->getElementCount(); 2280 2281 Type *STy = Val->getType()->getScalarType(); 2282 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2283 "Induction Step must be an integer or FP"); 2284 assert(Step->getType() == STy && "Step has wrong type"); 2285 2286 SmallVector<Constant *, 8> Indices; 2287 2288 // Create a vector of consecutive numbers from zero to VF. 2289 VectorType *InitVecValVTy = ValVTy; 2290 if (STy->isFloatingPointTy()) { 2291 Type *InitVecValSTy = 2292 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2293 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2294 } 2295 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2296 2297 // Splat the StartIdx 2298 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2299 2300 if (STy->isIntegerTy()) { 2301 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2302 Step = Builder.CreateVectorSplat(VLen, Step); 2303 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2304 // FIXME: The newly created binary instructions should contain nsw/nuw 2305 // flags, which can be found from the original scalar operations. 2306 Step = Builder.CreateMul(InitVec, Step); 2307 return Builder.CreateAdd(Val, Step, "induction"); 2308 } 2309 2310 // Floating point induction. 2311 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2312 "Binary Opcode should be specified for FP induction"); 2313 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2314 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2315 2316 Step = Builder.CreateVectorSplat(VLen, Step); 2317 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2318 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2319 } 2320 2321 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 2322 /// variable on which to base the steps, \p Step is the size of the step. 2323 static void buildScalarSteps(Value *ScalarIV, Value *Step, 2324 const InductionDescriptor &ID, VPValue *Def, 2325 VPTransformState &State) { 2326 IRBuilderBase &Builder = State.Builder; 2327 // We shouldn't have to build scalar steps if we aren't vectorizing. 2328 assert(State.VF.isVector() && "VF should be greater than one"); 2329 // Get the value type and ensure it and the step have the same integer type. 2330 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2331 assert(ScalarIVTy == Step->getType() && 2332 "Val and Step should have the same type"); 2333 2334 // We build scalar steps for both integer and floating-point induction 2335 // variables. Here, we determine the kind of arithmetic we will perform. 2336 Instruction::BinaryOps AddOp; 2337 Instruction::BinaryOps MulOp; 2338 if (ScalarIVTy->isIntegerTy()) { 2339 AddOp = Instruction::Add; 2340 MulOp = Instruction::Mul; 2341 } else { 2342 AddOp = ID.getInductionOpcode(); 2343 MulOp = Instruction::FMul; 2344 } 2345 2346 // Determine the number of scalars we need to generate for each unroll 2347 // iteration. 2348 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); 2349 unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 2350 // Compute the scalar steps and save the results in State. 
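  // Illustrative example: for an integer IV with Step = 1, VF = 4 and UF = 2,
  // the per-part, per-lane scalar steps are
  //   part 0: IV + 0, IV + 1, IV + 2, IV + 3
  //   part 1: IV + 4, IV + 5, IV + 6, IV + 7
  // (only lane 0 of each part is materialized when Def is known to be used by
  // its first lane only).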
2351 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2352 ScalarIVTy->getScalarSizeInBits()); 2353 Type *VecIVTy = nullptr; 2354 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2355 if (!FirstLaneOnly && State.VF.isScalable()) { 2356 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2357 UnitStepVec = 2358 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2359 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2360 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2361 } 2362 2363 for (unsigned Part = 0; Part < State.UF; ++Part) { 2364 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2365 2366 if (!FirstLaneOnly && State.VF.isScalable()) { 2367 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2368 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2369 if (ScalarIVTy->isFloatingPointTy()) 2370 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2371 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2372 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2373 State.set(Def, Add, Part); 2374 // It's useful to record the lane values too for the known minimum number 2375 // of elements so we do those below. This improves the code quality when 2376 // trying to extract the first element, for example. 2377 } 2378 2379 if (ScalarIVTy->isFloatingPointTy()) 2380 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2381 2382 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2383 Value *StartIdx = Builder.CreateBinOp( 2384 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2385 // The step returned by `createStepForVF` is a runtime-evaluated value 2386 // when VF is scalable. Otherwise, it should be folded into a Constant. 2387 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2388 "Expected StartIdx to be folded to a constant when VF is not " 2389 "scalable"); 2390 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2391 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2392 State.set(Def, Add, VPIteration(Part, Lane)); 2393 } 2394 } 2395 } 2396 2397 // Generate code for the induction step. Note that induction steps are 2398 // required to be loop-invariant 2399 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, 2400 Instruction *InsertBefore, 2401 Loop *OrigLoop = nullptr) { 2402 const DataLayout &DL = SE.getDataLayout(); 2403 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && 2404 "Induction step should be loop invariant"); 2405 if (auto *E = dyn_cast<SCEVUnknown>(Step)) 2406 return E->getValue(); 2407 2408 SCEVExpander Exp(SE, DL, "induction"); 2409 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); 2410 } 2411 2412 /// Compute the transformed value of Index at offset StartValue using step 2413 /// StepValue. 2414 /// For integer induction, returns StartValue + Index * StepValue. 2415 /// For pointer induction, returns StartValue[Index * StepValue]. 2416 /// FIXME: The newly created binary instructions should contain nsw/nuw 2417 /// flags, which can be found from the original scalar operations. 2418 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, 2419 Value *StartValue, Value *Step, 2420 const InductionDescriptor &ID) { 2421 assert(Index->getType()->getScalarType() == Step->getType() && 2422 "Index scalar type does not match StepValue type"); 2423 2424 // Note: the IR at this point is broken. 
We cannot use SE to create any new 2425 // SCEV and then expand it, hoping that SCEV's simplification will give us 2426 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2427 // lead to various SCEV crashes. So all we can do is to use builder and rely 2428 // on InstCombine for future simplifications. Here we handle some trivial 2429 // cases only. 2430 auto CreateAdd = [&B](Value *X, Value *Y) { 2431 assert(X->getType() == Y->getType() && "Types don't match!"); 2432 if (auto *CX = dyn_cast<ConstantInt>(X)) 2433 if (CX->isZero()) 2434 return Y; 2435 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2436 if (CY->isZero()) 2437 return X; 2438 return B.CreateAdd(X, Y); 2439 }; 2440 2441 // We allow X to be a vector type, in which case Y will potentially be 2442 // splatted into a vector with the same element count. 2443 auto CreateMul = [&B](Value *X, Value *Y) { 2444 assert(X->getType()->getScalarType() == Y->getType() && 2445 "Types don't match!"); 2446 if (auto *CX = dyn_cast<ConstantInt>(X)) 2447 if (CX->isOne()) 2448 return Y; 2449 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2450 if (CY->isOne()) 2451 return X; 2452 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2453 if (XVTy && !isa<VectorType>(Y->getType())) 2454 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2455 return B.CreateMul(X, Y); 2456 }; 2457 2458 switch (ID.getKind()) { 2459 case InductionDescriptor::IK_IntInduction: { 2460 assert(!isa<VectorType>(Index->getType()) && 2461 "Vector indices not supported for integer inductions yet"); 2462 assert(Index->getType() == StartValue->getType() && 2463 "Index type does not match StartValue type"); 2464 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2465 return B.CreateSub(StartValue, Index); 2466 auto *Offset = CreateMul(Index, Step); 2467 return CreateAdd(StartValue, Offset); 2468 } 2469 case InductionDescriptor::IK_PtrInduction: { 2470 assert(isa<Constant>(Step) && 2471 "Expected constant step for pointer induction"); 2472 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); 2473 } 2474 case InductionDescriptor::IK_FpInduction: { 2475 assert(!isa<VectorType>(Index->getType()) && 2476 "Vector indices not supported for FP inductions yet"); 2477 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2478 auto InductionBinOp = ID.getInductionBinOp(); 2479 assert(InductionBinOp && 2480 (InductionBinOp->getOpcode() == Instruction::FAdd || 2481 InductionBinOp->getOpcode() == Instruction::FSub) && 2482 "Original bin op should be defined for FP induction"); 2483 2484 Value *MulExp = B.CreateFMul(Step, Index); 2485 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2486 "induction"); 2487 } 2488 case InductionDescriptor::IK_NoInduction: 2489 return nullptr; 2490 } 2491 llvm_unreachable("invalid enum"); 2492 } 2493 2494 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2495 const VPIteration &Instance, 2496 VPTransformState &State) { 2497 Value *ScalarInst = State.get(Def, Instance); 2498 Value *VectorValue = State.get(Def, Instance.Part); 2499 VectorValue = Builder.CreateInsertElement( 2500 VectorValue, ScalarInst, 2501 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2502 State.set(Def, VectorValue, Instance.Part); 2503 } 2504 2505 // Return whether we allow using masked interleave-groups (for dealing with 2506 // strided loads/stores that reside in predicated blocks, or for dealing 2507 // with gaps). 
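// For example (illustrative), a factor-2 store group that only writes A[2*i]
// leaves a gap at A[2*i+1]; emitting the group as one wide store is only
// possible if the gap lanes can be masked off so the untouched elements are
// not clobbered.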
2508 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2509 // If an override option has been passed in for interleaved accesses, use it. 2510 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2511 return EnableMaskedInterleavedMemAccesses; 2512 2513 return TTI.enableMaskedInterleavedAccessVectorization(); 2514 } 2515 2516 // Try to vectorize the interleave group that \p Instr belongs to. 2517 // 2518 // E.g. Translate following interleaved load group (factor = 3): 2519 // for (i = 0; i < N; i+=3) { 2520 // R = Pic[i]; // Member of index 0 2521 // G = Pic[i+1]; // Member of index 1 2522 // B = Pic[i+2]; // Member of index 2 2523 // ... // do something to R, G, B 2524 // } 2525 // To: 2526 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2527 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2528 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2529 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2530 // 2531 // Or translate following interleaved store group (factor = 3): 2532 // for (i = 0; i < N; i+=3) { 2533 // ... do something to R, G, B 2534 // Pic[i] = R; // Member of index 0 2535 // Pic[i+1] = G; // Member of index 1 2536 // Pic[i+2] = B; // Member of index 2 2537 // } 2538 // To: 2539 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2540 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2541 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2542 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2543 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2544 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2545 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2546 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2547 VPValue *BlockInMask) { 2548 Instruction *Instr = Group->getInsertPos(); 2549 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2550 2551 // Prepare for the vector type of the interleaved load/store. 2552 Type *ScalarTy = getLoadStoreType(Instr); 2553 unsigned InterleaveFactor = Group->getFactor(); 2554 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2555 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2556 2557 // Prepare for the new pointers. 2558 SmallVector<Value *, 2> AddrParts; 2559 unsigned Index = Group->getIndex(Instr); 2560 2561 // TODO: extend the masked interleaved-group support to reversed access. 2562 assert((!BlockInMask || !Group->isReverse()) && 2563 "Reversed masked interleave-group not supported."); 2564 2565 // If the group is reverse, adjust the index to refer to the last vector lane 2566 // instead of the first. We adjust the index from the first vector lane, 2567 // rather than directly getting the pointer for lane VF - 1, because the 2568 // pointer operand of the interleaved access is supposed to be uniform. For 2569 // uniform instructions, we're only required to generate a value for the 2570 // first vector lane in each unroll iteration. 2571 if (Group->isReverse()) 2572 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2573 2574 for (unsigned Part = 0; Part < UF; Part++) { 2575 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2576 State.setDebugLocFromInst(AddrPart); 2577 2578 // Notice current instruction could be any index. Need to adjust the address 2579 // to the member of index 0. 2580 // 2581 // E.g. 
a = A[i+1]; // Member of index 1 (Current instruction) 2582 // b = A[i]; // Member of index 0 2583 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2584 // 2585 // E.g. A[i+1] = a; // Member of index 1 2586 // A[i] = b; // Member of index 0 2587 // A[i+2] = c; // Member of index 2 (Current instruction) 2588 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2589 2590 bool InBounds = false; 2591 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2592 InBounds = gep->isInBounds(); 2593 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2594 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2595 2596 // Cast to the vector pointer type. 2597 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2598 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2599 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2600 } 2601 2602 State.setDebugLocFromInst(Instr); 2603 Value *PoisonVec = PoisonValue::get(VecTy); 2604 2605 Value *MaskForGaps = nullptr; 2606 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2607 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2608 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2609 } 2610 2611 // Vectorize the interleaved load group. 2612 if (isa<LoadInst>(Instr)) { 2613 // For each unroll part, create a wide load for the group. 2614 SmallVector<Value *, 2> NewLoads; 2615 for (unsigned Part = 0; Part < UF; Part++) { 2616 Instruction *NewLoad; 2617 if (BlockInMask || MaskForGaps) { 2618 assert(useMaskedInterleavedAccesses(*TTI) && 2619 "masked interleaved groups are not allowed."); 2620 Value *GroupMask = MaskForGaps; 2621 if (BlockInMask) { 2622 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2623 Value *ShuffledMask = Builder.CreateShuffleVector( 2624 BlockInMaskPart, 2625 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2626 "interleaved.mask"); 2627 GroupMask = MaskForGaps 2628 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2629 MaskForGaps) 2630 : ShuffledMask; 2631 } 2632 NewLoad = 2633 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2634 GroupMask, PoisonVec, "wide.masked.vec"); 2635 } 2636 else 2637 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2638 Group->getAlign(), "wide.vec"); 2639 Group->addMetadata(NewLoad); 2640 NewLoads.push_back(NewLoad); 2641 } 2642 2643 // For each member in the group, shuffle out the appropriate data from the 2644 // wide loads. 2645 unsigned J = 0; 2646 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2647 Instruction *Member = Group->getMember(I); 2648 2649 // Skip the gaps in the group. 2650 if (!Member) 2651 continue; 2652 2653 auto StrideMask = 2654 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2655 for (unsigned Part = 0; Part < UF; Part++) { 2656 Value *StridedVec = Builder.CreateShuffleVector( 2657 NewLoads[Part], StrideMask, "strided.vec"); 2658 2659 // If this member has different type, cast the result type. 
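        // Illustrative example: if the insert-position member is an i32 load
        // and this member loads float, the lanes are extracted as <VF x i32>
        // and then bitcast to <VF x float> below.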
2660 if (Member->getType() != ScalarTy) { 2661 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2662 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2663 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2664 } 2665 2666 if (Group->isReverse()) 2667 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2668 2669 State.set(VPDefs[J], StridedVec, Part); 2670 } 2671 ++J; 2672 } 2673 return; 2674 } 2675 2676 // The sub vector type for current instruction. 2677 auto *SubVT = VectorType::get(ScalarTy, VF); 2678 2679 // Vectorize the interleaved store group. 2680 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2681 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2682 "masked interleaved groups are not allowed."); 2683 assert((!MaskForGaps || !VF.isScalable()) && 2684 "masking gaps for scalable vectors is not yet supported."); 2685 for (unsigned Part = 0; Part < UF; Part++) { 2686 // Collect the stored vector from each member. 2687 SmallVector<Value *, 4> StoredVecs; 2688 for (unsigned i = 0; i < InterleaveFactor; i++) { 2689 assert((Group->getMember(i) || MaskForGaps) && 2690 "Fail to get a member from an interleaved store group"); 2691 Instruction *Member = Group->getMember(i); 2692 2693 // Skip the gaps in the group. 2694 if (!Member) { 2695 Value *Undef = PoisonValue::get(SubVT); 2696 StoredVecs.push_back(Undef); 2697 continue; 2698 } 2699 2700 Value *StoredVec = State.get(StoredValues[i], Part); 2701 2702 if (Group->isReverse()) 2703 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2704 2705 // If this member has different type, cast it to a unified type. 2706 2707 if (StoredVec->getType() != SubVT) 2708 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2709 2710 StoredVecs.push_back(StoredVec); 2711 } 2712 2713 // Concatenate all vectors into a wide vector. 2714 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2715 2716 // Interleave the elements in the wide vector. 2717 Value *IVec = Builder.CreateShuffleVector( 2718 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2719 "interleaved.vec"); 2720 2721 Instruction *NewStoreInstr; 2722 if (BlockInMask || MaskForGaps) { 2723 Value *GroupMask = MaskForGaps; 2724 if (BlockInMask) { 2725 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2726 Value *ShuffledMask = Builder.CreateShuffleVector( 2727 BlockInMaskPart, 2728 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2729 "interleaved.mask"); 2730 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2731 ShuffledMask, MaskForGaps) 2732 : ShuffledMask; 2733 } 2734 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2735 Group->getAlign(), GroupMask); 2736 } else 2737 NewStoreInstr = 2738 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2739 2740 Group->addMetadata(NewStoreInstr); 2741 } 2742 } 2743 2744 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2745 VPReplicateRecipe *RepRecipe, 2746 const VPIteration &Instance, 2747 bool IfPredicateInstr, 2748 VPTransformState &State) { 2749 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2750 2751 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2752 // the first lane and part. 2753 if (isa<NoAliasScopeDeclInst>(Instr)) 2754 if (!Instance.isFirstIteration()) 2755 return; 2756 2757 // Does this instruction return a value ? 
2758 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2759 2760 Instruction *Cloned = Instr->clone(); 2761 if (!IsVoidRetTy) 2762 Cloned->setName(Instr->getName() + ".cloned"); 2763 2764 // If the scalarized instruction contributes to the address computation of a 2765 // widen masked load/store which was in a basic block that needed predication 2766 // and is not predicated after vectorization, we can't propagate 2767 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2768 // instruction could feed a poison value to the base address of the widen 2769 // load/store. 2770 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 2771 Cloned->dropPoisonGeneratingFlags(); 2772 2773 if (Instr->getDebugLoc()) 2774 State.setDebugLocFromInst(Instr); 2775 2776 // Replace the operands of the cloned instructions with their scalar 2777 // equivalents in the new loop. 2778 for (auto &I : enumerate(RepRecipe->operands())) { 2779 auto InputInstance = Instance; 2780 VPValue *Operand = I.value(); 2781 VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand); 2782 if (OperandR && OperandR->isUniform()) 2783 InputInstance.Lane = VPLane::getFirstLane(); 2784 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2785 } 2786 State.addNewMetadata(Cloned, Instr); 2787 2788 // Place the cloned scalar in the new loop. 2789 State.Builder.Insert(Cloned); 2790 2791 State.set(RepRecipe, Cloned, Instance); 2792 2793 // If we just cloned a new assumption, add it the assumption cache. 2794 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2795 AC->registerAssumption(II); 2796 2797 // End if-block. 2798 if (IfPredicateInstr) 2799 PredicatedInstructions.push_back(Cloned); 2800 } 2801 2802 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) { 2803 if (TripCount) 2804 return TripCount; 2805 2806 assert(InsertBlock); 2807 IRBuilder<> Builder(InsertBlock->getTerminator()); 2808 // Find the loop boundaries. 2809 ScalarEvolution *SE = PSE.getSE(); 2810 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2811 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 2812 "Invalid loop count"); 2813 2814 Type *IdxTy = Legal->getWidestInductionType(); 2815 assert(IdxTy && "No type for induction"); 2816 2817 // The exit count might have the type of i64 while the phi is i32. This can 2818 // happen if we have an induction variable that is sign extended before the 2819 // compare. The only way that we get a backedge taken count is that the 2820 // induction variable was signed and as such will not overflow. In such a case 2821 // truncation is legal. 2822 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2823 IdxTy->getPrimitiveSizeInBits()) 2824 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2825 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2826 2827 // Get the total trip count from the count by adding 1. 2828 const SCEV *ExitCount = SE->getAddExpr( 2829 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2830 2831 const DataLayout &DL = InsertBlock->getModule()->getDataLayout(); 2832 2833 // Expand the trip count and place the new instructions in the preheader. 2834 // Notice that the pre-header does not change, only the loop body. 2835 SCEVExpander Exp(*SE, DL, "induction"); 2836 2837 // Count holds the overall loop count (N). 
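  // Illustrative example (hypothetical loop): for
  //   for (i = 0; i < n; ++i) ...
  // the backedge-taken count is n - 1 and the trip count expanded below is n.
  // If the induction variable was sign-extended before the compare, the count
  // may have been computed in a wider type and truncated to IdxTy as handled
  // above.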
2838 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2839 InsertBlock->getTerminator()); 2840 2841 if (TripCount->getType()->isPointerTy()) 2842 TripCount = 2843 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2844 InsertBlock->getTerminator()); 2845 2846 return TripCount; 2847 } 2848 2849 Value * 2850 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2851 if (VectorTripCount) 2852 return VectorTripCount; 2853 2854 Value *TC = getOrCreateTripCount(InsertBlock); 2855 IRBuilder<> Builder(InsertBlock->getTerminator()); 2856 2857 Type *Ty = TC->getType(); 2858 // This is where we can make the step a runtime constant. 2859 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2860 2861 // If the tail is to be folded by masking, round the number of iterations N 2862 // up to a multiple of Step instead of rounding down. This is done by first 2863 // adding Step-1 and then rounding down. Note that it's ok if this addition 2864 // overflows: the vector induction variable will eventually wrap to zero given 2865 // that it starts at zero and its Step is a power of two; the loop will then 2866 // exit, with the last early-exit vector comparison also producing all-true. 2867 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2868 // is accounted for in emitIterationCountCheck that adds an overflow check. 2869 if (Cost->foldTailByMasking()) { 2870 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2871 "VF*UF must be a power of 2 when folding tail by masking"); 2872 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 2873 TC = Builder.CreateAdd( 2874 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 2875 } 2876 2877 // Now we need to generate the expression for the part of the loop that the 2878 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2879 // iterations are not required for correctness, or N - Step, otherwise. Step 2880 // is equal to the vectorization factor (number of SIMD elements) times the 2881 // unroll factor (number of SIMD instructions). 2882 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2883 2884 // There are cases where we *must* run at least one iteration in the remainder 2885 // loop. See the cost model for when this can happen. If the step evenly 2886 // divides the trip count, we set the remainder to be equal to the step. If 2887 // the step does not evenly divide the trip count, no adjustment is necessary 2888 // since there will already be scalar iterations. Note that the minimum 2889 // iterations check ensures that N >= Step. 2890 if (Cost->requiresScalarEpilogue(VF)) { 2891 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2892 R = Builder.CreateSelect(IsZero, Step, R); 2893 } 2894 2895 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2896 2897 return VectorTripCount; 2898 } 2899 2900 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2901 const DataLayout &DL) { 2902 // Verify that V is a vector type with same number of elements as DstVTy. 
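  // Illustrative example (assuming 64-bit pointers): a direct bitcast from
  // <2 x double> to <2 x i8*> is not legal, so in that case the code below
  // goes through an integer vector of matching element width:
  //   <2 x double> -> <2 x i64> -> <2 x i8*>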
2903 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 2904 unsigned VF = DstFVTy->getNumElements(); 2905 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 2906 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2907 Type *SrcElemTy = SrcVecTy->getElementType(); 2908 Type *DstElemTy = DstFVTy->getElementType(); 2909 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2910 "Vector elements must have same size"); 2911 2912 // Do a direct cast if element types are castable. 2913 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2914 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2915 } 2916 // V cannot be directly casted to desired vector type. 2917 // May happen when V is a floating point vector but DstVTy is a vector of 2918 // pointers or vice-versa. Handle this using a two-step bitcast using an 2919 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2920 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2921 "Only one type should be a pointer type"); 2922 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2923 "Only one type should be a floating point type"); 2924 Type *IntTy = 2925 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2926 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2927 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2928 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2929 } 2930 2931 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2932 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 2933 // Reuse existing vector loop preheader for TC checks. 2934 // Note that new preheader block is generated for vector loop. 2935 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2936 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2937 2938 // Generate code to check if the loop's trip count is less than VF * UF, or 2939 // equal to it in case a scalar epilogue is required; this implies that the 2940 // vector trip count is zero. This check also covers the case where adding one 2941 // to the backedge-taken count overflowed leading to an incorrect trip count 2942 // of zero. In this case we will also jump to the scalar loop. 2943 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 2944 : ICmpInst::ICMP_ULT; 2945 2946 // If tail is to be folded, vector loop takes care of all iterations. 2947 Type *CountTy = Count->getType(); 2948 Value *CheckMinIters = Builder.getFalse(); 2949 auto CreateStep = [&]() { 2950 // Create step with max(MinProTripCount, UF * VF). 2951 if (UF * VF.getKnownMinValue() < MinProfitableTripCount.getKnownMinValue()) 2952 return createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); 2953 return createStepForVF(Builder, CountTy, VF, UF); 2954 }; 2955 2956 if (!Cost->foldTailByMasking()) 2957 CheckMinIters = 2958 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); 2959 else if (VF.isScalable()) { 2960 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2961 // an overflow to zero when updating induction variables and so an 2962 // additional overflow check is required before entering the vector loop. 2963 2964 // Get the maximum unsigned value for the type. 2965 Value *MaxUIntTripCount = 2966 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 2967 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 2968 2969 // Don't execute the vector loop if (UMax - n) < (VF * UF). 
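    // Illustrative example: if the runtime step VF * UF evaluates to 4 and an
    // i8 trip count n is 254, then UMax - n == 1 < 4, so we branch to the
    // scalar loop; rounding n up to a multiple of the step in
    // getOrCreateVectorTripCount would otherwise wrap past UMax.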
2970 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); 2971 } 2972 2973 // Create new preheader for vector loop. 2974 LoopVectorPreHeader = 2975 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2976 "vector.ph"); 2977 2978 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2979 DT->getNode(Bypass)->getIDom()) && 2980 "TC check is expected to dominate Bypass"); 2981 2982 // Update dominator for Bypass & LoopExit (if needed). 2983 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2984 if (!Cost->requiresScalarEpilogue(VF)) 2985 // If there is an epilogue which must run, there's no edge from the 2986 // middle block to exit blocks and thus no need to update the immediate 2987 // dominator of the exit blocks. 2988 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2989 2990 ReplaceInstWithInst( 2991 TCCheckBlock->getTerminator(), 2992 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2993 LoopBypassBlocks.push_back(TCCheckBlock); 2994 } 2995 2996 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 2997 BasicBlock *const SCEVCheckBlock = 2998 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); 2999 if (!SCEVCheckBlock) 3000 return nullptr; 3001 3002 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3003 (OptForSizeBasedOnProfile && 3004 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3005 "Cannot SCEV check stride or overflow when optimizing for size"); 3006 3007 3008 // Update dominator only if this is first RT check. 3009 if (LoopBypassBlocks.empty()) { 3010 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3011 if (!Cost->requiresScalarEpilogue(VF)) 3012 // If there is an epilogue which must run, there's no edge from the 3013 // middle block to exit blocks and thus no need to update the immediate 3014 // dominator of the exit blocks. 3015 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3016 } 3017 3018 LoopBypassBlocks.push_back(SCEVCheckBlock); 3019 AddedSafetyChecks = true; 3020 return SCEVCheckBlock; 3021 } 3022 3023 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { 3024 // VPlan-native path does not do any analysis for runtime checks currently. 3025 if (EnableVPlanNativePath) 3026 return nullptr; 3027 3028 BasicBlock *const MemCheckBlock = 3029 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); 3030 3031 // Check if we generated code that checks in runtime if arrays overlap. We put 3032 // the checks into a separate block to make the more common case of few 3033 // elements faster. 
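  // For example (illustrative): given
  //   for (i = 0; i < n; ++i) a[i] = b[i] + 1;
  // with pointers that may alias, the emitted block compares the accessed
  // address ranges of 'a' and 'b' and bypasses the vector loop if they
  // overlap.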
  if (!MemCheckBlock)
    return nullptr;

  if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
    assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
           "Cannot emit memory checks when optimizing for size, unless forced "
           "to vectorize.");
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
                                        OrigLoop->getStartLoc(),
                                        OrigLoop->getHeader())
             << "Code-size may be reduced by not forcing "
                "vectorization, or by source-code modifications "
                "eliminating the need for runtime checks "
                "(e.g., adding 'restrict').";
    });
  }

  LoopBypassBlocks.push_back(MemCheckBlock);

  AddedSafetyChecks = true;

  return MemCheckBlock;
}

void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopScalarBody = OrigLoop->getHeader();
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  assert(LoopVectorPreHeader && "Invalid loop structure");
  LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
         "multiple exit loop without required epilogue?");

  LoopMiddleBlock =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, Twine(Prefix) + "middle.block");
  LoopScalarPreHeader =
      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                 nullptr, Twine(Prefix) + "scalar.ph");

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Set up the middle block terminator. Two cases:
  // 1) If we know that we must execute the scalar epilogue, emit an
  //    unconditional branch.
  // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case). In this case, set up a conditional
  //    branch from the middle block to the loop scalar preheader, and the
  //    exit block. completeLoopSkeleton will update the condition to use an
  //    iteration check, if required to decide whether to execute the remainder.
  BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
    BranchInst::Create(LoopScalarPreHeader) :
    BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
                       Builder.getTrue());
  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);

  // Update dominator for loop exit. During skeleton creation, only the vector
  // pre-header and the middle block are created. The vector loop is entirely
  // created during VPlan execution.
  if (!Cost->requiresScalarEpilogue(VF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
}

void InnerLoopVectorizer::createInductionResumeValues(
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
         "Inconsistent information about additional bypass.");

  Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
  assert(VectorTripCount && "Expected valid arguments");
  // We are going to resume the execution of the scalar loop.
3110 // Go over all of the induction variables that we found and fix the 3111 // PHIs that are left in the scalar version of the loop. 3112 // The starting values of PHI nodes depend on the counter of the last 3113 // iteration in the vectorized loop. 3114 // If we come from a bypass edge then we need to start from the original 3115 // start value. 3116 Instruction *OldInduction = Legal->getPrimaryInduction(); 3117 for (auto &InductionEntry : Legal->getInductionVars()) { 3118 PHINode *OrigPhi = InductionEntry.first; 3119 InductionDescriptor II = InductionEntry.second; 3120 3121 Value *&EndValue = IVEndValues[OrigPhi]; 3122 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3123 if (OrigPhi == OldInduction) { 3124 // We know what the end value is. 3125 EndValue = VectorTripCount; 3126 } else { 3127 IRBuilder<> B(LoopVectorPreHeader->getTerminator()); 3128 3129 // Fast-math-flags propagate from the original induction instruction. 3130 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3131 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3132 3133 Type *StepType = II.getStep()->getType(); 3134 Instruction::CastOps CastOp = 3135 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3136 Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc"); 3137 Value *Step = 3138 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3139 EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); 3140 EndValue->setName("ind.end"); 3141 3142 // Compute the end value for the additional bypass (if applicable). 3143 if (AdditionalBypass.first) { 3144 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3145 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3146 StepType, true); 3147 Value *Step = 3148 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3149 VTC = 3150 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc"); 3151 EndValueFromAdditionalBypass = 3152 emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); 3153 EndValueFromAdditionalBypass->setName("ind.end"); 3154 } 3155 } 3156 3157 // Create phi nodes to merge from the backedge-taken check block. 3158 PHINode *BCResumeVal = 3159 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3160 LoopScalarPreHeader->getTerminator()); 3161 // Copy original phi DL over to the new one. 3162 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3163 3164 // The new PHI merges the original incoming value, in case of a bypass, 3165 // or the value at the end of the vectorized loop. 3166 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3167 3168 // Fix the scalar body counter (PHI node). 3169 // The old induction's phi node in the scalar body needs the truncated 3170 // value. 3171 for (BasicBlock *BB : LoopBypassBlocks) 3172 BCResumeVal->addIncoming(II.getStartValue(), BB); 3173 3174 if (AdditionalBypass.first) 3175 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3176 EndValueFromAdditionalBypass); 3177 3178 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3179 } 3180 } 3181 3182 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) { 3183 // The trip counts should be cached by now. 
3184 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 3185 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3186 3187 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3188 3189 // Add a check in the middle block to see if we have completed 3190 // all of the iterations in the first vector loop. Three cases: 3191 // 1) If we require a scalar epilogue, there is no conditional branch as 3192 // we unconditionally branch to the scalar preheader. Do nothing. 3193 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3194 // Thus if tail is to be folded, we know we don't need to run the 3195 // remainder and we can use the previous value for the condition (true). 3196 // 3) Otherwise, construct a runtime check. 3197 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3198 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3199 Count, VectorTripCount, "cmp.n", 3200 LoopMiddleBlock->getTerminator()); 3201 3202 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3203 // of the corresponding compare because they may have ended up with 3204 // different line numbers and we want to avoid awkward line stepping while 3205 // debugging. Eg. if the compare has got a line number inside the loop. 3206 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3207 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3208 } 3209 3210 #ifdef EXPENSIVE_CHECKS 3211 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3212 #endif 3213 3214 return LoopVectorPreHeader; 3215 } 3216 3217 std::pair<BasicBlock *, Value *> 3218 InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3219 /* 3220 In this function we generate a new loop. The new loop will contain 3221 the vectorized instructions while the old loop will continue to run the 3222 scalar remainder. 3223 3224 [ ] <-- loop iteration number check. 3225 / | 3226 / v 3227 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3228 | / | 3229 | / v 3230 || [ ] <-- vector pre header. 3231 |/ | 3232 | v 3233 | [ ] \ 3234 | [ ]_| <-- vector loop (created during VPlan execution). 3235 | | 3236 | v 3237 \ -[ ] <--- middle-block. 3238 \/ | 3239 /\ v 3240 | ->[ ] <--- new preheader. 3241 | | 3242 (opt) v <-- edge from middle to exit iff epilogue is not required. 3243 | [ ] \ 3244 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3245 \ | 3246 \ v 3247 >[ ] <-- exit block(s). 3248 ... 3249 */ 3250 3251 // Get the metadata of the original loop before it gets modified. 3252 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3253 3254 // Workaround! Compute the trip count of the original loop and cache it 3255 // before we start modifying the CFG. This code has a systemic problem 3256 // wherein it tries to run analysis over partially constructed IR; this is 3257 // wrong, and not simply for SCEV. The trip count of the original loop 3258 // simply happens to be prone to hitting this in practice. In theory, we 3259 // can hit the same issue for any SCEV, or ValueTracking query done during 3260 // mutation. See PR49900. 3261 getOrCreateTripCount(OrigLoop->getLoopPreheader()); 3262 3263 // Create an empty vector loop, and prepare basic blocks for the runtime 3264 // checks. 3265 createVectorLoopSkeleton(""); 3266 3267 // Now, compare the new count to zero. If it is zero skip the vector loop and 3268 // jump to the scalar loop. 
// This check also covers the case where the backedge-taken count is
  // uint##_max: adding one to it will overflow, leading to an incorrect trip
  // count of zero. In this (rare) case we will also jump to the scalar loop.
  emitIterationCountCheck(LoopScalarPreHeader);

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(LoopScalarPreHeader);

  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
  emitMemRuntimeChecks(LoopScalarPreHeader);

  // Emit phis for the new starting index of the scalar loop.
  createInductionResumeValues();

  return {completeLoopSkeleton(OrigLoopID), nullptr};
}

// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *VectorTripCount, Value *EndValue,
                                       BasicBlock *MiddleBlock,
                                       BasicBlock *VectorHeader, VPlan &Plan) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      Value *CountMinusOne = B.CreateSub(
          VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ?
B.CreateCast(Instruction::SIToFP, CountMinusOne, 3337 II.getStep()->getType()) 3338 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3339 CMO->setName("cast.cmo"); 3340 3341 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 3342 VectorHeader->getTerminator()); 3343 Value *Escape = 3344 emitTransformedIndex(B, CMO, II.getStartValue(), Step, II); 3345 Escape->setName("ind.escape"); 3346 MissingVals[UI] = Escape; 3347 } 3348 } 3349 3350 for (auto &I : MissingVals) { 3351 PHINode *PHI = cast<PHINode>(I.first); 3352 // One corner case we have to handle is two IVs "chasing" each-other, 3353 // that is %IV2 = phi [...], [ %IV1, %latch ] 3354 // In this case, if IV1 has an external use, we need to avoid adding both 3355 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3356 // don't already have an incoming value for the middle block. 3357 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) { 3358 PHI->addIncoming(I.second, MiddleBlock); 3359 Plan.removeLiveOut(PHI); 3360 } 3361 } 3362 } 3363 3364 namespace { 3365 3366 struct CSEDenseMapInfo { 3367 static bool canHandle(const Instruction *I) { 3368 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3369 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3370 } 3371 3372 static inline Instruction *getEmptyKey() { 3373 return DenseMapInfo<Instruction *>::getEmptyKey(); 3374 } 3375 3376 static inline Instruction *getTombstoneKey() { 3377 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3378 } 3379 3380 static unsigned getHashValue(const Instruction *I) { 3381 assert(canHandle(I) && "Unknown instruction!"); 3382 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3383 I->value_op_end())); 3384 } 3385 3386 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3387 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3388 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3389 return LHS == RHS; 3390 return LHS->isIdenticalTo(RHS); 3391 } 3392 }; 3393 3394 } // end anonymous namespace 3395 3396 ///Perform cse of induction variable instructions. 3397 static void cse(BasicBlock *BB) { 3398 // Perform simple cse. 3399 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3400 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3401 if (!CSEDenseMapInfo::canHandle(&In)) 3402 continue; 3403 3404 // Check if we can replace this instruction with any of the 3405 // visited instructions. 3406 if (Instruction *V = CSEMap.lookup(&In)) { 3407 In.replaceAllUsesWith(V); 3408 In.eraseFromParent(); 3409 continue; 3410 } 3411 3412 CSEMap[&In] = &In; 3413 } 3414 } 3415 3416 InstructionCost 3417 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3418 bool &NeedToScalarize) const { 3419 Function *F = CI->getCalledFunction(); 3420 Type *ScalarRetTy = CI->getType(); 3421 SmallVector<Type *, 4> Tys, ScalarTys; 3422 for (auto &ArgOp : CI->args()) 3423 ScalarTys.push_back(ArgOp->getType()); 3424 3425 // Estimate cost of scalarized vector call. The source operands are assumed 3426 // to be vectors, so we need to extract individual elements from there, 3427 // execute VF scalar calls, and then gather the result into the vector return 3428 // value. 3429 InstructionCost ScalarCallCost = 3430 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3431 if (VF.isScalar()) 3432 return ScalarCallCost; 3433 3434 // Compute corresponding vector type for return value and arguments. 
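  // Illustrative cost sketch (made-up numbers): with VF = 4, a scalar call
  // cost of 10 and a scalarization overhead of 6, the scalarized estimate
  // computed below is 4 * 10 + 6 = 46; if VFDatabase provides a vector
  // variant costing 20, NeedToScalarize is cleared and 20 is returned instead.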
3435 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3436 for (Type *ScalarTy : ScalarTys) 3437 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3438 3439 // Compute costs of unpacking argument values for the scalar calls and 3440 // packing the return values to a vector. 3441 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3442 3443 InstructionCost Cost = 3444 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3445 3446 // If we can't emit a vector call for this function, then the currently found 3447 // cost is the cost we need to return. 3448 NeedToScalarize = true; 3449 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3450 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3451 3452 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3453 return Cost; 3454 3455 // If the corresponding vector cost is cheaper, return its cost. 3456 InstructionCost VectorCallCost = 3457 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3458 if (VectorCallCost < Cost) { 3459 NeedToScalarize = false; 3460 Cost = VectorCallCost; 3461 } 3462 return Cost; 3463 } 3464 3465 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3466 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3467 return Elt; 3468 return VectorType::get(Elt, VF); 3469 } 3470 3471 InstructionCost 3472 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3473 ElementCount VF) const { 3474 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3475 assert(ID && "Expected intrinsic call!"); 3476 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3477 FastMathFlags FMF; 3478 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3479 FMF = FPMO->getFastMathFlags(); 3480 3481 SmallVector<const Value *> Arguments(CI->args()); 3482 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3483 SmallVector<Type *> ParamTys; 3484 std::transform(FTy->param_begin(), FTy->param_end(), 3485 std::back_inserter(ParamTys), 3486 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3487 3488 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3489 dyn_cast<IntrinsicInst>(CI)); 3490 return TTI.getIntrinsicInstrCost(CostAttrs, 3491 TargetTransformInfo::TCK_RecipThroughput); 3492 } 3493 3494 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3495 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3496 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3497 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3498 } 3499 3500 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3501 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3502 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3503 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3504 } 3505 3506 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3507 // For every instruction `I` in MinBWs, truncate the operands, create a 3508 // truncated version of `I` and reextend its result. InstCombine runs 3509 // later and will remove any ext/trunc pairs. 3510 SmallPtrSet<Value *, 4> Erased; 3511 for (const auto &KV : Cost->getMinimalBitwidths()) { 3512 // If the value wasn't vectorized, we must maintain the original scalar 3513 // type. The absence of the value from State indicates that it 3514 // wasn't vectorized. 3515 // FIXME: Should not rely on getVPValue at this point. 
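    // Illustrative example: if the cost model proved only 8 bits of an i32 add
    // are demanded, a widened
    //   %a = add <4 x i32> %x, %y
    // is rewritten below as a truncated add on <4 x i8> followed by a zext
    // back to <4 x i32>; InstCombine later removes redundant ext/trunc pairs.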
3516 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3517 if (!State.hasAnyVectorValue(Def)) 3518 continue; 3519 for (unsigned Part = 0; Part < UF; ++Part) { 3520 Value *I = State.get(Def, Part); 3521 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3522 continue; 3523 Type *OriginalTy = I->getType(); 3524 Type *ScalarTruncatedTy = 3525 IntegerType::get(OriginalTy->getContext(), KV.second); 3526 auto *TruncatedTy = VectorType::get( 3527 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3528 if (TruncatedTy == OriginalTy) 3529 continue; 3530 3531 IRBuilder<> B(cast<Instruction>(I)); 3532 auto ShrinkOperand = [&](Value *V) -> Value * { 3533 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3534 if (ZI->getSrcTy() == TruncatedTy) 3535 return ZI->getOperand(0); 3536 return B.CreateZExtOrTrunc(V, TruncatedTy); 3537 }; 3538 3539 // The actual instruction modification depends on the instruction type, 3540 // unfortunately. 3541 Value *NewI = nullptr; 3542 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3543 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3544 ShrinkOperand(BO->getOperand(1))); 3545 3546 // Any wrapping introduced by shrinking this operation shouldn't be 3547 // considered undefined behavior. So, we can't unconditionally copy 3548 // arithmetic wrapping flags to NewI. 3549 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3550 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3551 NewI = 3552 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3553 ShrinkOperand(CI->getOperand(1))); 3554 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3555 NewI = B.CreateSelect(SI->getCondition(), 3556 ShrinkOperand(SI->getTrueValue()), 3557 ShrinkOperand(SI->getFalseValue())); 3558 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3559 switch (CI->getOpcode()) { 3560 default: 3561 llvm_unreachable("Unhandled cast!"); 3562 case Instruction::Trunc: 3563 NewI = ShrinkOperand(CI->getOperand(0)); 3564 break; 3565 case Instruction::SExt: 3566 NewI = B.CreateSExtOrTrunc( 3567 CI->getOperand(0), 3568 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3569 break; 3570 case Instruction::ZExt: 3571 NewI = B.CreateZExtOrTrunc( 3572 CI->getOperand(0), 3573 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3574 break; 3575 } 3576 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3577 auto Elements0 = 3578 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3579 auto *O0 = B.CreateZExtOrTrunc( 3580 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3581 auto Elements1 = 3582 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3583 auto *O1 = B.CreateZExtOrTrunc( 3584 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3585 3586 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3587 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3588 // Don't do anything with the operands, just extend the result. 
3589 continue; 3590 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3591 auto Elements = 3592 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3593 auto *O0 = B.CreateZExtOrTrunc( 3594 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3595 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3596 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3597 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3598 auto Elements = 3599 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3600 auto *O0 = B.CreateZExtOrTrunc( 3601 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3602 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3603 } else { 3604 // If we don't know what to do, be conservative and don't do anything. 3605 continue; 3606 } 3607 3608 // Lastly, extend the result. 3609 NewI->takeName(cast<Instruction>(I)); 3610 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3611 I->replaceAllUsesWith(Res); 3612 cast<Instruction>(I)->eraseFromParent(); 3613 Erased.insert(I); 3614 State.reset(Def, Res, Part); 3615 } 3616 } 3617 3618 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3619 for (const auto &KV : Cost->getMinimalBitwidths()) { 3620 // If the value wasn't vectorized, we must maintain the original scalar 3621 // type. The absence of the value from State indicates that it 3622 // wasn't vectorized. 3623 // FIXME: Should not rely on getVPValue at this point. 3624 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3625 if (!State.hasAnyVectorValue(Def)) 3626 continue; 3627 for (unsigned Part = 0; Part < UF; ++Part) { 3628 Value *I = State.get(Def, Part); 3629 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3630 if (Inst && Inst->use_empty()) { 3631 Value *NewI = Inst->getOperand(0); 3632 Inst->eraseFromParent(); 3633 State.reset(Def, NewI, Part); 3634 } 3635 } 3636 } 3637 } 3638 3639 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, 3640 VPlan &Plan) { 3641 // Insert truncates and extends for any truncated instructions as hints to 3642 // InstCombine. 3643 if (VF.isVector()) 3644 truncateToMinimalBitwidths(State); 3645 3646 // Fix widened non-induction PHIs by setting up the PHI operands. 3647 if (EnableVPlanNativePath) 3648 fixNonInductionPHIs(Plan, State); 3649 3650 // At this point every instruction in the original loop is widened to a 3651 // vector form. Now we need to fix the recurrences in the loop. These PHI 3652 // nodes are currently empty because we did not want to introduce cycles. 3653 // This is the second stage of vectorizing recurrences. 3654 fixCrossIterationPHIs(State); 3655 3656 // Forget the original basic block. 3657 PSE.getSE()->forgetLoop(OrigLoop); 3658 3659 VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); 3660 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); 3661 if (Cost->requiresScalarEpilogue(VF)) { 3662 // No edge from the middle block to the unique exit block has been inserted 3663 // and there is nothing to fix from vector loop; phis should have incoming 3664 // from scalar loop only. 3665 Plan.clearLiveOuts(); 3666 } else { 3667 // If we inserted an edge from the middle block to the unique exit block, 3668 // update uses outside the loop (phis) to account for the newly inserted 3669 // edge. 3670 3671 // Fix-up external users of the induction variables. 
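    // For example (illustrative): if the original scalar loop is followed by
    //   use(i);   // an LCSSA phi of the induction variable outside the loop
    // that phi must also receive the value the vector loop computed for i when
    // the scalar remainder is skipped; fixupIVUsers patches this up.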
3672 for (auto &Entry : Legal->getInductionVars()) 3673 fixupIVUsers(Entry.first, Entry.second, 3674 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), 3675 IVEndValues[Entry.first], LoopMiddleBlock, 3676 VectorLoop->getHeader(), Plan); 3677 } 3678 3679 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated 3680 // in the exit block, so update the builder. 3681 State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI()); 3682 for (auto &KV : Plan.getLiveOuts()) 3683 KV.second->fixPhi(Plan, State); 3684 3685 for (Instruction *PI : PredicatedInstructions) 3686 sinkScalarOperands(&*PI); 3687 3688 // Remove redundant induction instructions. 3689 cse(VectorLoop->getHeader()); 3690 3691 // Set/update profile weights for the vector and remainder loops as original 3692 // loop iterations are now distributed among them. Note that original loop 3693 // represented by LoopScalarBody becomes remainder loop after vectorization. 3694 // 3695 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3696 // end up getting slightly roughened result but that should be OK since 3697 // profile is not inherently precise anyway. Note also possible bypass of 3698 // vector code caused by legality checks is ignored, assigning all the weight 3699 // to the vector loop, optimistically. 3700 // 3701 // For scalable vectorization we can't know at compile time how many iterations 3702 // of the loop are handled in one vector iteration, so instead assume a pessimistic 3703 // vscale of '1'. 3704 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop, 3705 LI->getLoopFor(LoopScalarBody), 3706 VF.getKnownMinValue() * UF); 3707 } 3708 3709 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 3710 // In order to support recurrences we need to be able to vectorize Phi nodes. 3711 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3712 // stage #2: We now need to fix the recurrences by adding incoming edges to 3713 // the currently empty PHI nodes. At this point every instruction in the 3714 // original loop is widened to a vector form so we can use them to construct 3715 // the incoming edges. 3716 VPBasicBlock *Header = 3717 State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); 3718 for (VPRecipeBase &R : Header->phis()) { 3719 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 3720 fixReduction(ReductionPhi, State); 3721 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 3722 fixFirstOrderRecurrence(FOR, State); 3723 } 3724 } 3725 3726 void InnerLoopVectorizer::fixFirstOrderRecurrence( 3727 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { 3728 // This is the second phase of vectorizing first-order recurrences. An 3729 // overview of the transformation is described below. Suppose we have the 3730 // following loop. 3731 // 3732 // for (int i = 0; i < n; ++i) 3733 // b[i] = a[i] - a[i - 1]; 3734 // 3735 // There is a first-order recurrence on "a". For this loop, the shorthand 3736 // scalar IR looks like: 3737 // 3738 // scalar.ph: 3739 // s_init = a[-1] 3740 // br scalar.body 3741 // 3742 // scalar.body: 3743 // i = phi [0, scalar.ph], [i+1, scalar.body] 3744 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3745 // s2 = a[i] 3746 // b[i] = s2 - s1 3747 // br cond, scalar.body, ... 3748 // 3749 // In this example, s1 is a recurrence because it's value depends on the 3750 // previous iteration. 
In the first phase of vectorization, we created a 3751 // vector phi v1 for s1. We now complete the vectorization and produce the 3752 // shorthand vector IR shown below (for VF = 4, UF = 1). 3753 // 3754 // vector.ph: 3755 // v_init = vector(..., ..., ..., a[-1]) 3756 // br vector.body 3757 // 3758 // vector.body 3759 // i = phi [0, vector.ph], [i+4, vector.body] 3760 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3761 // v2 = a[i, i+1, i+2, i+3]; 3762 // v3 = vector(v1(3), v2(0, 1, 2)) 3763 // b[i, i+1, i+2, i+3] = v2 - v3 3764 // br cond, vector.body, middle.block 3765 // 3766 // middle.block: 3767 // x = v2(3) 3768 // br scalar.ph 3769 // 3770 // scalar.ph: 3771 // s_init = phi [x, middle.block], [a[-1], otherwise] 3772 // br scalar.body 3773 // 3774 // After execution completes the vector loop, we extract the next value of 3775 // the recurrence (x) to use as the initial value in the scalar loop. 3776 3777 // Extract the last vector element in the middle block. This will be the 3778 // initial value for the recurrence when jumping to the scalar loop. 3779 VPValue *PreviousDef = PhiR->getBackedgeValue(); 3780 Value *Incoming = State.get(PreviousDef, UF - 1); 3781 auto *ExtractForScalar = Incoming; 3782 auto *IdxTy = Builder.getInt32Ty(); 3783 if (VF.isVector()) { 3784 auto *One = ConstantInt::get(IdxTy, 1); 3785 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3786 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 3787 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 3788 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 3789 "vector.recur.extract"); 3790 } 3791 // Extract the second last element in the middle block if the 3792 // Phi is used outside the loop. We need to extract the phi itself 3793 // and not the last element (the phi update in the current iteration). This 3794 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3795 // when the scalar loop is not run at all. 3796 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3797 if (VF.isVector()) { 3798 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 3799 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 3800 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3801 Incoming, Idx, "vector.recur.extract.for.phi"); 3802 } else if (UF > 1) 3803 // When loop is unrolled without vectorizing, initialize 3804 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 3805 // of `Incoming`. This is analogous to the vectorized case above: extracting 3806 // the second last element when VF > 1. 3807 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 3808 3809 // Fix the initial value of the original recurrence in the scalar loop. 3810 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3811 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 3812 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3813 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 3814 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3815 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3816 Start->addIncoming(Incoming, BB); 3817 } 3818 3819 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3820 Phi->setName("scalar.recur"); 3821 3822 // Finally, fix users of the recurrence outside the loop. The users will need 3823 // either the last value of the scalar recurrence or the last value of the 3824 // vector recurrence we extracted in the middle block. 
// Since the loop is in LCSSA form, we just need to find all the phi nodes
  // for the original scalar recurrence in the exit block, and then add an edge
  // for the middle block.
  // Note that LCSSA does not imply single entry when the original scalar loop
  // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from middle to exit and
  // thus no phis that need to be updated.
  if (!Cost->requiresScalarEpilogue(VF))
    for (PHINode &LCSSAPhi : LoopExitBlock->phis())
      if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
        LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
        State.Plan->removeLiveOut(&LCSSAPhi);
      }
}

void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
                                       VPTransformState &State) {
  PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(OrigPhi) &&
         "Unable to find the reduction variable");
  const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

  RecurKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  State.setDebugLocFromInst(ReductionStartValue);

  VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = State.get(LoopExitInstDef, 0)->getType();

  // Wrap flags are in general invalid after vectorization, clear them.
  clearReductionWrapFlags(PhiR, State);

  // Before each round, move the insertion point right between
  // the PHIs and the values we are going to write.
  // This allows us to write both PHINodes and the extractelement
  // instructions.
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

  State.setDebugLocFromInst(LoopExitInst);

  Type *PhiTy = OrigPhi->getType();

  VPBasicBlock *LatchVPBB =
      PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
  BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
  // If the tail is folded by masking, the vector value to leave the loop should
  // be a Select choosing between the vectorized LoopExitInst and the vectorized
  // Phi, instead of the former. For an inloop reduction the reduction will
  // already be predicated, and does not need to be handled here.
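  // Rough sketch of the transformation below (illustrative IR): the loop body
  // computes
  //   %rdx.next = add <4 x i32> %rdx.phi, %val
  //   %sel = select <4 x i1> %mask, <4 x i32> %rdx.next, <4 x i32> %rdx.phi
  // and it is %sel, not %rdx.next, that must feed the phi backedge and be used
  // as the value leaving the loop.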
3876 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 3877 for (unsigned Part = 0; Part < UF; ++Part) { 3878 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 3879 SelectInst *Sel = nullptr; 3880 for (User *U : VecLoopExitInst->users()) { 3881 if (isa<SelectInst>(U)) { 3882 assert(!Sel && "Reduction exit feeding two selects"); 3883 Sel = cast<SelectInst>(U); 3884 } else 3885 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3886 } 3887 assert(Sel && "Reduction exit feeds no select"); 3888 State.reset(LoopExitInstDef, Sel, Part); 3889 3890 if (isa<FPMathOperator>(Sel)) 3891 Sel->setFastMathFlags(RdxDesc.getFastMathFlags()); 3892 3893 // If the target can create a predicated operator for the reduction at no 3894 // extra cost in the loop (for example a predicated vadd), it can be 3895 // cheaper for the select to remain in the loop than be sunk out of it, 3896 // and so use the select value for the phi instead of the old 3897 // LoopExitValue. 3898 if (PreferPredicatedReductionSelect || 3899 TTI->preferPredicatedReductionSelect( 3900 RdxDesc.getOpcode(), PhiTy, 3901 TargetTransformInfo::ReductionFlags())) { 3902 auto *VecRdxPhi = 3903 cast<PHINode>(State.get(PhiR, Part)); 3904 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel); 3905 } 3906 } 3907 } 3908 3909 // If the vector reduction can be performed in a smaller type, we truncate 3910 // then extend the loop exit value to enable InstCombine to evaluate the 3911 // entire expression in the smaller type. 3912 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 3913 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 3914 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3915 Builder.SetInsertPoint(VectorLoopLatch->getTerminator()); 3916 VectorParts RdxParts(UF); 3917 for (unsigned Part = 0; Part < UF; ++Part) { 3918 RdxParts[Part] = State.get(LoopExitInstDef, Part); 3919 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3920 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3921 : Builder.CreateZExt(Trunc, VecTy); 3922 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 3923 if (U != Trunc) { 3924 U->replaceUsesOfWith(RdxParts[Part], Extnd); 3925 RdxParts[Part] = Extnd; 3926 } 3927 } 3928 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3929 for (unsigned Part = 0; Part < UF; ++Part) { 3930 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3931 State.reset(LoopExitInstDef, RdxParts[Part], Part); 3932 } 3933 } 3934 3935 // Reduce all of the unrolled parts into a single vector. 3936 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 3937 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 3938 3939 // The middle block terminator has already been assigned a DebugLoc here (the 3940 // OrigLoop's single latch terminator). We want the whole middle block to 3941 // appear to execute on this line because: (a) it is all compiler generated, 3942 // (b) these instructions are always executed after evaluating the latch 3943 // conditional branch, and (c) other passes may add new predecessors which 3944 // terminate on this line. This is the easiest way to ensure we don't 3945 // accidentally cause an extra step back into the loop while debugging. 3946 State.setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 3947 if (PhiR->isOrdered()) 3948 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 3949 else { 3950 // Floating-point operations should have some FMF to enable the reduction. 
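    // E.g. (illustrative), for UF = 2 and an integer add reduction, the loop
    // below combines the unrolled parts into one vector:
    //   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
    // which is later reduced to a scalar by createTargetReduction().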
    IRBuilderBase::FastMathFlagGuard FMFG(Builder);
    Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
    for (unsigned Part = 1; Part < UF; ++Part) {
      Value *RdxPart = State.get(LoopExitInstDef, Part);
      if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
        ReducedPartRdx = Builder.CreateBinOp(
            (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
      } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
        ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
                                           ReducedPartRdx, RdxPart);
      else
        ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
    }
  }

  // Create the reduction after the loop. Note that inloop reductions create the
  // target reduction in the loop using a Reduction recipe.
  if (VF.isVector() && !PhiR->isInLoop()) {
    ReducedPartRdx =
        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
    // If the reduction can be performed in a smaller type, we need to extend
    // the reduction to the wider type before we branch to the original loop.
    if (PhiTy != RdxDesc.getRecurrenceType())
      ReducedPartRdx = RdxDesc.isSigned()
                           ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                           : Builder.CreateZExt(ReducedPartRdx, PhiTy);
  }

  PHINode *ResumePhi =
      dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());

  // Create a phi node that merges control-flow from the backedge-taken check
  // block and the middle block.
  PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
                                        LoopScalarPreHeader->getTerminator());

  // If we are fixing reductions in the epilogue loop then we should already
  // have created a bc.merge.rdx Phi after the main vector body. Ensure that
  // we carry over the incoming values correctly.
  for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
    if (Incoming == LoopMiddleBlock)
      BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
    else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
      BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
                              Incoming);
    else
      BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
  }

  // Set the resume value for this reduction.
  ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});

  // If there were stores of the reduction value to a uniform memory address
  // inside the loop, create the final store here.
  if (StoreInst *SI = RdxDesc.IntermediateStore) {
    StoreInst *NewSI =
        Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
    propagateMetadata(NewSI, SI);

    // If the reduction value is used in other places,
    // then let the code below create PHIs for that.
  }

  // Now, we need to fix the users of the reduction variable
  // inside and outside of the scalar remainder loop.

  // We know that the loop is in LCSSA form. We need to update the PHI nodes
  // in the exit blocks. See the comment on the analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
4020 if (!Cost->requiresScalarEpilogue(VF)) 4021 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4022 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) { 4023 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4024 State.Plan->removeLiveOut(&LCSSAPhi); 4025 } 4026 4027 // Fix the scalar loop reduction variable with the incoming reduction sum 4028 // from the vector body and from the backedge value. 4029 int IncomingEdgeBlockIdx = 4030 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4031 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4032 // Pick the other block. 4033 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4034 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4035 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4036 } 4037 4038 void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, 4039 VPTransformState &State) { 4040 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 4041 RecurKind RK = RdxDesc.getRecurrenceKind(); 4042 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4043 return; 4044 4045 SmallVector<VPValue *, 8> Worklist; 4046 SmallPtrSet<VPValue *, 8> Visited; 4047 Worklist.push_back(PhiR); 4048 Visited.insert(PhiR); 4049 4050 while (!Worklist.empty()) { 4051 VPValue *Cur = Worklist.pop_back_val(); 4052 for (unsigned Part = 0; Part < UF; ++Part) { 4053 Value *V = State.get(Cur, Part); 4054 if (!isa<OverflowingBinaryOperator>(V)) 4055 break; 4056 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4057 } 4058 4059 for (VPUser *U : Cur->users()) { 4060 auto *UserRecipe = dyn_cast<VPRecipeBase>(U); 4061 if (!UserRecipe) 4062 continue; 4063 for (VPValue *V : UserRecipe->definedValues()) 4064 if (Visited.insert(V).second) 4065 Worklist.push_back(V); 4066 } 4067 } 4068 } 4069 4070 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4071 // The basic block and loop containing the predicated instruction. 4072 auto *PredBB = PredInst->getParent(); 4073 auto *VectorLoop = LI->getLoopFor(PredBB); 4074 4075 // Initialize a worklist with the operands of the predicated instruction. 4076 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4077 4078 // Holds instructions that we need to analyze again. An instruction may be 4079 // reanalyzed if we don't yet know if we can sink it or not. 4080 SmallVector<Instruction *, 8> InstsToReanalyze; 4081 4082 // Returns true if a given use occurs in the predicated block. Phi nodes use 4083 // their operands in their corresponding predecessor blocks. 4084 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4085 auto *I = cast<Instruction>(U.getUser()); 4086 BasicBlock *BB = I->getParent(); 4087 if (auto *Phi = dyn_cast<PHINode>(I)) 4088 BB = Phi->getIncomingBlock( 4089 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4090 return BB == PredBB; 4091 }; 4092 4093 // Iteratively sink the scalarized operands of the predicated instruction 4094 // into the block we created for it. When an instruction is sunk, it's 4095 // operands are then added to the worklist. The algorithm ends after one pass 4096 // through the worklist doesn't sink a single instruction. 4097 bool Changed; 4098 do { 4099 // Add the instructions that need to be reanalyzed to the worklist, and 4100 // reset the changed indicator. 
4101 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4102 InstsToReanalyze.clear(); 4103 Changed = false; 4104 4105 while (!Worklist.empty()) { 4106 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4107 4108 // We can't sink an instruction if it is a phi node, is not in the loop, 4109 // or may have side effects. 4110 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4111 I->mayHaveSideEffects()) 4112 continue; 4113 4114 // If the instruction is already in PredBB, check if we can sink its 4115 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4116 // sinking the scalar instruction I, hence it appears in PredBB; but it 4117 // may have failed to sink I's operands (recursively), which we try 4118 // (again) here. 4119 if (I->getParent() == PredBB) { 4120 Worklist.insert(I->op_begin(), I->op_end()); 4121 continue; 4122 } 4123 4124 // It's legal to sink the instruction if all its uses occur in the 4125 // predicated block. Otherwise, there's nothing to do yet, and we may 4126 // need to reanalyze the instruction. 4127 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4128 InstsToReanalyze.push_back(I); 4129 continue; 4130 } 4131 4132 // Move the instruction to the beginning of the predicated block, and add 4133 // it's operands to the worklist. 4134 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4135 Worklist.insert(I->op_begin(), I->op_end()); 4136 4137 // The sinking may have enabled other instructions to be sunk, so we will 4138 // need to iterate. 4139 Changed = true; 4140 } 4141 } while (Changed); 4142 } 4143 4144 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, 4145 VPTransformState &State) { 4146 auto Iter = depth_first( 4147 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry())); 4148 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 4149 for (VPRecipeBase &P : VPBB->phis()) { 4150 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P); 4151 if (!VPPhi) 4152 continue; 4153 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4154 // Make sure the builder has a valid insert point. 4155 Builder.SetInsertPoint(NewPhi); 4156 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4157 VPValue *Inc = VPPhi->getIncomingValue(i); 4158 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4159 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4160 } 4161 } 4162 } 4163 } 4164 4165 bool InnerLoopVectorizer::useOrderedReductions( 4166 const RecurrenceDescriptor &RdxDesc) { 4167 return Cost->useOrderedReductions(RdxDesc); 4168 } 4169 4170 /// A helper function for checking whether an integer division-related 4171 /// instruction may divide by zero (in which case it must be predicated if 4172 /// executed conditionally in the scalar code). 4173 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4174 /// Non-zero divisors that are non compile-time constants will not be 4175 /// converted into multiplication, so we will still end up scalarizing 4176 /// the division, but can do so w/o predication. 
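/// For illustration (hypothetical source loop): in
///   for (int i = 0; i < n; ++i)
///     if (b[i] != 0)
///       a[i] = x / b[i];
/// the divisor is loop-variant and possibly zero, so the scalarized division
/// must remain predicated, whereas a division by a non-zero constant such as
/// 'x / 7' could be executed unconditionally.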
4177 static bool mayDivideByZero(Instruction &I) { 4178 assert((I.getOpcode() == Instruction::UDiv || 4179 I.getOpcode() == Instruction::SDiv || 4180 I.getOpcode() == Instruction::URem || 4181 I.getOpcode() == Instruction::SRem) && 4182 "Unexpected instruction"); 4183 Value *Divisor = I.getOperand(1); 4184 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4185 return !CInt || CInt->isZero(); 4186 } 4187 4188 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4189 VPUser &ArgOperands, 4190 VPTransformState &State) { 4191 assert(!isa<DbgInfoIntrinsic>(I) && 4192 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4193 State.setDebugLocFromInst(&I); 4194 4195 Module *M = I.getParent()->getParent()->getParent(); 4196 auto *CI = cast<CallInst>(&I); 4197 4198 SmallVector<Type *, 4> Tys; 4199 for (Value *ArgOperand : CI->args()) 4200 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4201 4202 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4203 4204 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4205 // version of the instruction. 4206 // Is it beneficial to perform intrinsic call compared to lib call? 4207 bool NeedToScalarize = false; 4208 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4209 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4210 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4211 assert((UseVectorIntrinsic || !NeedToScalarize) && 4212 "Instruction should be scalarized elsewhere."); 4213 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4214 "Either the intrinsic cost or vector call cost must be valid"); 4215 4216 for (unsigned Part = 0; Part < UF; ++Part) { 4217 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4218 SmallVector<Value *, 4> Args; 4219 for (auto &I : enumerate(ArgOperands.operands())) { 4220 // Some intrinsics have a scalar argument - don't replace it with a 4221 // vector. 4222 Value *Arg; 4223 if (!UseVectorIntrinsic || 4224 !isVectorIntrinsicWithScalarOpAtArg(ID, I.index())) 4225 Arg = State.get(I.value(), Part); 4226 else 4227 Arg = State.get(I.value(), VPIteration(0, 0)); 4228 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index())) 4229 TysForDecl.push_back(Arg->getType()); 4230 Args.push_back(Arg); 4231 } 4232 4233 Function *VectorF; 4234 if (UseVectorIntrinsic) { 4235 // Use vector version of the intrinsic. 4236 if (VF.isVector()) 4237 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4238 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4239 assert(VectorF && "Can't retrieve vector intrinsic."); 4240 } else { 4241 // Use vector version of the function call. 4242 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4243 #ifndef NDEBUG 4244 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4245 "Can't create vector function."); 4246 #endif 4247 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4248 } 4249 SmallVector<OperandBundleDef, 1> OpBundles; 4250 CI->getOperandBundlesAsDefs(OpBundles); 4251 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4252 4253 if (isa<FPMathOperator>(V)) 4254 V->copyFastMathFlags(CI); 4255 4256 State.set(Def, V, Part); 4257 State.addMetadata(V, &I); 4258 } 4259 } 4260 4261 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4262 // We should not collect Scalars more than once per VF. 
Right now, this 4263 // function is called from collectUniformsAndScalars(), which already does 4264 // this check. Collecting Scalars for VF=1 does not make any sense. 4265 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4266 "This function should not be visited twice for the same VF"); 4267 4268 // This avoids any chances of creating a REPLICATE recipe during planning 4269 // since that would result in generation of scalarized code during execution, 4270 // which is not supported for scalable vectors. 4271 if (VF.isScalable()) { 4272 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4273 return; 4274 } 4275 4276 SmallSetVector<Instruction *, 8> Worklist; 4277 4278 // These sets are used to seed the analysis with pointers used by memory 4279 // accesses that will remain scalar. 4280 SmallSetVector<Instruction *, 8> ScalarPtrs; 4281 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4282 auto *Latch = TheLoop->getLoopLatch(); 4283 4284 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4285 // The pointer operands of loads and stores will be scalar as long as the 4286 // memory access is not a gather or scatter operation. The value operand of a 4287 // store will remain scalar if the store is scalarized. 4288 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4289 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4290 assert(WideningDecision != CM_Unknown && 4291 "Widening decision should be ready at this moment"); 4292 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4293 if (Ptr == Store->getValueOperand()) 4294 return WideningDecision == CM_Scalarize; 4295 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4296 "Ptr is neither a value or pointer operand"); 4297 return WideningDecision != CM_GatherScatter; 4298 }; 4299 4300 // A helper that returns true if the given value is a bitcast or 4301 // getelementptr instruction contained in the loop. 4302 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4303 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4304 isa<GetElementPtrInst>(V)) && 4305 !TheLoop->isLoopInvariant(V); 4306 }; 4307 4308 // A helper that evaluates a memory access's use of a pointer. If the use will 4309 // be a scalar use and the pointer is only used by memory accesses, we place 4310 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4311 // PossibleNonScalarPtrs. 4312 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4313 // We only care about bitcast and getelementptr instructions contained in 4314 // the loop. 4315 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4316 return; 4317 4318 // If the pointer has already been identified as scalar (e.g., if it was 4319 // also identified as uniform), there's nothing to do. 4320 auto *I = cast<Instruction>(Ptr); 4321 if (Worklist.count(I)) 4322 return; 4323 4324 // If the use of the pointer will be a scalar use, and all users of the 4325 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4326 // place the pointer in PossibleNonScalarPtrs. 
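    // For illustration (hypothetical IR): given
    //   %gep = getelementptr inbounds i32, i32* %base, i64 %i
    //   store i32 %val, i32* %gep
    // a %gep whose access is not a gather/scatter and whose users are all
    // loads or stores ends up in ScalarPtrs; if it also feeds non-memory
    // users (or a gather/scatter), it goes to PossibleNonScalarPtrs instead.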
4327 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4328 return isa<LoadInst>(U) || isa<StoreInst>(U); 4329 })) 4330 ScalarPtrs.insert(I); 4331 else 4332 PossibleNonScalarPtrs.insert(I); 4333 }; 4334 4335 // We seed the scalars analysis with three classes of instructions: (1) 4336 // instructions marked uniform-after-vectorization and (2) bitcast, 4337 // getelementptr and (pointer) phi instructions used by memory accesses 4338 // requiring a scalar use. 4339 // 4340 // (1) Add to the worklist all instructions that have been identified as 4341 // uniform-after-vectorization. 4342 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4343 4344 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4345 // memory accesses requiring a scalar use. The pointer operands of loads and 4346 // stores will be scalar as long as the memory accesses is not a gather or 4347 // scatter operation. The value operand of a store will remain scalar if the 4348 // store is scalarized. 4349 for (auto *BB : TheLoop->blocks()) 4350 for (auto &I : *BB) { 4351 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4352 evaluatePtrUse(Load, Load->getPointerOperand()); 4353 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4354 evaluatePtrUse(Store, Store->getPointerOperand()); 4355 evaluatePtrUse(Store, Store->getValueOperand()); 4356 } 4357 } 4358 for (auto *I : ScalarPtrs) 4359 if (!PossibleNonScalarPtrs.count(I)) { 4360 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4361 Worklist.insert(I); 4362 } 4363 4364 // Insert the forced scalars. 4365 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector 4366 // induction variable when the PHI user is scalarized. 4367 auto ForcedScalar = ForcedScalars.find(VF); 4368 if (ForcedScalar != ForcedScalars.end()) 4369 for (auto *I : ForcedScalar->second) 4370 Worklist.insert(I); 4371 4372 // Expand the worklist by looking through any bitcasts and getelementptr 4373 // instructions we've already identified as scalar. This is similar to the 4374 // expansion step in collectLoopUniforms(); however, here we're only 4375 // expanding to include additional bitcasts and getelementptr instructions. 4376 unsigned Idx = 0; 4377 while (Idx != Worklist.size()) { 4378 Instruction *Dst = Worklist[Idx++]; 4379 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4380 continue; 4381 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4382 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4383 auto *J = cast<Instruction>(U); 4384 return !TheLoop->contains(J) || Worklist.count(J) || 4385 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4386 isScalarUse(J, Src)); 4387 })) { 4388 Worklist.insert(Src); 4389 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4390 } 4391 } 4392 4393 // An induction variable will remain scalar if all users of the induction 4394 // variable and induction variable update remain scalar. 4395 for (auto &Induction : Legal->getInductionVars()) { 4396 auto *Ind = Induction.first; 4397 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4398 4399 // If tail-folding is applied, the primary induction variable will be used 4400 // to feed a vector compare. 4401 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4402 continue; 4403 4404 // Returns true if \p Indvar is a pointer induction that is used directly by 4405 // load/store instruction \p I. 
4406 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4407 Instruction *I) { 4408 return Induction.second.getKind() == 4409 InductionDescriptor::IK_PtrInduction && 4410 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4411 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4412 }; 4413 4414 // Determine if all users of the induction variable are scalar after 4415 // vectorization. 4416 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4417 auto *I = cast<Instruction>(U); 4418 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4419 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4420 }); 4421 if (!ScalarInd) 4422 continue; 4423 4424 // Determine if all users of the induction variable update instruction are 4425 // scalar after vectorization. 4426 auto ScalarIndUpdate = 4427 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4428 auto *I = cast<Instruction>(U); 4429 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4430 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4431 }); 4432 if (!ScalarIndUpdate) 4433 continue; 4434 4435 // The induction variable and its update instruction will remain scalar. 4436 Worklist.insert(Ind); 4437 Worklist.insert(IndUpdate); 4438 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4439 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4440 << "\n"); 4441 } 4442 4443 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4444 } 4445 4446 bool LoopVectorizationCostModel::isScalarWithPredication( 4447 Instruction *I, ElementCount VF) const { 4448 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4449 return false; 4450 switch(I->getOpcode()) { 4451 default: 4452 break; 4453 case Instruction::Load: 4454 case Instruction::Store: { 4455 if (!Legal->isMaskRequired(I)) 4456 return false; 4457 auto *Ptr = getLoadStorePointerOperand(I); 4458 auto *Ty = getLoadStoreType(I); 4459 Type *VTy = Ty; 4460 if (VF.isVector()) 4461 VTy = VectorType::get(Ty, VF); 4462 const Align Alignment = getLoadStoreAlignment(I); 4463 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4464 TTI.isLegalMaskedGather(VTy, Alignment)) 4465 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4466 TTI.isLegalMaskedScatter(VTy, Alignment)); 4467 } 4468 case Instruction::UDiv: 4469 case Instruction::SDiv: 4470 case Instruction::SRem: 4471 case Instruction::URem: 4472 return mayDivideByZero(*I); 4473 } 4474 return false; 4475 } 4476 4477 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4478 Instruction *I, ElementCount VF) { 4479 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4480 assert(getWideningDecision(I, VF) == CM_Unknown && 4481 "Decision should not be set yet."); 4482 auto *Group = getInterleavedAccessGroup(I); 4483 assert(Group && "Must have a group."); 4484 4485 // If the instruction's allocated size doesn't equal it's type size, it 4486 // requires padding and will be scalarized. 4487 auto &DL = I->getModule()->getDataLayout(); 4488 auto *ScalarTy = getLoadStoreType(I); 4489 if (hasIrregularType(ScalarTy, DL)) 4490 return false; 4491 4492 // If the group involves a non-integral pointer, we may not be able to 4493 // losslessly cast all values to a common type. 
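  // For illustration (made-up member types): a factor-2 group mixing an i64
  // member with a pointer in a non-integral address space is rejected below,
  // because forming a single wide access would require casting between
  // integral and non-integral pointer representations, which is not lossless.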
4494 unsigned InterleaveFactor = Group->getFactor(); 4495 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); 4496 for (unsigned i = 0; i < InterleaveFactor; i++) { 4497 Instruction *Member = Group->getMember(i); 4498 if (!Member) 4499 continue; 4500 auto *MemberTy = getLoadStoreType(Member); 4501 bool MemberNI = DL.isNonIntegralPointerType(MemberTy); 4502 // Don't coerce non-integral pointers to integers or vice versa. 4503 if (MemberNI != ScalarNI) { 4504 // TODO: Consider adding special nullptr value case here 4505 return false; 4506 } else if (MemberNI && ScalarNI && 4507 ScalarTy->getPointerAddressSpace() != 4508 MemberTy->getPointerAddressSpace()) { 4509 return false; 4510 } 4511 } 4512 4513 // Check if masking is required. 4514 // A Group may need masking for one of two reasons: it resides in a block that 4515 // needs predication, or it was decided to use masking to deal with gaps 4516 // (either a gap at the end of a load-access that may result in a speculative 4517 // load, or any gaps in a store-access). 4518 bool PredicatedAccessRequiresMasking = 4519 blockNeedsPredicationForAnyReason(I->getParent()) && 4520 Legal->isMaskRequired(I); 4521 bool LoadAccessWithGapsRequiresEpilogMasking = 4522 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4523 !isScalarEpilogueAllowed(); 4524 bool StoreAccessWithGapsRequiresMasking = 4525 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4526 if (!PredicatedAccessRequiresMasking && 4527 !LoadAccessWithGapsRequiresEpilogMasking && 4528 !StoreAccessWithGapsRequiresMasking) 4529 return true; 4530 4531 // If masked interleaving is required, we expect that the user/target had 4532 // enabled it, because otherwise it either wouldn't have been created or 4533 // it should have been invalidated by the CostModel. 4534 assert(useMaskedInterleavedAccesses(TTI) && 4535 "Masked interleave-groups for predicated accesses are not enabled."); 4536 4537 if (Group->isReverse()) 4538 return false; 4539 4540 auto *Ty = getLoadStoreType(I); 4541 const Align Alignment = getLoadStoreAlignment(I); 4542 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4543 : TTI.isLegalMaskedStore(Ty, Alignment); 4544 } 4545 4546 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4547 Instruction *I, ElementCount VF) { 4548 // Get and ensure we have a valid memory instruction. 4549 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4550 4551 auto *Ptr = getLoadStorePointerOperand(I); 4552 auto *ScalarTy = getLoadStoreType(I); 4553 4554 // In order to be widened, the pointer should be consecutive, first of all. 4555 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4556 return false; 4557 4558 // If the instruction is a store located in a predicated block, it will be 4559 // scalarized. 4560 if (isScalarWithPredication(I, VF)) 4561 return false; 4562 4563 // If the instruction's allocated size doesn't equal it's type size, it 4564 // requires padding and will be scalarized. 4565 auto &DL = I->getModule()->getDataLayout(); 4566 if (hasIrregularType(ScalarTy, DL)) 4567 return false; 4568 4569 return true; 4570 } 4571 4572 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4573 // We should not collect Uniforms more than once per VF. Right now, 4574 // this function is called from collectUniformsAndScalars(), which 4575 // already does this check. Collecting Uniforms for VF=1 does not make any 4576 // sense. 
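  // For illustration: "uniform after vectorization" means only lane 0 of each
  // unrolled part is demanded, e.g. the compare feeding the latch branch or
  // the address of a load from a loop-invariant pointer; it does not mean
  // that every vector lane would compute the same value.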
4577 4578 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4579 "This function should not be visited twice for the same VF"); 4580 4581 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4582 // not analyze again. Uniforms.count(VF) will return 1. 4583 Uniforms[VF].clear(); 4584 4585 // We now know that the loop is vectorizable! 4586 // Collect instructions inside the loop that will remain uniform after 4587 // vectorization. 4588 4589 // Global values, params and instructions outside of current loop are out of 4590 // scope. 4591 auto isOutOfScope = [&](Value *V) -> bool { 4592 Instruction *I = dyn_cast<Instruction>(V); 4593 return (!I || !TheLoop->contains(I)); 4594 }; 4595 4596 // Worklist containing uniform instructions demanding lane 0. 4597 SetVector<Instruction *> Worklist; 4598 BasicBlock *Latch = TheLoop->getLoopLatch(); 4599 4600 // Add uniform instructions demanding lane 0 to the worklist. Instructions 4601 // that are scalar with predication must not be considered uniform after 4602 // vectorization, because that would create an erroneous replicating region 4603 // where only a single instance out of VF should be formed. 4604 // TODO: optimize such seldom cases if found important, see PR40816. 4605 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4606 if (isOutOfScope(I)) { 4607 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 4608 << *I << "\n"); 4609 return; 4610 } 4611 if (isScalarWithPredication(I, VF)) { 4612 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4613 << *I << "\n"); 4614 return; 4615 } 4616 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4617 Worklist.insert(I); 4618 }; 4619 4620 // Start with the conditional branch. If the branch condition is an 4621 // instruction contained in the loop that is only used by the branch, it is 4622 // uniform. 4623 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4624 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4625 addToWorklistIfAllowed(Cmp); 4626 4627 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 4628 InstWidening WideningDecision = getWideningDecision(I, VF); 4629 assert(WideningDecision != CM_Unknown && 4630 "Widening decision should be ready at this moment"); 4631 4632 // A uniform memory op is itself uniform. We exclude uniform stores 4633 // here as they demand the last lane, not the first one. 4634 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 4635 assert(WideningDecision == CM_Scalarize); 4636 return true; 4637 } 4638 4639 return (WideningDecision == CM_Widen || 4640 WideningDecision == CM_Widen_Reverse || 4641 WideningDecision == CM_Interleave); 4642 }; 4643 4644 4645 // Returns true if Ptr is the pointer operand of a memory access instruction 4646 // I, and I is known to not require scalarization. 4647 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4648 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4649 }; 4650 4651 // Holds a list of values which are known to have at least one uniform use. 4652 // Note that there may be other uses which aren't uniform. A "uniform use" 4653 // here is something which only demands lane 0 of the unrolled iterations; 4654 // it does not imply that all lanes produce the same value (e.g. 
this is not 4655 // the usual meaning of uniform) 4656 SetVector<Value *> HasUniformUse; 4657 4658 // Scan the loop for instructions which are either a) known to have only 4659 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 4660 for (auto *BB : TheLoop->blocks()) 4661 for (auto &I : *BB) { 4662 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 4663 switch (II->getIntrinsicID()) { 4664 case Intrinsic::sideeffect: 4665 case Intrinsic::experimental_noalias_scope_decl: 4666 case Intrinsic::assume: 4667 case Intrinsic::lifetime_start: 4668 case Intrinsic::lifetime_end: 4669 if (TheLoop->hasLoopInvariantOperands(&I)) 4670 addToWorklistIfAllowed(&I); 4671 break; 4672 default: 4673 break; 4674 } 4675 } 4676 4677 // ExtractValue instructions must be uniform, because the operands are 4678 // known to be loop-invariant. 4679 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 4680 assert(isOutOfScope(EVI->getAggregateOperand()) && 4681 "Expected aggregate value to be loop invariant"); 4682 addToWorklistIfAllowed(EVI); 4683 continue; 4684 } 4685 4686 // If there's no pointer operand, there's nothing to do. 4687 auto *Ptr = getLoadStorePointerOperand(&I); 4688 if (!Ptr) 4689 continue; 4690 4691 // A uniform memory op is itself uniform. We exclude uniform stores 4692 // here as they demand the last lane, not the first one. 4693 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 4694 addToWorklistIfAllowed(&I); 4695 4696 if (isUniformDecision(&I, VF)) { 4697 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 4698 HasUniformUse.insert(Ptr); 4699 } 4700 } 4701 4702 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 4703 // demanding) users. Since loops are assumed to be in LCSSA form, this 4704 // disallows uses outside the loop as well. 4705 for (auto *V : HasUniformUse) { 4706 if (isOutOfScope(V)) 4707 continue; 4708 auto *I = cast<Instruction>(V); 4709 auto UsersAreMemAccesses = 4710 llvm::all_of(I->users(), [&](User *U) -> bool { 4711 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 4712 }); 4713 if (UsersAreMemAccesses) 4714 addToWorklistIfAllowed(I); 4715 } 4716 4717 // Expand Worklist in topological order: whenever a new instruction 4718 // is added , its users should be already inside Worklist. It ensures 4719 // a uniform instruction will only be used by uniform instructions. 4720 unsigned idx = 0; 4721 while (idx != Worklist.size()) { 4722 Instruction *I = Worklist[idx++]; 4723 4724 for (auto OV : I->operand_values()) { 4725 // isOutOfScope operands cannot be uniform instructions. 4726 if (isOutOfScope(OV)) 4727 continue; 4728 // First order recurrence Phi's should typically be considered 4729 // non-uniform. 4730 auto *OP = dyn_cast<PHINode>(OV); 4731 if (OP && Legal->isFirstOrderRecurrence(OP)) 4732 continue; 4733 // If all the users of the operand are uniform, then add the 4734 // operand into the uniform worklist. 4735 auto *OI = cast<Instruction>(OV); 4736 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4737 auto *J = cast<Instruction>(U); 4738 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 4739 })) 4740 addToWorklistIfAllowed(OI); 4741 } 4742 } 4743 4744 // For an instruction to be added into Worklist above, all its users inside 4745 // the loop should also be in Worklist. However, this condition cannot be 4746 // true for phi nodes that form a cyclic dependence. We must process phi 4747 // nodes separately. 
An induction variable will remain uniform if all users 4748 // of the induction variable and induction variable update remain uniform. 4749 // The code below handles both pointer and non-pointer induction variables. 4750 for (auto &Induction : Legal->getInductionVars()) { 4751 auto *Ind = Induction.first; 4752 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4753 4754 // Determine if all users of the induction variable are uniform after 4755 // vectorization. 4756 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4757 auto *I = cast<Instruction>(U); 4758 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4759 isVectorizedMemAccessUse(I, Ind); 4760 }); 4761 if (!UniformInd) 4762 continue; 4763 4764 // Determine if all users of the induction variable update instruction are 4765 // uniform after vectorization. 4766 auto UniformIndUpdate = 4767 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4768 auto *I = cast<Instruction>(U); 4769 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4770 isVectorizedMemAccessUse(I, IndUpdate); 4771 }); 4772 if (!UniformIndUpdate) 4773 continue; 4774 4775 // The induction variable and its update instruction will remain uniform. 4776 addToWorklistIfAllowed(Ind); 4777 addToWorklistIfAllowed(IndUpdate); 4778 } 4779 4780 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4781 } 4782 4783 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4784 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4785 4786 if (Legal->getRuntimePointerChecking()->Need) { 4787 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4788 "runtime pointer checks needed. Enable vectorization of this " 4789 "loop with '#pragma clang loop vectorize(enable)' when " 4790 "compiling with -Os/-Oz", 4791 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4792 return true; 4793 } 4794 4795 if (!PSE.getPredicate().isAlwaysTrue()) { 4796 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4797 "runtime SCEV checks needed. Enable vectorization of this " 4798 "loop with '#pragma clang loop vectorize(enable)' when " 4799 "compiling with -Os/-Oz", 4800 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4801 return true; 4802 } 4803 4804 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4805 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4806 reportVectorizationFailure("Runtime stride check for small trip count", 4807 "runtime stride == 1 checks needed. Enable vectorization of " 4808 "this loop without such check by compiling with -Os/-Oz", 4809 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4810 return true; 4811 } 4812 4813 return false; 4814 } 4815 4816 ElementCount 4817 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 4818 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 4819 return ElementCount::getScalable(0); 4820 4821 if (Hints->isScalableVectorizationDisabled()) { 4822 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 4823 "ScalableVectorizationDisabled", ORE, TheLoop); 4824 return ElementCount::getScalable(0); 4825 } 4826 4827 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 4828 4829 auto MaxScalableVF = ElementCount::getScalable( 4830 std::numeric_limits<ElementCount::ScalarTy>::max()); 4831 4832 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 
4833 // FIXME: While for scalable vectors this is currently sufficient, this should 4834 // be replaced by a more detailed mechanism that filters out specific VFs, 4835 // instead of invalidating vectorization for a whole set of VFs based on the 4836 // MaxVF. 4837 4838 // Disable scalable vectorization if the loop contains unsupported reductions. 4839 if (!canVectorizeReductions(MaxScalableVF)) { 4840 reportVectorizationInfo( 4841 "Scalable vectorization not supported for the reduction " 4842 "operations found in this loop.", 4843 "ScalableVFUnfeasible", ORE, TheLoop); 4844 return ElementCount::getScalable(0); 4845 } 4846 4847 // Disable scalable vectorization if the loop contains any instructions 4848 // with element types not supported for scalable vectors. 4849 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 4850 return !Ty->isVoidTy() && 4851 !this->TTI.isElementTypeLegalForScalableVector(Ty); 4852 })) { 4853 reportVectorizationInfo("Scalable vectorization is not supported " 4854 "for all element types found in this loop.", 4855 "ScalableVFUnfeasible", ORE, TheLoop); 4856 return ElementCount::getScalable(0); 4857 } 4858 4859 if (Legal->isSafeForAnyVectorWidth()) 4860 return MaxScalableVF; 4861 4862 // Limit MaxScalableVF by the maximum safe dependence distance. 4863 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 4864 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 4865 MaxVScale = 4866 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 4867 MaxScalableVF = ElementCount::getScalable( 4868 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 4869 if (!MaxScalableVF) 4870 reportVectorizationInfo( 4871 "Max legal vector width too small, scalable vectorization " 4872 "unfeasible.", 4873 "ScalableVFUnfeasible", ORE, TheLoop); 4874 4875 return MaxScalableVF; 4876 } 4877 4878 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 4879 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 4880 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4881 unsigned SmallestType, WidestType; 4882 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4883 4884 // Get the maximum safe dependence distance in bits computed by LAA. 4885 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4886 // the memory accesses that is most restrictive (involved in the smallest 4887 // dependence distance). 4888 unsigned MaxSafeElements = 4889 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 4890 4891 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 4892 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 4893 4894 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 4895 << ".\n"); 4896 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 4897 << ".\n"); 4898 4899 // First analyze the UserVF, fall back if the UserVF should be ignored. 4900 if (UserVF) { 4901 auto MaxSafeUserVF = 4902 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 4903 4904 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 4905 // If `VF=vscale x N` is safe, then so is `VF=N` 4906 if (UserVF.isScalable()) 4907 return FixedScalableVFPair( 4908 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 4909 else 4910 return UserVF; 4911 } 4912 4913 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 4914 4915 // Only clamp if the UserVF is not scalable. 
If the UserVF is scalable, it 4916 // is better to ignore the hint and let the compiler choose a suitable VF. 4917 if (!UserVF.isScalable()) { 4918 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4919 << " is unsafe, clamping to max safe VF=" 4920 << MaxSafeFixedVF << ".\n"); 4921 ORE->emit([&]() { 4922 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4923 TheLoop->getStartLoc(), 4924 TheLoop->getHeader()) 4925 << "User-specified vectorization factor " 4926 << ore::NV("UserVectorizationFactor", UserVF) 4927 << " is unsafe, clamping to maximum safe vectorization factor " 4928 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 4929 }); 4930 return MaxSafeFixedVF; 4931 } 4932 4933 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 4934 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4935 << " is ignored because scalable vectors are not " 4936 "available.\n"); 4937 ORE->emit([&]() { 4938 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4939 TheLoop->getStartLoc(), 4940 TheLoop->getHeader()) 4941 << "User-specified vectorization factor " 4942 << ore::NV("UserVectorizationFactor", UserVF) 4943 << " is ignored because the target does not support scalable " 4944 "vectors. The compiler will pick a more suitable value."; 4945 }); 4946 } else { 4947 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4948 << " is unsafe. Ignoring scalable UserVF.\n"); 4949 ORE->emit([&]() { 4950 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4951 TheLoop->getStartLoc(), 4952 TheLoop->getHeader()) 4953 << "User-specified vectorization factor " 4954 << ore::NV("UserVectorizationFactor", UserVF) 4955 << " is unsafe. Ignoring the hint to let the compiler pick a " 4956 "more suitable value."; 4957 }); 4958 } 4959 } 4960 4961 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 4962 << " / " << WidestType << " bits.\n"); 4963 4964 FixedScalableVFPair Result(ElementCount::getFixed(1), 4965 ElementCount::getScalable(0)); 4966 if (auto MaxVF = 4967 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 4968 MaxSafeFixedVF, FoldTailByMasking)) 4969 Result.FixedVF = MaxVF; 4970 4971 if (auto MaxVF = 4972 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 4973 MaxSafeScalableVF, FoldTailByMasking)) 4974 if (MaxVF.isScalable()) { 4975 Result.ScalableVF = MaxVF; 4976 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 4977 << "\n"); 4978 } 4979 4980 return Result; 4981 } 4982 4983 FixedScalableVFPair 4984 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 4985 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4986 // TODO: It may by useful to do since it's still likely to be dynamically 4987 // uniform if the target can skip. 4988 reportVectorizationFailure( 4989 "Not inserting runtime ptr check for divergent target", 4990 "runtime pointer checks needed. 
Not enabled for divergent target",
4991         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4992     return FixedScalableVFPair::getNone();
4993   }
4994 
4995   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4996   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4997   if (TC == 1) {
4998     reportVectorizationFailure("Single iteration (non) loop",
4999         "loop trip count is one, irrelevant for vectorization",
5000         "SingleIterationLoop", ORE, TheLoop);
5001     return FixedScalableVFPair::getNone();
5002   }
5003 
5004   switch (ScalarEpilogueStatus) {
5005   case CM_ScalarEpilogueAllowed:
5006     return computeFeasibleMaxVF(TC, UserVF, false);
5007   case CM_ScalarEpilogueNotAllowedUsePredicate:
5008     LLVM_FALLTHROUGH;
5009   case CM_ScalarEpilogueNotNeededUsePredicate:
5010     LLVM_DEBUG(
5011         dbgs() << "LV: vector predicate hint/switch found.\n"
5012                << "LV: Not allowing scalar epilogue, creating predicated "
5013                << "vector loop.\n");
5014     break;
5015   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5016     // fallthrough as a special case of OptForSize
5017   case CM_ScalarEpilogueNotAllowedOptSize:
5018     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5019       LLVM_DEBUG(
5020           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5021     else
5022       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5023                         << "count.\n");
5024 
5025     // Bail if runtime checks are required, which are not good when optimising
5026     // for size.
5027     if (runtimeChecksRequired())
5028       return FixedScalableVFPair::getNone();
5029 
5030     break;
5031   }
5032 
5033   // The only loops we can vectorize without a scalar epilogue are loops with
5034   // a bottom-test and a single exiting block. We'd have to handle the fact
5035   // that not every instruction executes on the last iteration. This will
5036   // require a lane mask which varies through the vector loop body. (TODO)
5037   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5038     // If there was a tail-folding hint/switch, but we can't fold the tail by
5039     // masking, fall back to a vectorization with a scalar epilogue.
5040     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5041       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5042                            "scalar epilogue instead.\n");
5043       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5044       return computeFeasibleMaxVF(TC, UserVF, false);
5045     }
5046     return FixedScalableVFPair::getNone();
5047   }
5048 
5049   // Now try the tail folding.
5050 
5051   // Invalidate interleave groups that require an epilogue if we can't mask
5052   // the interleave-group.
5053   if (!useMaskedInterleavedAccesses(TTI)) {
5054     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5055            "No decisions should have been taken at this point");
5056     // Note: There is no need to invalidate any cost modeling decisions here, as
5057     // none were taken so far.
5058     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5059   }
5060 
5061   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5062   // Avoid tail folding if the trip count is known to be a multiple of any VF
5063   // we chose.
5064   // FIXME: The condition below pessimises the case for fixed-width vectors,
5065   // when scalable VFs are also candidates for vectorization.
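  // For illustration (made-up numbers): with a known trip count of 1024, a
  // maximum fixed VF of 8 and a user interleave count of 2, the code below
  // checks whether 1024 % (8 * 2) == 0; since it is, no tail remains for any
  // VF the vectorizer may later pick and tail folding can be skipped.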
5066 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5067 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5068 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5069 "MaxFixedVF must be a power of 2"); 5070 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5071 : MaxFixedVF.getFixedValue(); 5072 ScalarEvolution *SE = PSE.getSE(); 5073 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5074 const SCEV *ExitCount = SE->getAddExpr( 5075 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5076 const SCEV *Rem = SE->getURemExpr( 5077 SE->applyLoopGuards(ExitCount, TheLoop), 5078 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5079 if (Rem->isZero()) { 5080 // Accept MaxFixedVF if we do not have a tail. 5081 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5082 return MaxFactors; 5083 } 5084 } 5085 5086 // If we don't know the precise trip count, or if the trip count that we 5087 // found modulo the vectorization factor is not zero, try to fold the tail 5088 // by masking. 5089 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5090 if (Legal->prepareToFoldTailByMasking()) { 5091 FoldTailByMasking = true; 5092 return MaxFactors; 5093 } 5094 5095 // If there was a tail-folding hint/switch, but we can't fold the tail by 5096 // masking, fallback to a vectorization with a scalar epilogue. 5097 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5098 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5099 "scalar epilogue instead.\n"); 5100 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5101 return MaxFactors; 5102 } 5103 5104 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5105 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5106 return FixedScalableVFPair::getNone(); 5107 } 5108 5109 if (TC == 0) { 5110 reportVectorizationFailure( 5111 "Unable to calculate the loop count due to complex control flow", 5112 "unable to calculate the loop count due to complex control flow", 5113 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5114 return FixedScalableVFPair::getNone(); 5115 } 5116 5117 reportVectorizationFailure( 5118 "Cannot optimize for size and vectorize at the same time.", 5119 "cannot optimize for size and vectorize at the same time. " 5120 "Enable vectorization of this loop with '#pragma clang loop " 5121 "vectorize(enable)' when compiling with -Os/-Oz", 5122 "NoTailLoopWithOptForSize", ORE, TheLoop); 5123 return FixedScalableVFPair::getNone(); 5124 } 5125 5126 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5127 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5128 ElementCount MaxSafeVF, bool FoldTailByMasking) { 5129 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5130 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5131 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5132 : TargetTransformInfo::RGK_FixedWidthVector); 5133 5134 // Convenience function to return the minimum of two ElementCounts. 5135 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5136 assert((LHS.isScalable() == RHS.isScalable()) && 5137 "Scalable flags must match"); 5138 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5139 }; 5140 5141 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5142 // Note that both WidestRegister and WidestType may not be a powers of 2. 
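  // For illustration (made-up numbers): with 128-bit vector registers and a
  // widest loop type of 32 bits this yields PowerOf2Floor(128 / 32) = 4
  // lanes; with a widest type of 24 bits it yields PowerOf2Floor(5) = 4 as
  // well, which is why the explicit PowerOf2Floor is needed here.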
5143 auto MaxVectorElementCount = ElementCount::get( 5144 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5145 ComputeScalableMaxVF); 5146 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5147 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5148 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5149 5150 if (!MaxVectorElementCount) { 5151 LLVM_DEBUG(dbgs() << "LV: The target has no " 5152 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5153 << " vector registers.\n"); 5154 return ElementCount::getFixed(1); 5155 } 5156 5157 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5158 if (ConstTripCount && 5159 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5160 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5161 // If loop trip count (TC) is known at compile time there is no point in 5162 // choosing VF greater than TC (as done in the loop below). Select maximum 5163 // power of two which doesn't exceed TC. 5164 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5165 // when the TC is less than or equal to the known number of lanes. 5166 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5167 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5168 "exceeding the constant trip count: " 5169 << ClampedConstTripCount << "\n"); 5170 return ElementCount::getFixed(ClampedConstTripCount); 5171 } 5172 5173 TargetTransformInfo::RegisterKind RegKind = 5174 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5175 : TargetTransformInfo::RGK_FixedWidthVector; 5176 ElementCount MaxVF = MaxVectorElementCount; 5177 if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && 5178 TTI.shouldMaximizeVectorBandwidth(RegKind))) { 5179 auto MaxVectorElementCountMaxBW = ElementCount::get( 5180 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5181 ComputeScalableMaxVF); 5182 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5183 5184 // Collect all viable vectorization factors larger than the default MaxVF 5185 // (i.e. MaxVectorElementCount). 5186 SmallVector<ElementCount, 8> VFs; 5187 for (ElementCount VS = MaxVectorElementCount * 2; 5188 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5189 VFs.push_back(VS); 5190 5191 // For each VF calculate its register usage. 5192 auto RUs = calculateRegisterUsage(VFs); 5193 5194 // Select the largest VF which doesn't require more registers than existing 5195 // ones. 5196 for (int i = RUs.size() - 1; i >= 0; --i) { 5197 bool Selected = true; 5198 for (auto &pair : RUs[i].MaxLocalUsers) { 5199 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5200 if (pair.second > TargetNumRegisters) 5201 Selected = false; 5202 } 5203 if (Selected) { 5204 MaxVF = VFs[i]; 5205 break; 5206 } 5207 } 5208 if (ElementCount MinVF = 5209 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5210 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5211 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5212 << ") with target's minimum: " << MinVF << '\n'); 5213 MaxVF = MinVF; 5214 } 5215 } 5216 5217 // Invalidate any widening decisions we might have made, in case the loop 5218 // requires prediction (decided later), but we have already made some 5219 // load/store widening decisions. 
5220 invalidateCostModelingDecisions(); 5221 } 5222 return MaxVF; 5223 } 5224 5225 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { 5226 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5227 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5228 auto Min = Attr.getVScaleRangeMin(); 5229 auto Max = Attr.getVScaleRangeMax(); 5230 if (Max && Min == Max) 5231 return Max; 5232 } 5233 5234 return TTI.getVScaleForTuning(); 5235 } 5236 5237 bool LoopVectorizationCostModel::isMoreProfitable( 5238 const VectorizationFactor &A, const VectorizationFactor &B) const { 5239 InstructionCost CostA = A.Cost; 5240 InstructionCost CostB = B.Cost; 5241 5242 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5243 5244 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5245 MaxTripCount) { 5246 // If we are folding the tail and the trip count is a known (possibly small) 5247 // constant, the trip count will be rounded up to an integer number of 5248 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5249 // which we compare directly. When not folding the tail, the total cost will 5250 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5251 // approximated with the per-lane cost below instead of using the tripcount 5252 // as here. 5253 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5254 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5255 return RTCostA < RTCostB; 5256 } 5257 5258 // Improve estimate for the vector width if it is scalable. 5259 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5260 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5261 if (Optional<unsigned> VScale = getVScaleForTuning()) { 5262 if (A.Width.isScalable()) 5263 EstimatedWidthA *= VScale.getValue(); 5264 if (B.Width.isScalable()) 5265 EstimatedWidthB *= VScale.getValue(); 5266 } 5267 5268 // Assume vscale may be larger than 1 (or the value being tuned for), 5269 // so that scalable vectorization is slightly favorable over fixed-width 5270 // vectorization. 5271 if (A.Width.isScalable() && !B.Width.isScalable()) 5272 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5273 5274 // To avoid the need for FP division: 5275 // (CostA / A.Width) < (CostB / B.Width) 5276 // <=> (CostA * B.Width) < (CostB * A.Width) 5277 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5278 } 5279 5280 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5281 const ElementCountSet &VFCandidates) { 5282 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5283 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5284 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5285 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5286 "Expected Scalar VF to be a candidate"); 5287 5288 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, 5289 ExpectedCost); 5290 VectorizationFactor ChosenFactor = ScalarCost; 5291 5292 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5293 if (ForceVectorization && VFCandidates.size() > 1) { 5294 // Ignore scalar width, because the user explicitly wants vectorization. 5295 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5296 // evaluation. 
5297     ChosenFactor.Cost = InstructionCost::getMax();
5298   }
5299 
5300   SmallVector<InstructionVFPair> InvalidCosts;
5301   for (const auto &i : VFCandidates) {
5302     // The cost for scalar VF=1 is already calculated, so ignore it.
5303     if (i.isScalar())
5304       continue;
5305 
5306     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5307     VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5308 
5309 #ifndef NDEBUG
5310     unsigned AssumedMinimumVscale = 1;
5311     if (Optional<unsigned> VScale = getVScaleForTuning())
5312       AssumedMinimumVscale = *VScale;
5313     unsigned Width =
5314         Candidate.Width.isScalable()
5315             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5316             : Candidate.Width.getFixedValue();
5317     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5318                       << " costs: " << (Candidate.Cost / Width));
5319     if (i.isScalable())
5320       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5321                         << AssumedMinimumVscale << ")");
5322     LLVM_DEBUG(dbgs() << ".\n");
5323 #endif
5324 
5325     if (!C.second && !ForceVectorization) {
5326       LLVM_DEBUG(
5327           dbgs() << "LV: Not considering vector loop of width " << i
5328                  << " because it will not generate any vector instructions.\n");
5329       continue;
5330     }
5331 
5332     // If profitable, add it to the ProfitableVFs list.
5333     if (isMoreProfitable(Candidate, ScalarCost))
5334       ProfitableVFs.push_back(Candidate);
5335 
5336     if (isMoreProfitable(Candidate, ChosenFactor))
5337       ChosenFactor = Candidate;
5338   }
5339 
5340   // Emit a report of VFs with invalid costs in the loop.
5341   if (!InvalidCosts.empty()) {
5342     // Group the remarks per instruction, keeping the instruction order from
5343     // InvalidCosts.
5344     std::map<Instruction *, unsigned> Numbering;
5345     unsigned I = 0;
5346     for (auto &Pair : InvalidCosts)
5347       if (!Numbering.count(Pair.first))
5348         Numbering[Pair.first] = I++;
5349 
5350     // Sort the list, first on instruction(number) then on VF.
5351     llvm::sort(InvalidCosts,
5352                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5353                  if (Numbering[A.first] != Numbering[B.first])
5354                    return Numbering[A.first] < Numbering[B.first];
5355                  ElementCountComparator ECC;
5356                  return ECC(A.second, B.second);
5357                });
5358 
5359     // For a list of ordered instruction-vf pairs:
5360     //   [(load, vf1), (load, vf2), (store, vf1)]
5361     // Group the instructions together to emit separate remarks for:
5362     //   load (vf1, vf2)
5363     //   store (vf1)
5364     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5365     auto Subset = ArrayRef<InstructionVFPair>();
5366     do {
5367       if (Subset.empty())
5368         Subset = Tail.take_front(1);
5369 
5370       Instruction *I = Subset.front().first;
5371 
5372       // If the next instruction is different, or if there are no other pairs,
5373       // emit a remark for the collated subset. e.g.
5374       //   [(load, vf1), (load, vf2)]
5375       // to emit:
5376       //   remark: invalid costs for 'load' at VF=(vf1, vf2)
5377       if (Subset == Tail || Tail[Subset.size()].first != I) {
5378         std::string OutString;
5379         raw_string_ostream OS(OutString);
5380         assert(!Subset.empty() && "Unexpected empty range");
5381         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5382         for (auto &Pair : Subset)
5383           OS << (Pair.second == Subset.front().second ? "" : ", ")
5384              << Pair.second;
5385         OS << "):";
5386         if (auto *CI = dyn_cast<CallInst>(I))
5387           OS << " call to " << CI->getCalledFunction()->getName();
5388         else
5389           OS << " " << I->getOpcodeName();
5390         OS.flush();
5391         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5392         Tail = Tail.drop_front(Subset.size());
5393         Subset = {};
5394       } else
5395         // Grow the subset by one element
5396         Subset = Tail.take_front(Subset.size() + 1);
5397     } while (!Tail.empty());
5398   }
5399 
5400   if (!EnableCondStoresVectorization && NumPredStores) {
5401     reportVectorizationFailure("There are conditional stores.",
5402         "store that is conditionally executed prevents vectorization",
5403         "ConditionalStore", ORE, TheLoop);
5404     ChosenFactor = ScalarCost;
5405   }
5406 
5407   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5408                  ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
5409              << "LV: Vectorization seems to be not beneficial, "
5410              << "but was forced by a user.\n");
5411   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5412   return ChosenFactor;
5413 }
5414 
5415 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5416     const Loop &L, ElementCount VF) const {
5417   // Cross iteration phis such as reductions need special handling and are
5418   // currently unsupported.
5419   if (any_of(L.getHeader()->phis(),
5420              [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
5421     return false;
5422 
5423   // Phis with uses outside of the loop require special handling and are
5424   // currently unsupported.
5425   for (auto &Entry : Legal->getInductionVars()) {
5426     // Look for uses of the value of the induction at the last iteration.
5427     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5428     for (User *U : PostInc->users())
5429       if (!L.contains(cast<Instruction>(U)))
5430         return false;
5431     // Look for uses of the penultimate value of the induction.
5432     for (User *U : Entry.first->users())
5433       if (!L.contains(cast<Instruction>(U)))
5434         return false;
5435   }
5436 
5437   // Induction variables that are widened require special handling that is
5438   // currently not supported.
5439   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5440         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5441                  this->isProfitableToScalarize(Entry.first, VF));
5442       }))
5443     return false;
5444 
5445   // Epilogue vectorization code has not been audited to ensure it handles
5446   // non-latch exits properly. It may be fine, but it needs to be audited and
5447   // tested.
5448   if (L.getExitingBlock() != L.getLoopLatch())
5449     return false;
5450 
5451   return true;
5452 }
5453 
5454 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5455     const ElementCount VF) const {
5456   // FIXME: We need a much better cost-model to take different parameters such
5457   // as register pressure, code size increase and cost of extra branches into
5458   // account. For now we apply a very crude heuristic and only consider loops
5459   // with vectorization factors larger than a certain value.
5460   // We also consider epilogue vectorization unprofitable for targets that don't
5461   // consider interleaving beneficial (e.g. MVE).
5462   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5463     return false;
5464   // FIXME: We should consider changing the threshold for scalable
5465   // vectors to take VScaleForTuning into account.
5466 if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF) 5467 return true; 5468 return false; 5469 } 5470 5471 VectorizationFactor 5472 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5473 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5474 VectorizationFactor Result = VectorizationFactor::Disabled(); 5475 if (!EnableEpilogueVectorization) { 5476 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5477 return Result; 5478 } 5479 5480 if (!isScalarEpilogueAllowed()) { 5481 LLVM_DEBUG( 5482 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5483 "allowed.\n";); 5484 return Result; 5485 } 5486 5487 // Not really a cost consideration, but check for unsupported cases here to 5488 // simplify the logic. 5489 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5490 LLVM_DEBUG( 5491 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5492 "not a supported candidate.\n";); 5493 return Result; 5494 } 5495 5496 if (EpilogueVectorizationForceVF > 1) { 5497 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5498 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5499 if (LVP.hasPlanWithVF(ForcedEC)) 5500 return {ForcedEC, 0, 0}; 5501 else { 5502 LLVM_DEBUG( 5503 dbgs() 5504 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5505 return Result; 5506 } 5507 } 5508 5509 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5510 TheLoop->getHeader()->getParent()->hasMinSize()) { 5511 LLVM_DEBUG( 5512 dbgs() 5513 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5514 return Result; 5515 } 5516 5517 if (!isEpilogueVectorizationProfitable(MainLoopVF)) { 5518 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5519 "this loop\n"); 5520 return Result; 5521 } 5522 5523 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 5524 // the main loop handles 8 lanes per iteration. We could still benefit from 5525 // vectorizing the epilogue loop with VF=4. 5526 ElementCount EstimatedRuntimeVF = MainLoopVF; 5527 if (MainLoopVF.isScalable()) { 5528 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5529 if (Optional<unsigned> VScale = getVScaleForTuning()) 5530 EstimatedRuntimeVF *= *VScale; 5531 } 5532 5533 for (auto &NextVF : ProfitableVFs) 5534 if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5535 ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || 5536 ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && 5537 (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && 5538 LVP.hasPlanWithVF(NextVF.Width)) 5539 Result = NextVF; 5540 5541 if (Result != VectorizationFactor::Disabled()) 5542 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5543 << Result.Width << "\n";); 5544 return Result; 5545 } 5546 5547 std::pair<unsigned, unsigned> 5548 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5549 unsigned MinWidth = -1U; 5550 unsigned MaxWidth = 8; 5551 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5552 // For in-loop reductions, no element types are added to ElementTypesInLoop 5553 // if there are no loads/stores in the loop. In this case, check through the 5554 // reduction variables to determine the maximum width. 5555 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5556 // Reset MaxWidth so that we can find the smallest type used by recurrences 5557 // in the loop. 
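// Sketch of the intent (types illustrative): for a loop whose only live
// values form an i32 add reduction fed by i16 inputs that are cast up, the
// minimum of the cast width (16) and the recurrence width (32) yields
// MaxWidth = 16, which the caller then treats as the widest element type in
// use.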
5558 MaxWidth = -1U; 5559 for (auto &PhiDescriptorPair : Legal->getReductionVars()) { 5560 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5561 // When finding the min width used by the recurrence we need to account 5562 // for casts on the input operands of the recurrence. 5563 MaxWidth = std::min<unsigned>( 5564 MaxWidth, std::min<unsigned>( 5565 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5566 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5567 } 5568 } else { 5569 for (Type *T : ElementTypesInLoop) { 5570 MinWidth = std::min<unsigned>( 5571 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5572 MaxWidth = std::max<unsigned>( 5573 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5574 } 5575 } 5576 return {MinWidth, MaxWidth}; 5577 } 5578 5579 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5580 ElementTypesInLoop.clear(); 5581 // For each block. 5582 for (BasicBlock *BB : TheLoop->blocks()) { 5583 // For each instruction in the loop. 5584 for (Instruction &I : BB->instructionsWithoutDebug()) { 5585 Type *T = I.getType(); 5586 5587 // Skip ignored values. 5588 if (ValuesToIgnore.count(&I)) 5589 continue; 5590 5591 // Only examine Loads, Stores and PHINodes. 5592 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5593 continue; 5594 5595 // Examine PHI nodes that are reduction variables. Update the type to 5596 // account for the recurrence type. 5597 if (auto *PN = dyn_cast<PHINode>(&I)) { 5598 if (!Legal->isReductionVariable(PN)) 5599 continue; 5600 const RecurrenceDescriptor &RdxDesc = 5601 Legal->getReductionVars().find(PN)->second; 5602 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5603 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5604 RdxDesc.getRecurrenceType(), 5605 TargetTransformInfo::ReductionFlags())) 5606 continue; 5607 T = RdxDesc.getRecurrenceType(); 5608 } 5609 5610 // Examine the stored values. 5611 if (auto *ST = dyn_cast<StoreInst>(&I)) 5612 T = ST->getValueOperand()->getType(); 5613 5614 assert(T->isSized() && 5615 "Expected the load/store/recurrence type to be sized"); 5616 5617 ElementTypesInLoop.insert(T); 5618 } 5619 } 5620 } 5621 5622 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5623 unsigned LoopCost) { 5624 // -- The interleave heuristics -- 5625 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5626 // There are many micro-architectural considerations that we can't predict 5627 // at this level. For example, frontend pressure (on decode or fetch) due to 5628 // code size, or the number and capabilities of the execution ports. 5629 // 5630 // We use the following heuristics to select the interleave count: 5631 // 1. If the code has reductions, then we interleave to break the cross 5632 // iteration dependency. 5633 // 2. If the loop is really small, then we interleave to reduce the loop 5634 // overhead. 5635 // 3. We don't interleave if we think that we will spill registers to memory 5636 // due to the increased register pressure. 5637 5638 if (!isScalarEpilogueAllowed()) 5639 return 1; 5640 5641 // We used the distance for the interleave count. 5642 if (Legal->getMaxSafeDepDistBytes() != -1U) 5643 return 1; 5644 5645 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5646 const bool HasReductions = !Legal->getReductionVars().empty(); 5647 // Do not interleave loops with a relatively small known or estimated trip 5648 // count. 
But we will interleave when InterleaveSmallLoopScalarReduction is 5649 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5650 // because with the above conditions interleaving can expose ILP and break 5651 // cross iteration dependences for reductions. 5652 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5653 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5654 return 1; 5655 5656 // If we did not calculate the cost for VF (because the user selected the VF) 5657 // then we calculate the cost of VF here. 5658 if (LoopCost == 0) { 5659 InstructionCost C = expectedCost(VF).first; 5660 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 5661 LoopCost = *C.getValue(); 5662 5663 // Loop body is free and there is no need for interleaving. 5664 if (LoopCost == 0) 5665 return 1; 5666 } 5667 5668 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5669 // We divide by these constants so assume that we have at least one 5670 // instruction that uses at least one register. 5671 for (auto& pair : R.MaxLocalUsers) { 5672 pair.second = std::max(pair.second, 1U); 5673 } 5674 5675 // We calculate the interleave count using the following formula. 5676 // Subtract the number of loop invariants from the number of available 5677 // registers. These registers are used by all of the interleaved instances. 5678 // Next, divide the remaining registers by the number of registers that is 5679 // required by the loop, in order to estimate how many parallel instances 5680 // fit without causing spills. All of this is rounded down if necessary to be 5681 // a power of two. We want power of two interleave count to simplify any 5682 // addressing operations or alignment considerations. 5683 // We also want power of two interleave counts to ensure that the induction 5684 // variable of the vector loop wraps to zero, when tail is folded by masking; 5685 // this currently happens when OptForSize, in which case IC is set to 1 above. 5686 unsigned IC = UINT_MAX; 5687 5688 for (auto& pair : R.MaxLocalUsers) { 5689 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5690 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5691 << " registers of " 5692 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5693 if (VF.isScalar()) { 5694 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5695 TargetNumRegisters = ForceTargetNumScalarRegs; 5696 } else { 5697 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5698 TargetNumRegisters = ForceTargetNumVectorRegs; 5699 } 5700 unsigned MaxLocalUsers = pair.second; 5701 unsigned LoopInvariantRegs = 0; 5702 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5703 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5704 5705 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5706 // Don't count the induction variable as interleaved. 5707 if (EnableIndVarRegisterHeur) { 5708 TmpIC = 5709 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5710 std::max(1U, (MaxLocalUsers - 1))); 5711 } 5712 5713 IC = std::min(IC, TmpIC); 5714 } 5715 5716 // Clamp the interleave ranges to reasonable counts. 5717 unsigned MaxInterleaveCount = 5718 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 5719 5720 // Check if the user has overridden the max. 
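// Worked example with made-up numbers: a register class with 32 registers,
// 4 loop-invariant values and MaxLocalUsers == 6 gives, with the
// induction-variable heuristic above,
//   TmpIC = PowerOf2Floor((32 - 4 - 1) / (6 - 1)) = PowerOf2Floor(5) = 4,
// and that candidate IC is then clamped against the target's and the trip
// count's maximum below.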
5721 if (VF.isScalar()) { 5722 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5723 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5724 } else { 5725 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5726 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5727 } 5728 5729 // If trip count is known or estimated compile time constant, limit the 5730 // interleave count to be less than the trip count divided by VF, provided it 5731 // is at least 1. 5732 // 5733 // For scalable vectors we can't know if interleaving is beneficial. It may 5734 // not be beneficial for small loops if none of the lanes in the second vector 5735 // iterations is enabled. However, for larger loops, there is likely to be a 5736 // similar benefit as for fixed-width vectors. For now, we choose to leave 5737 // the InterleaveCount as if vscale is '1', although if some information about 5738 // the vector is known (e.g. min vector size), we can make a better decision. 5739 if (BestKnownTC) { 5740 MaxInterleaveCount = 5741 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5742 // Make sure MaxInterleaveCount is greater than 0. 5743 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 5744 } 5745 5746 assert(MaxInterleaveCount > 0 && 5747 "Maximum interleave count must be greater than 0"); 5748 5749 // Clamp the calculated IC to be between the 1 and the max interleave count 5750 // that the target and trip count allows. 5751 if (IC > MaxInterleaveCount) 5752 IC = MaxInterleaveCount; 5753 else 5754 // Make sure IC is greater than 0. 5755 IC = std::max(1u, IC); 5756 5757 assert(IC > 0 && "Interleave count must be greater than 0."); 5758 5759 // Interleave if we vectorized this loop and there is a reduction that could 5760 // benefit from interleaving. 5761 if (VF.isVector() && HasReductions) { 5762 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5763 return IC; 5764 } 5765 5766 // For any scalar loop that either requires runtime checks or predication we 5767 // are better off leaving this to the unroller. Note that if we've already 5768 // vectorized the loop we will have done the runtime check and so interleaving 5769 // won't require further checks. 5770 bool ScalarInterleavingRequiresPredication = 5771 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5772 return Legal->blockNeedsPredication(BB); 5773 })); 5774 bool ScalarInterleavingRequiresRuntimePointerCheck = 5775 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5776 5777 // We want to interleave small loops in order to reduce the loop overhead and 5778 // potentially expose ILP opportunities. 5779 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5780 << "LV: IC is " << IC << '\n' 5781 << "LV: VF is " << VF << '\n'); 5782 const bool AggressivelyInterleaveReductions = 5783 TTI.enableAggressiveInterleaving(HasReductions); 5784 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5785 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5786 // We assume that the cost overhead is 1 and we use the cost model 5787 // to estimate the cost of the loop and interleave until the cost of the 5788 // loop overhead is about 5% of the cost of the loop. 5789 unsigned SmallIC = 5790 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5791 5792 // Interleave until store/load ports (estimated by max interleave count) are 5793 // saturated. 
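// Illustrative numbers only: with SmallLoopCost of, say, 20 and a loop body
// cost of 3, SmallIC = min(IC, PowerOf2Floor(20 / 3)) = 4 for IC = 8. A
// loop with 2 stores then gets StoresIC = 8 / 2 = 4, which does not exceed
// SmallIC, so no extra interleaving is requested just to saturate the store
// ports.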
5794 unsigned NumStores = Legal->getNumStores(); 5795 unsigned NumLoads = Legal->getNumLoads(); 5796 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5797 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5798 5799 // There is little point in interleaving for reductions containing selects 5800 // and compares when VF=1 since it may just create more overhead than it's 5801 // worth for loops with small trip counts. This is because we still have to 5802 // do the final reduction after the loop. 5803 bool HasSelectCmpReductions = 5804 HasReductions && 5805 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5806 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5807 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 5808 RdxDesc.getRecurrenceKind()); 5809 }); 5810 if (HasSelectCmpReductions) { 5811 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 5812 return 1; 5813 } 5814 5815 // If we have a scalar reduction (vector reductions are already dealt with 5816 // by this point), we can increase the critical path length if the loop 5817 // we're interleaving is inside another loop. For tree-wise reductions 5818 // set the limit to 2, and for ordered reductions it's best to disable 5819 // interleaving entirely. 5820 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5821 bool HasOrderedReductions = 5822 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5823 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5824 return RdxDesc.isOrdered(); 5825 }); 5826 if (HasOrderedReductions) { 5827 LLVM_DEBUG( 5828 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 5829 return 1; 5830 } 5831 5832 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5833 SmallIC = std::min(SmallIC, F); 5834 StoresIC = std::min(StoresIC, F); 5835 LoadsIC = std::min(LoadsIC, F); 5836 } 5837 5838 if (EnableLoadStoreRuntimeInterleave && 5839 std::max(StoresIC, LoadsIC) > SmallIC) { 5840 LLVM_DEBUG( 5841 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5842 return std::max(StoresIC, LoadsIC); 5843 } 5844 5845 // If there are scalar reductions and TTI has enabled aggressive 5846 // interleaving for reductions, we will interleave to expose ILP. 5847 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 5848 AggressivelyInterleaveReductions) { 5849 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5850 // Interleave no less than SmallIC but not as aggressive as the normal IC 5851 // to satisfy the rare situation when resources are too limited. 5852 return std::max(IC / 2, SmallIC); 5853 } else { 5854 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5855 return SmallIC; 5856 } 5857 } 5858 5859 // Interleave if this is a large loop (small loops are already dealt with by 5860 // this point) that could benefit from interleaving. 5861 if (AggressivelyInterleaveReductions) { 5862 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5863 return IC; 5864 } 5865 5866 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5867 return 1; 5868 } 5869 5870 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5871 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 5872 // This function calculates the register usage by measuring the highest number 5873 // of values that are alive at a single location. Obviously, this is a very 5874 // rough estimation. We scan the loop in a topological order in order and 5875 // assign a number to each instruction. 
We use RPO to ensure that defs are 5876 // met before their users. We assume that each instruction that has in-loop 5877 // users starts an interval. We record every time that an in-loop value is 5878 // used, so we have a list of the first and last occurrences of each 5879 // instruction. Next, we transpose this data structure into a multi map that 5880 // holds the list of intervals that *end* at a specific location. This multi 5881 // map allows us to perform a linear search. We scan the instructions linearly 5882 // and record each time that a new interval starts, by placing it in a set. 5883 // If we find this value in the multi-map then we remove it from the set. 5884 // The max register usage is the maximum size of the set. 5885 // We also search for instructions that are defined outside the loop, but are 5886 // used inside the loop. We need this number separately from the max-interval 5887 // usage number because when we unroll, loop-invariant values do not take 5888 // more register. 5889 LoopBlocksDFS DFS(TheLoop); 5890 DFS.perform(LI); 5891 5892 RegisterUsage RU; 5893 5894 // Each 'key' in the map opens a new interval. The values 5895 // of the map are the index of the 'last seen' usage of the 5896 // instruction that is the key. 5897 using IntervalMap = DenseMap<Instruction *, unsigned>; 5898 5899 // Maps instruction to its index. 5900 SmallVector<Instruction *, 64> IdxToInstr; 5901 // Marks the end of each interval. 5902 IntervalMap EndPoint; 5903 // Saves the list of instruction indices that are used in the loop. 5904 SmallPtrSet<Instruction *, 8> Ends; 5905 // Saves the list of values that are used in the loop but are 5906 // defined outside the loop, such as arguments and constants. 5907 SmallPtrSet<Value *, 8> LoopInvariants; 5908 5909 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5910 for (Instruction &I : BB->instructionsWithoutDebug()) { 5911 IdxToInstr.push_back(&I); 5912 5913 // Save the end location of each USE. 5914 for (Value *U : I.operands()) { 5915 auto *Instr = dyn_cast<Instruction>(U); 5916 5917 // Ignore non-instruction values such as arguments, constants, etc. 5918 if (!Instr) 5919 continue; 5920 5921 // If this instruction is outside the loop then record it and continue. 5922 if (!TheLoop->contains(Instr)) { 5923 LoopInvariants.insert(Instr); 5924 continue; 5925 } 5926 5927 // Overwrite previous end points. 5928 EndPoint[Instr] = IdxToInstr.size(); 5929 Ends.insert(Instr); 5930 } 5931 } 5932 } 5933 5934 // Saves the list of intervals that end with the index in 'key'. 5935 using InstrList = SmallVector<Instruction *, 2>; 5936 DenseMap<unsigned, InstrList> TransposeEnds; 5937 5938 // Transpose the EndPoints to a list of values that end at each index. 
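// Small illustration (indices invented): if EndPoint maps
//   %a -> 3, %b -> 3, %c -> 5
// then TransposeEnds becomes {3 -> [%a, %b], 5 -> [%c]}, so when the linear
// scan below reaches index 3 it can close both of those intervals with a
// single lookup.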
5939 for (auto &Interval : EndPoint) 5940 TransposeEnds[Interval.second].push_back(Interval.first); 5941 5942 SmallPtrSet<Instruction *, 8> OpenIntervals; 5943 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5944 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5945 5946 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5947 5948 const auto &TTICapture = TTI; 5949 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 5950 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 5951 return 0; 5952 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 5953 }; 5954 5955 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5956 Instruction *I = IdxToInstr[i]; 5957 5958 // Remove all of the instructions that end at this location. 5959 InstrList &List = TransposeEnds[i]; 5960 for (Instruction *ToRemove : List) 5961 OpenIntervals.erase(ToRemove); 5962 5963 // Ignore instructions that are never used within the loop. 5964 if (!Ends.count(I)) 5965 continue; 5966 5967 // Skip ignored values. 5968 if (ValuesToIgnore.count(I)) 5969 continue; 5970 5971 // For each VF find the maximum usage of registers. 5972 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5973 // Count the number of live intervals. 5974 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5975 5976 if (VFs[j].isScalar()) { 5977 for (auto Inst : OpenIntervals) { 5978 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5979 if (RegUsage.find(ClassID) == RegUsage.end()) 5980 RegUsage[ClassID] = 1; 5981 else 5982 RegUsage[ClassID] += 1; 5983 } 5984 } else { 5985 collectUniformsAndScalars(VFs[j]); 5986 for (auto Inst : OpenIntervals) { 5987 // Skip ignored values for VF > 1. 5988 if (VecValuesToIgnore.count(Inst)) 5989 continue; 5990 if (isScalarAfterVectorization(Inst, VFs[j])) { 5991 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5992 if (RegUsage.find(ClassID) == RegUsage.end()) 5993 RegUsage[ClassID] = 1; 5994 else 5995 RegUsage[ClassID] += 1; 5996 } else { 5997 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5998 if (RegUsage.find(ClassID) == RegUsage.end()) 5999 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6000 else 6001 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6002 } 6003 } 6004 } 6005 6006 for (auto& pair : RegUsage) { 6007 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6008 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6009 else 6010 MaxUsages[j][pair.first] = pair.second; 6011 } 6012 } 6013 6014 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6015 << OpenIntervals.size() << '\n'); 6016 6017 // Add the current instruction to the list of open intervals. 6018 OpenIntervals.insert(I); 6019 } 6020 6021 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6022 SmallMapVector<unsigned, unsigned, 4> Invariant; 6023 6024 for (auto Inst : LoopInvariants) { 6025 unsigned Usage = 6026 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6027 unsigned ClassID = 6028 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6029 if (Invariant.find(ClassID) == Invariant.end()) 6030 Invariant[ClassID] = Usage; 6031 else 6032 Invariant[ClassID] += Usage; 6033 } 6034 6035 LLVM_DEBUG({ 6036 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6037 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6038 << " item\n"; 6039 for (const auto &pair : MaxUsages[i]) { 6040 dbgs() << "LV(REG): RegisterClass: " 6041 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6042 << " registers\n"; 6043 } 6044 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6045 << " item\n"; 6046 for (const auto &pair : Invariant) { 6047 dbgs() << "LV(REG): RegisterClass: " 6048 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6049 << " registers\n"; 6050 } 6051 }); 6052 6053 RU.LoopInvariantRegs = Invariant; 6054 RU.MaxLocalUsers = MaxUsages[i]; 6055 RUs[i] = RU; 6056 } 6057 6058 return RUs; 6059 } 6060 6061 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 6062 ElementCount VF) { 6063 // TODO: Cost model for emulated masked load/store is completely 6064 // broken. This hack guides the cost model to use an artificially 6065 // high enough value to practically disable vectorization with such 6066 // operations, except where previously deployed legality hack allowed 6067 // using very low cost values. This is to avoid regressions coming simply 6068 // from moving "masked load/store" check from legality to cost model. 6069 // Masked Load/Gather emulation was previously never allowed. 6070 // Limited number of Masked Store/Scatter emulation was allowed. 6071 assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction"); 6072 return isa<LoadInst>(I) || 6073 (isa<StoreInst>(I) && 6074 NumPredStores > NumberOfStoresToPredicate); 6075 } 6076 6077 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6078 // If we aren't vectorizing the loop, or if we've already collected the 6079 // instructions to scalarize, there's nothing to do. Collection may already 6080 // have occurred if we have a user-selected VF and are now computing the 6081 // expected cost for interleaving. 6082 if (VF.isScalar() || VF.isZero() || 6083 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6084 return; 6085 6086 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6087 // not profitable to scalarize any instructions, the presence of VF in the 6088 // map will indicate that we've analyzed it already. 6089 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6090 6091 // Find all the instructions that are scalar with predication in the loop and 6092 // determine if it would be better to not if-convert the blocks they are in. 6093 // If so, we also record the instructions to scalarize. 6094 for (BasicBlock *BB : TheLoop->blocks()) { 6095 if (!blockNeedsPredicationForAnyReason(BB)) 6096 continue; 6097 for (Instruction &I : *BB) 6098 if (isScalarWithPredication(&I, VF)) { 6099 ScalarCostsTy ScalarCosts; 6100 // Do not apply discount if scalable, because that would lead to 6101 // invalid scalarization costs. 6102 // Do not apply discount logic if hacked cost is needed 6103 // for emulated masked memrefs. 
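// Sketch of the decision with hypothetical costs: if, summed over the
// chain, the vectorized form costs 12 and the scalarized, predicated form
// costs 9, computePredInstDiscount() returns 12 - 9 = 3 >= 0, so the
// chain's scalar costs are recorded in ScalarCostsVF and later picked up by
// getInstructionCost() via isProfitableToScalarize().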
6104 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && 6105 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6106 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6107 // Remember that BB will remain after vectorization. 6108 PredicatedBBsAfterVectorization.insert(BB); 6109 } 6110 } 6111 } 6112 6113 int LoopVectorizationCostModel::computePredInstDiscount( 6114 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6115 assert(!isUniformAfterVectorization(PredInst, VF) && 6116 "Instruction marked uniform-after-vectorization will be predicated"); 6117 6118 // Initialize the discount to zero, meaning that the scalar version and the 6119 // vector version cost the same. 6120 InstructionCost Discount = 0; 6121 6122 // Holds instructions to analyze. The instructions we visit are mapped in 6123 // ScalarCosts. Those instructions are the ones that would be scalarized if 6124 // we find that the scalar version costs less. 6125 SmallVector<Instruction *, 8> Worklist; 6126 6127 // Returns true if the given instruction can be scalarized. 6128 auto canBeScalarized = [&](Instruction *I) -> bool { 6129 // We only attempt to scalarize instructions forming a single-use chain 6130 // from the original predicated block that would otherwise be vectorized. 6131 // Although not strictly necessary, we give up on instructions we know will 6132 // already be scalar to avoid traversing chains that are unlikely to be 6133 // beneficial. 6134 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6135 isScalarAfterVectorization(I, VF)) 6136 return false; 6137 6138 // If the instruction is scalar with predication, it will be analyzed 6139 // separately. We ignore it within the context of PredInst. 6140 if (isScalarWithPredication(I, VF)) 6141 return false; 6142 6143 // If any of the instruction's operands are uniform after vectorization, 6144 // the instruction cannot be scalarized. This prevents, for example, a 6145 // masked load from being scalarized. 6146 // 6147 // We assume we will only emit a value for lane zero of an instruction 6148 // marked uniform after vectorization, rather than VF identical values. 6149 // Thus, if we scalarize an instruction that uses a uniform, we would 6150 // create uses of values corresponding to the lanes we aren't emitting code 6151 // for. This behavior can be changed by allowing getScalarValue to clone 6152 // the lane zero values for uniforms rather than asserting. 6153 for (Use &U : I->operands()) 6154 if (auto *J = dyn_cast<Instruction>(U.get())) 6155 if (isUniformAfterVectorization(J, VF)) 6156 return false; 6157 6158 // Otherwise, we can scalarize the instruction. 6159 return true; 6160 }; 6161 6162 // Compute the expected cost discount from scalarizing the entire expression 6163 // feeding the predicated instruction. We currently only consider expressions 6164 // that are single-use instruction chains. 6165 Worklist.push_back(PredInst); 6166 while (!Worklist.empty()) { 6167 Instruction *I = Worklist.pop_back_val(); 6168 6169 // If we've already analyzed the instruction, there's nothing to do. 6170 if (ScalarCosts.find(I) != ScalarCosts.end()) 6171 continue; 6172 6173 // Compute the cost of the vector instruction. Note that this cost already 6174 // includes the scalarization overhead of the predicated instruction. 6175 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6176 6177 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6178 // the instruction as if it wasn't if-converted and instead remained in the 6179 // predicated block. We will scale this cost by block probability after 6180 // computing the scalarization overhead. 6181 InstructionCost ScalarCost = 6182 VF.getFixedValue() * 6183 getInstructionCost(I, ElementCount::getFixed(1)).first; 6184 6185 // Compute the scalarization overhead of needed insertelement instructions 6186 // and phi nodes. 6187 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6188 ScalarCost += TTI.getScalarizationOverhead( 6189 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6190 APInt::getAllOnes(VF.getFixedValue()), true, false); 6191 ScalarCost += 6192 VF.getFixedValue() * 6193 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6194 } 6195 6196 // Compute the scalarization overhead of needed extractelement 6197 // instructions. For each of the instruction's operands, if the operand can 6198 // be scalarized, add it to the worklist; otherwise, account for the 6199 // overhead. 6200 for (Use &U : I->operands()) 6201 if (auto *J = dyn_cast<Instruction>(U.get())) { 6202 assert(VectorType::isValidElementType(J->getType()) && 6203 "Instruction has non-scalar type"); 6204 if (canBeScalarized(J)) 6205 Worklist.push_back(J); 6206 else if (needsExtract(J, VF)) { 6207 ScalarCost += TTI.getScalarizationOverhead( 6208 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6209 APInt::getAllOnes(VF.getFixedValue()), false, true); 6210 } 6211 } 6212 6213 // Scale the total scalar cost by block probability. 6214 ScalarCost /= getReciprocalPredBlockProb(); 6215 6216 // Compute the discount. A non-negative discount means the vector version 6217 // of the instruction costs more, and scalarizing would be beneficial. 6218 Discount += VectorCost - ScalarCost; 6219 ScalarCosts[I] = ScalarCost; 6220 } 6221 6222 return *Discount.getValue(); 6223 } 6224 6225 LoopVectorizationCostModel::VectorizationCostTy 6226 LoopVectorizationCostModel::expectedCost( 6227 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6228 VectorizationCostTy Cost; 6229 6230 // For each block. 6231 for (BasicBlock *BB : TheLoop->blocks()) { 6232 VectorizationCostTy BlockCost; 6233 6234 // For each instruction in the old loop. 6235 for (Instruction &I : BB->instructionsWithoutDebug()) { 6236 // Skip ignored values. 6237 if (ValuesToIgnore.count(&I) || 6238 (VF.isVector() && VecValuesToIgnore.count(&I))) 6239 continue; 6240 6241 VectorizationCostTy C = getInstructionCost(&I, VF); 6242 6243 // Check if we should override the cost. 6244 if (C.first.isValid() && 6245 ForceTargetInstructionCost.getNumOccurrences() > 0) 6246 C.first = InstructionCost(ForceTargetInstructionCost); 6247 6248 // Keep a list of instructions with invalid costs. 6249 if (Invalid && !C.first.isValid()) 6250 Invalid->emplace_back(&I, VF); 6251 6252 BlockCost.first += C.first; 6253 BlockCost.second |= C.second; 6254 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6255 << " for VF " << VF << " For instruction: " << I 6256 << '\n'); 6257 } 6258 6259 // If we are vectorizing a predicated block, it will have been 6260 // if-converted. This means that the block's instructions (aside from 6261 // stores and instructions that may divide by zero) will now be 6262 // unconditionally executed. For the scalar case, we may not always execute 6263 // the predicated block, if it is an if-else block. Thus, scale the block's 6264 // cost by the probability of executing it. 
blockNeedsPredication from 6265 // Legal is used so as to not include all blocks in tail folded loops. 6266 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6267 BlockCost.first /= getReciprocalPredBlockProb(); 6268 6269 Cost.first += BlockCost.first; 6270 Cost.second |= BlockCost.second; 6271 } 6272 6273 return Cost; 6274 } 6275 6276 /// Gets Address Access SCEV after verifying that the access pattern 6277 /// is loop invariant except the induction variable dependence. 6278 /// 6279 /// This SCEV can be sent to the Target in order to estimate the address 6280 /// calculation cost. 6281 static const SCEV *getAddressAccessSCEV( 6282 Value *Ptr, 6283 LoopVectorizationLegality *Legal, 6284 PredicatedScalarEvolution &PSE, 6285 const Loop *TheLoop) { 6286 6287 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6288 if (!Gep) 6289 return nullptr; 6290 6291 // We are looking for a gep with all loop invariant indices except for one 6292 // which should be an induction variable. 6293 auto SE = PSE.getSE(); 6294 unsigned NumOperands = Gep->getNumOperands(); 6295 for (unsigned i = 1; i < NumOperands; ++i) { 6296 Value *Opd = Gep->getOperand(i); 6297 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6298 !Legal->isInductionVariable(Opd)) 6299 return nullptr; 6300 } 6301 6302 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6303 return PSE.getSCEV(Ptr); 6304 } 6305 6306 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6307 return Legal->hasStride(I->getOperand(0)) || 6308 Legal->hasStride(I->getOperand(1)); 6309 } 6310 6311 InstructionCost 6312 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6313 ElementCount VF) { 6314 assert(VF.isVector() && 6315 "Scalarization cost of instruction implies vectorization."); 6316 if (VF.isScalable()) 6317 return InstructionCost::getInvalid(); 6318 6319 Type *ValTy = getLoadStoreType(I); 6320 auto SE = PSE.getSE(); 6321 6322 unsigned AS = getLoadStoreAddressSpace(I); 6323 Value *Ptr = getLoadStorePointerOperand(I); 6324 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6325 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6326 // that it is being called from this specific place. 6327 6328 // Figure out whether the access is strided and get the stride value 6329 // if it's known in compile time 6330 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6331 6332 // Get the cost of the scalar memory instruction and address computation. 6333 InstructionCost Cost = 6334 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6335 6336 // Don't pass *I here, since it is scalar but will actually be part of a 6337 // vectorized loop where the user of it is a vectorized instruction. 6338 const Align Alignment = getLoadStoreAlignment(I); 6339 Cost += VF.getKnownMinValue() * 6340 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6341 AS, TTI::TCK_RecipThroughput); 6342 6343 // Get the overhead of the extractelement and insertelement instructions 6344 // we might create due to scalarization. 6345 Cost += getScalarizationOverhead(I, VF); 6346 6347 // If we have a predicated load/store, it will need extra i1 extracts and 6348 // conditional branches, but may not be executed for each vector lane. Scale 6349 // the cost by the probability of executing the predicated block. 
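// Numerically (assuming the usual reciprocal block probability of 2, i.e. a
// predicated block executing on roughly half of the iterations): a
// scalarization cost of 16 becomes 16 / 2 = 8 before the i1 extract and
// branch costs are added below.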
6350 if (isPredicatedInst(I, VF)) { 6351 Cost /= getReciprocalPredBlockProb(); 6352 6353 // Add the cost of an i1 extract and a branch 6354 auto *Vec_i1Ty = 6355 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6356 Cost += TTI.getScalarizationOverhead( 6357 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6358 /*Insert=*/false, /*Extract=*/true); 6359 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6360 6361 if (useEmulatedMaskMemRefHack(I, VF)) 6362 // Artificially setting to a high enough value to practically disable 6363 // vectorization with such operations. 6364 Cost = 3000000; 6365 } 6366 6367 return Cost; 6368 } 6369 6370 InstructionCost 6371 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6372 ElementCount VF) { 6373 Type *ValTy = getLoadStoreType(I); 6374 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6375 Value *Ptr = getLoadStorePointerOperand(I); 6376 unsigned AS = getLoadStoreAddressSpace(I); 6377 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6378 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6379 6380 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6381 "Stride should be 1 or -1 for consecutive memory access"); 6382 const Align Alignment = getLoadStoreAlignment(I); 6383 InstructionCost Cost = 0; 6384 if (Legal->isMaskRequired(I)) 6385 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6386 CostKind); 6387 else 6388 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6389 CostKind, I); 6390 6391 bool Reverse = ConsecutiveStride < 0; 6392 if (Reverse) 6393 Cost += 6394 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6395 return Cost; 6396 } 6397 6398 InstructionCost 6399 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6400 ElementCount VF) { 6401 assert(Legal->isUniformMemOp(*I)); 6402 6403 Type *ValTy = getLoadStoreType(I); 6404 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6405 const Align Alignment = getLoadStoreAlignment(I); 6406 unsigned AS = getLoadStoreAddressSpace(I); 6407 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6408 if (isa<LoadInst>(I)) { 6409 return TTI.getAddressComputationCost(ValTy) + 6410 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6411 CostKind) + 6412 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6413 } 6414 StoreInst *SI = cast<StoreInst>(I); 6415 6416 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6417 return TTI.getAddressComputationCost(ValTy) + 6418 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6419 CostKind) + 6420 (isLoopInvariantStoreValue 6421 ? 
0 6422 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6423 VF.getKnownMinValue() - 1)); 6424 } 6425 6426 InstructionCost 6427 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6428 ElementCount VF) { 6429 Type *ValTy = getLoadStoreType(I); 6430 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6431 const Align Alignment = getLoadStoreAlignment(I); 6432 const Value *Ptr = getLoadStorePointerOperand(I); 6433 6434 return TTI.getAddressComputationCost(VectorTy) + 6435 TTI.getGatherScatterOpCost( 6436 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6437 TargetTransformInfo::TCK_RecipThroughput, I); 6438 } 6439 6440 InstructionCost 6441 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6442 ElementCount VF) { 6443 // TODO: Once we have support for interleaving with scalable vectors 6444 // we can calculate the cost properly here. 6445 if (VF.isScalable()) 6446 return InstructionCost::getInvalid(); 6447 6448 Type *ValTy = getLoadStoreType(I); 6449 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6450 unsigned AS = getLoadStoreAddressSpace(I); 6451 6452 auto Group = getInterleavedAccessGroup(I); 6453 assert(Group && "Fail to get an interleaved access group."); 6454 6455 unsigned InterleaveFactor = Group->getFactor(); 6456 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6457 6458 // Holds the indices of existing members in the interleaved group. 6459 SmallVector<unsigned, 4> Indices; 6460 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6461 if (Group->getMember(IF)) 6462 Indices.push_back(IF); 6463 6464 // Calculate the cost of the whole interleaved group. 6465 bool UseMaskForGaps = 6466 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6467 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6468 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6469 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6470 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6471 6472 if (Group->isReverse()) { 6473 // TODO: Add support for reversed masked interleaved access. 6474 assert(!Legal->isMaskRequired(I) && 6475 "Reverse masked interleaved access not supported."); 6476 Cost += 6477 Group->getNumMembers() * 6478 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6479 } 6480 return Cost; 6481 } 6482 6483 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( 6484 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6485 using namespace llvm::PatternMatch; 6486 // Early exit for no inloop reductions 6487 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6488 return None; 6489 auto *VectorTy = cast<VectorType>(Ty); 6490 6491 // We are looking for a pattern of, and finding the minimal acceptable cost: 6492 // reduce(mul(ext(A), ext(B))) or 6493 // reduce(mul(A, B)) or 6494 // reduce(ext(A)) or 6495 // reduce(A). 6496 // The basic idea is that we walk down the tree to do that, finding the root 6497 // reduction instruction in InLoopReductionImmediateChains. From there we find 6498 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6499 // of the components. If the reduction cost is lower then we return it for the 6500 // reduction instruction and 0 for the other instructions in the pattern. If 6501 // it is not we return an invalid cost specifying the orignal cost method 6502 // should be used. 
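// For example, given an IR sketch along these lines (names invented):
//   %a.ext = sext i8 %a to i32
//   %b.ext = sext i8 %b to i32
//   %mul   = mul i32 %a.ext, %b.ext
//   %add   = add i32 %sum.phi, %mul
// a query for %a.ext walks user_back() through %mul to %add, checks that
// %add is in InLoopReductionImmediateChains, and then compares the
// extended-add reduction cost against ext + ext + mul plus the plain add
// reduction cost.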
6503 Instruction *RetI = I; 6504 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 6505 if (!RetI->hasOneUser()) 6506 return None; 6507 RetI = RetI->user_back(); 6508 } 6509 if (match(RetI, m_Mul(m_Value(), m_Value())) && 6510 RetI->user_back()->getOpcode() == Instruction::Add) { 6511 if (!RetI->hasOneUser()) 6512 return None; 6513 RetI = RetI->user_back(); 6514 } 6515 6516 // Test if the found instruction is a reduction, and if not return an invalid 6517 // cost specifying the parent to use the original cost modelling. 6518 if (!InLoopReductionImmediateChains.count(RetI)) 6519 return None; 6520 6521 // Find the reduction this chain is a part of and calculate the basic cost of 6522 // the reduction on its own. 6523 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6524 Instruction *ReductionPhi = LastChain; 6525 while (!isa<PHINode>(ReductionPhi)) 6526 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6527 6528 const RecurrenceDescriptor &RdxDesc = 6529 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6530 6531 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6532 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6533 6534 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6535 // normal fmul instruction to the cost of the fadd reduction. 6536 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6537 BaseCost += 6538 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6539 6540 // If we're using ordered reductions then we can just return the base cost 6541 // here, since getArithmeticReductionCost calculates the full ordered 6542 // reduction cost when FP reassociation is not allowed. 6543 if (useOrderedReductions(RdxDesc)) 6544 return BaseCost; 6545 6546 // Get the operand that was not the reduction chain and match it to one of the 6547 // patterns, returning the better cost if it is found. 6548 Instruction *RedOp = RetI->getOperand(1) == LastChain 6549 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6550 : dyn_cast<Instruction>(RetI->getOperand(1)); 6551 6552 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6553 6554 Instruction *Op0, *Op1; 6555 if (RedOp && 6556 match(RedOp, 6557 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6558 match(Op0, m_ZExtOrSExt(m_Value())) && 6559 Op0->getOpcode() == Op1->getOpcode() && 6560 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6561 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6562 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6563 6564 // Matched reduce(ext(mul(ext(A), ext(B))) 6565 // Note that the extend opcodes need to all match, or if A==B they will have 6566 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6567 // which is equally fine. 
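// Concretely, with invented costs: if the target reports an extended MLA
// reduction cost of 4 while ExtCost == 1, MulCost == 1, Ext2Cost == 1 and
// BaseCost == 4, then 4 < 1 * 2 + 1 + 1 + 4 == 8, so the whole pattern is
// folded into the reduction: the root instruction reports RedCost and the
// inner mul/ext instructions report 0.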
6568 bool IsUnsigned = isa<ZExtInst>(Op0); 6569 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6570 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6571 6572 InstructionCost ExtCost = 6573 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6574 TTI::CastContextHint::None, CostKind, Op0); 6575 InstructionCost MulCost = 6576 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6577 InstructionCost Ext2Cost = 6578 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6579 TTI::CastContextHint::None, CostKind, RedOp); 6580 6581 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6582 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6583 CostKind); 6584 6585 if (RedCost.isValid() && 6586 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6587 return I == RetI ? RedCost : 0; 6588 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6589 !TheLoop->isLoopInvariant(RedOp)) { 6590 // Matched reduce(ext(A)) 6591 bool IsUnsigned = isa<ZExtInst>(RedOp); 6592 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6593 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6594 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6595 CostKind); 6596 6597 InstructionCost ExtCost = 6598 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6599 TTI::CastContextHint::None, CostKind, RedOp); 6600 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6601 return I == RetI ? RedCost : 0; 6602 } else if (RedOp && 6603 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6604 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6605 Op0->getOpcode() == Op1->getOpcode() && 6606 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6607 bool IsUnsigned = isa<ZExtInst>(Op0); 6608 Type *Op0Ty = Op0->getOperand(0)->getType(); 6609 Type *Op1Ty = Op1->getOperand(0)->getType(); 6610 Type *LargestOpTy = 6611 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6612 : Op0Ty; 6613 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6614 6615 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 6616 // different sizes. We take the largest type as the ext to reduce, and add 6617 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6618 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6619 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6620 TTI::CastContextHint::None, CostKind, Op0); 6621 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6622 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6623 TTI::CastContextHint::None, CostKind, Op1); 6624 InstructionCost MulCost = 6625 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6626 6627 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6628 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6629 CostKind); 6630 InstructionCost ExtraExtCost = 0; 6631 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6632 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6633 ExtraExtCost = TTI.getCastInstrCost( 6634 ExtraExtOp->getOpcode(), ExtType, 6635 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6636 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6637 } 6638 6639 if (RedCost.isValid() && 6640 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6641 return I == RetI ? 
RedCost : 0; 6642 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6643 // Matched reduce(mul()) 6644 InstructionCost MulCost = 6645 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6646 6647 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6648 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 6649 CostKind); 6650 6651 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6652 return I == RetI ? RedCost : 0; 6653 } 6654 } 6655 6656 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 6657 } 6658 6659 InstructionCost 6660 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6661 ElementCount VF) { 6662 // Calculate scalar cost only. Vectorization cost should be ready at this 6663 // moment. 6664 if (VF.isScalar()) { 6665 Type *ValTy = getLoadStoreType(I); 6666 const Align Alignment = getLoadStoreAlignment(I); 6667 unsigned AS = getLoadStoreAddressSpace(I); 6668 6669 return TTI.getAddressComputationCost(ValTy) + 6670 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6671 TTI::TCK_RecipThroughput, I); 6672 } 6673 return getWideningCost(I, VF); 6674 } 6675 6676 LoopVectorizationCostModel::VectorizationCostTy 6677 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6678 ElementCount VF) { 6679 // If we know that this instruction will remain uniform, check the cost of 6680 // the scalar version. 6681 if (isUniformAfterVectorization(I, VF)) 6682 VF = ElementCount::getFixed(1); 6683 6684 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6685 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6686 6687 // Forced scalars do not have any scalarization overhead. 6688 auto ForcedScalar = ForcedScalars.find(VF); 6689 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6690 auto InstSet = ForcedScalar->second; 6691 if (InstSet.count(I)) 6692 return VectorizationCostTy( 6693 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6694 VF.getKnownMinValue()), 6695 false); 6696 } 6697 6698 Type *VectorTy; 6699 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6700 6701 bool TypeNotScalarized = false; 6702 if (VF.isVector() && VectorTy->isVectorTy()) { 6703 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) { 6704 if (VF.isScalable()) 6705 // <vscale x 1 x iN> is assumed to be profitable over iN because 6706 // scalable registers are a distinct register class from scalar ones. 6707 // If we ever find a target which wants to lower scalable vectors 6708 // back to scalars, we'll need to update this code to explicitly 6709 // ask TTI about the register class uses for each part. 6710 TypeNotScalarized = NumParts <= VF.getKnownMinValue(); 6711 else 6712 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 6713 } else 6714 C = InstructionCost::getInvalid(); 6715 } 6716 return VectorizationCostTy(C, TypeNotScalarized); 6717 } 6718 6719 InstructionCost 6720 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6721 ElementCount VF) const { 6722 6723 // There is no mechanism yet to create a scalable scalarization loop, 6724 // so this is currently Invalid. 
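// As an example of what the fixed-width path below accounts for (shapes
// hypothetical): scalarizing a VF=4 instruction with an i32 result pays for
// 4 insertelements to rebuild the result vector plus, for each operand that
// actually needs extracting, 4 extractelements, queried via
// TTI::getScalarizationOverhead and getOperandsScalarizationOverhead.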
6725 if (VF.isScalable()) 6726 return InstructionCost::getInvalid(); 6727 6728 if (VF.isScalar()) 6729 return 0; 6730 6731 InstructionCost Cost = 0; 6732 Type *RetTy = ToVectorTy(I->getType(), VF); 6733 if (!RetTy->isVoidTy() && 6734 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6735 Cost += TTI.getScalarizationOverhead( 6736 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 6737 false); 6738 6739 // Some targets keep addresses scalar. 6740 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6741 return Cost; 6742 6743 // Some targets support efficient element stores. 6744 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6745 return Cost; 6746 6747 // Collect operands to consider. 6748 CallInst *CI = dyn_cast<CallInst>(I); 6749 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 6750 6751 // Skip operands that do not require extraction/scalarization and do not incur 6752 // any overhead. 6753 SmallVector<Type *> Tys; 6754 for (auto *V : filterExtractingOperands(Ops, VF)) 6755 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 6756 return Cost + TTI.getOperandsScalarizationOverhead( 6757 filterExtractingOperands(Ops, VF), Tys); 6758 } 6759 6760 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6761 if (VF.isScalar()) 6762 return; 6763 NumPredStores = 0; 6764 for (BasicBlock *BB : TheLoop->blocks()) { 6765 // For each instruction in the old loop. 6766 for (Instruction &I : *BB) { 6767 Value *Ptr = getLoadStorePointerOperand(&I); 6768 if (!Ptr) 6769 continue; 6770 6771 // TODO: We should generate better code and update the cost model for 6772 // predicated uniform stores. Today they are treated as any other 6773 // predicated store (see added test cases in 6774 // invariant-store-vectorization.ll). 6775 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 6776 NumPredStores++; 6777 6778 if (Legal->isUniformMemOp(I)) { 6779 // TODO: Avoid replicating loads and stores instead of 6780 // relying on instcombine to remove them. 6781 // Load: Scalar load + broadcast 6782 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6783 InstructionCost Cost; 6784 if (isa<StoreInst>(&I) && VF.isScalable() && 6785 isLegalGatherOrScatter(&I, VF)) { 6786 Cost = getGatherScatterCost(&I, VF); 6787 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 6788 } else { 6789 Cost = getUniformMemOpCost(&I, VF); 6790 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6791 } 6792 continue; 6793 } 6794 6795 // We assume that widening is the best solution when possible. 6796 if (memoryInstructionCanBeWidened(&I, VF)) { 6797 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6798 int ConsecutiveStride = Legal->isConsecutivePtr( 6799 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 6800 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6801 "Expected consecutive stride."); 6802 InstWidening Decision = 6803 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6804 setWideningDecision(&I, VF, Decision, Cost); 6805 continue; 6806 } 6807 6808 // Choose between Interleaving, Gather/Scatter or Scalarization. 6809 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 6810 unsigned NumAccesses = 1; 6811 if (isAccessInterleaved(&I)) { 6812 auto Group = getInterleavedAccessGroup(&I); 6813 assert(Group && "Fail to get an interleaved access group."); 6814 6815 // Make one decision for the whole group. 
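// E.g. for a factor-2 group {A, B} (invented): once the decision is
// recorded while visiting A, the check below no longer sees CM_Unknown when
// it reaches B and skips it, so the group is costed exactly once while
// NumAccesses still reflects all of its members.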
6816 if (getWideningDecision(&I, VF) != CM_Unknown) 6817 continue; 6818 6819 NumAccesses = Group->getNumMembers(); 6820 if (interleavedAccessCanBeWidened(&I, VF)) 6821 InterleaveCost = getInterleaveGroupCost(&I, VF); 6822 } 6823 6824 InstructionCost GatherScatterCost = 6825 isLegalGatherOrScatter(&I, VF) 6826 ? getGatherScatterCost(&I, VF) * NumAccesses 6827 : InstructionCost::getInvalid(); 6828 6829 InstructionCost ScalarizationCost = 6830 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6831 6832 // Choose better solution for the current VF, 6833 // write down this decision and use it during vectorization. 6834 InstructionCost Cost; 6835 InstWidening Decision; 6836 if (InterleaveCost <= GatherScatterCost && 6837 InterleaveCost < ScalarizationCost) { 6838 Decision = CM_Interleave; 6839 Cost = InterleaveCost; 6840 } else if (GatherScatterCost < ScalarizationCost) { 6841 Decision = CM_GatherScatter; 6842 Cost = GatherScatterCost; 6843 } else { 6844 Decision = CM_Scalarize; 6845 Cost = ScalarizationCost; 6846 } 6847 // If the instructions belongs to an interleave group, the whole group 6848 // receives the same decision. The whole group receives the cost, but 6849 // the cost will actually be assigned to one instruction. 6850 if (auto Group = getInterleavedAccessGroup(&I)) 6851 setWideningDecision(Group, VF, Decision, Cost); 6852 else 6853 setWideningDecision(&I, VF, Decision, Cost); 6854 } 6855 } 6856 6857 // Make sure that any load of address and any other address computation 6858 // remains scalar unless there is gather/scatter support. This avoids 6859 // inevitable extracts into address registers, and also has the benefit of 6860 // activating LSR more, since that pass can't optimize vectorized 6861 // addresses. 6862 if (TTI.prefersVectorizedAddressing()) 6863 return; 6864 6865 // Start with all scalar pointer uses. 6866 SmallPtrSet<Instruction *, 8> AddrDefs; 6867 for (BasicBlock *BB : TheLoop->blocks()) 6868 for (Instruction &I : *BB) { 6869 Instruction *PtrDef = 6870 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6871 if (PtrDef && TheLoop->contains(PtrDef) && 6872 getWideningDecision(&I, VF) != CM_GatherScatter) 6873 AddrDefs.insert(PtrDef); 6874 } 6875 6876 // Add all instructions used to generate the addresses. 6877 SmallVector<Instruction *, 4> Worklist; 6878 append_range(Worklist, AddrDefs); 6879 while (!Worklist.empty()) { 6880 Instruction *I = Worklist.pop_back_val(); 6881 for (auto &Op : I->operands()) 6882 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6883 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6884 AddrDefs.insert(InstOp).second) 6885 Worklist.push_back(InstOp); 6886 } 6887 6888 for (auto *I : AddrDefs) { 6889 if (isa<LoadInst>(I)) { 6890 // Setting the desired widening decision should ideally be handled in 6891 // by cost functions, but since this involves the task of finding out 6892 // if the loaded register is involved in an address computation, it is 6893 // instead changed here when we know this is the case. 6894 InstWidening Decision = getWideningDecision(I, VF); 6895 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6896 // Scalarize a widened load of address. 6897 setWideningDecision( 6898 I, VF, CM_Scalarize, 6899 (VF.getKnownMinValue() * 6900 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6901 else if (auto Group = getInterleavedAccessGroup(I)) { 6902 // Scalarize an interleave group of address loads. 
6903 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6904 if (Instruction *Member = Group->getMember(I)) 6905 setWideningDecision( 6906 Member, VF, CM_Scalarize, 6907 (VF.getKnownMinValue() * 6908 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6909 } 6910 } 6911 } else 6912 // Make sure I gets scalarized and a cost estimate without 6913 // scalarization overhead. 6914 ForcedScalars[VF].insert(I); 6915 } 6916 } 6917 6918 InstructionCost 6919 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 6920 Type *&VectorTy) { 6921 Type *RetTy = I->getType(); 6922 if (canTruncateToMinimalBitwidth(I, VF)) 6923 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6924 auto SE = PSE.getSE(); 6925 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6926 6927 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 6928 ElementCount VF) -> bool { 6929 if (VF.isScalar()) 6930 return true; 6931 6932 auto Scalarized = InstsToScalarize.find(VF); 6933 assert(Scalarized != InstsToScalarize.end() && 6934 "VF not yet analyzed for scalarization profitability"); 6935 return !Scalarized->second.count(I) && 6936 llvm::all_of(I->users(), [&](User *U) { 6937 auto *UI = cast<Instruction>(U); 6938 return !Scalarized->second.count(UI); 6939 }); 6940 }; 6941 (void) hasSingleCopyAfterVectorization; 6942 6943 if (isScalarAfterVectorization(I, VF)) { 6944 // With the exception of GEPs and PHIs, after scalarization there should 6945 // only be one copy of the instruction generated in the loop. This is 6946 // because the VF is either 1, or any instructions that need scalarizing 6947 // have already been dealt with by the the time we get here. As a result, 6948 // it means we don't have to multiply the instruction cost by VF. 6949 assert(I->getOpcode() == Instruction::GetElementPtr || 6950 I->getOpcode() == Instruction::PHI || 6951 (I->getOpcode() == Instruction::BitCast && 6952 I->getType()->isPointerTy()) || 6953 hasSingleCopyAfterVectorization(I, VF)); 6954 VectorTy = RetTy; 6955 } else 6956 VectorTy = ToVectorTy(RetTy, VF); 6957 6958 // TODO: We need to estimate the cost of intrinsic calls. 6959 switch (I->getOpcode()) { 6960 case Instruction::GetElementPtr: 6961 // We mark this instruction as zero-cost because the cost of GEPs in 6962 // vectorized code depends on whether the corresponding memory instruction 6963 // is scalarized or not. Therefore, we handle GEPs with the memory 6964 // instruction cost. 6965 return 0; 6966 case Instruction::Br: { 6967 // In cases of scalarized and predicated instructions, there will be VF 6968 // predicated blocks in the vectorized loop. Each branch around these 6969 // blocks requires also an extract of its vector compare i1 element. 6970 bool ScalarPredicatedBB = false; 6971 BranchInst *BI = cast<BranchInst>(I); 6972 if (VF.isVector() && BI->isConditional() && 6973 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 6974 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 6975 ScalarPredicatedBB = true; 6976 6977 if (ScalarPredicatedBB) { 6978 // Not possible to scalarize scalable vector with predicated instructions. 6979 if (VF.isScalable()) 6980 return InstructionCost::getInvalid(); 6981 // Return cost for branches around scalarized and predicated blocks. 
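      // (Roughly: the cost of extracting each i1 lane of the vector compare
      // plus one scalar branch per lane of the fixed-width VF.)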
6982 auto *Vec_i1Ty = 6983 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6984 return ( 6985 TTI.getScalarizationOverhead( 6986 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + 6987 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 6988 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 6989 // The back-edge branch will remain, as will all scalar branches. 6990 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6991 else 6992 // This branch will be eliminated by if-conversion. 6993 return 0; 6994 // Note: We currently assume zero cost for an unconditional branch inside 6995 // a predicated block since it will become a fall-through, although we 6996 // may decide in the future to call TTI for all branches. 6997 } 6998 case Instruction::PHI: { 6999 auto *Phi = cast<PHINode>(I); 7000 7001 // First-order recurrences are replaced by vector shuffles inside the loop. 7002 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7003 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7004 return TTI.getShuffleCost( 7005 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7006 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7007 7008 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7009 // converted into select instructions. We require N - 1 selects per phi 7010 // node, where N is the number of incoming values. 7011 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7012 return (Phi->getNumIncomingValues() - 1) * 7013 TTI.getCmpSelInstrCost( 7014 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7015 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7016 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7017 7018 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7019 } 7020 case Instruction::UDiv: 7021 case Instruction::SDiv: 7022 case Instruction::URem: 7023 case Instruction::SRem: 7024 // If we have a predicated instruction, it may not be executed for each 7025 // vector lane. Get the scalarization cost and scale this amount by the 7026 // probability of executing the predicated block. If the instruction is not 7027 // predicated, we fall through to the next case. 7028 if (VF.isVector() && isScalarWithPredication(I, VF)) { 7029 InstructionCost Cost = 0; 7030 7031 // These instructions have a non-void type, so account for the phi nodes 7032 // that we will create. This cost is likely to be zero. The phi node 7033 // cost, if any, should be scaled by the block probability because it 7034 // models a copy at the end of each predicated block. 7035 Cost += VF.getKnownMinValue() * 7036 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7037 7038 // The cost of the non-predicated instruction. 7039 Cost += VF.getKnownMinValue() * 7040 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7041 7042 // The cost of insertelement and extractelement instructions needed for 7043 // scalarization. 7044 Cost += getScalarizationOverhead(I, VF); 7045 7046 // Scale the cost by the probability of executing the predicated blocks. 7047 // This assumes the predicated block for each vector lane is equally 7048 // likely. 
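      // (getReciprocalPredBlockProb() currently assumes a 50% execution
      // probability for predicated blocks, so the accumulated cost below is
      // halved.)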
7049 return Cost / getReciprocalPredBlockProb(); 7050 } 7051 LLVM_FALLTHROUGH; 7052 case Instruction::Add: 7053 case Instruction::FAdd: 7054 case Instruction::Sub: 7055 case Instruction::FSub: 7056 case Instruction::Mul: 7057 case Instruction::FMul: 7058 case Instruction::FDiv: 7059 case Instruction::FRem: 7060 case Instruction::Shl: 7061 case Instruction::LShr: 7062 case Instruction::AShr: 7063 case Instruction::And: 7064 case Instruction::Or: 7065 case Instruction::Xor: { 7066 // Since we will replace the stride by 1 the multiplication should go away. 7067 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7068 return 0; 7069 7070 // Detect reduction patterns 7071 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7072 return *RedCost; 7073 7074 // Certain instructions can be cheaper to vectorize if they have a constant 7075 // second vector operand. One example of this are shifts on x86. 7076 Value *Op2 = I->getOperand(1); 7077 TargetTransformInfo::OperandValueProperties Op2VP; 7078 TargetTransformInfo::OperandValueKind Op2VK = 7079 TTI.getOperandInfo(Op2, Op2VP); 7080 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7081 Op2VK = TargetTransformInfo::OK_UniformValue; 7082 7083 SmallVector<const Value *, 4> Operands(I->operand_values()); 7084 return TTI.getArithmeticInstrCost( 7085 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7086 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7087 } 7088 case Instruction::FNeg: { 7089 return TTI.getArithmeticInstrCost( 7090 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7091 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7092 TargetTransformInfo::OP_None, I->getOperand(0), I); 7093 } 7094 case Instruction::Select: { 7095 SelectInst *SI = cast<SelectInst>(I); 7096 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7097 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7098 7099 const Value *Op0, *Op1; 7100 using namespace llvm::PatternMatch; 7101 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7102 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7103 // select x, y, false --> x & y 7104 // select x, true, y --> x | y 7105 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7106 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7107 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7108 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7109 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7110 Op1->getType()->getScalarSizeInBits() == 1); 7111 7112 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7113 return TTI.getArithmeticInstrCost( 7114 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7115 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7116 } 7117 7118 Type *CondTy = SI->getCondition()->getType(); 7119 if (!ScalarCond) 7120 CondTy = VectorType::get(CondTy, VF); 7121 7122 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7123 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7124 Pred = Cmp->getPredicate(); 7125 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7126 CostKind, I); 7127 } 7128 case Instruction::ICmp: 7129 case Instruction::FCmp: { 7130 Type *ValTy = I->getOperand(0)->getType(); 7131 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7132 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7133 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7134 VectorTy = ToVectorTy(ValTy, VF); 7135 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7136 cast<CmpInst>(I)->getPredicate(), CostKind, 7137 I); 7138 } 7139 case Instruction::Store: 7140 case Instruction::Load: { 7141 ElementCount Width = VF; 7142 if (Width.isVector()) { 7143 InstWidening Decision = getWideningDecision(I, Width); 7144 assert(Decision != CM_Unknown && 7145 "CM decision should be taken at this point"); 7146 if (Decision == CM_Scalarize) { 7147 if (VF.isScalable() && isa<StoreInst>(I)) 7148 // We can't scalarize a scalable vector store (even a uniform one 7149 // currently), return an invalid cost so as to prevent vectorization. 7150 return InstructionCost::getInvalid(); 7151 Width = ElementCount::getFixed(1); 7152 } 7153 } 7154 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7155 return getMemoryInstructionCost(I, VF); 7156 } 7157 case Instruction::BitCast: 7158 if (I->getType()->isPointerTy()) 7159 return 0; 7160 LLVM_FALLTHROUGH; 7161 case Instruction::ZExt: 7162 case Instruction::SExt: 7163 case Instruction::FPToUI: 7164 case Instruction::FPToSI: 7165 case Instruction::FPExt: 7166 case Instruction::PtrToInt: 7167 case Instruction::IntToPtr: 7168 case Instruction::SIToFP: 7169 case Instruction::UIToFP: 7170 case Instruction::Trunc: 7171 case Instruction::FPTrunc: { 7172 // Computes the CastContextHint from a Load/Store instruction. 7173 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7174 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7175 "Expected a load or a store!"); 7176 7177 if (VF.isScalar() || !TheLoop->contains(I)) 7178 return TTI::CastContextHint::Normal; 7179 7180 switch (getWideningDecision(I, VF)) { 7181 case LoopVectorizationCostModel::CM_GatherScatter: 7182 return TTI::CastContextHint::GatherScatter; 7183 case LoopVectorizationCostModel::CM_Interleave: 7184 return TTI::CastContextHint::Interleave; 7185 case LoopVectorizationCostModel::CM_Scalarize: 7186 case LoopVectorizationCostModel::CM_Widen: 7187 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7188 : TTI::CastContextHint::Normal; 7189 case LoopVectorizationCostModel::CM_Widen_Reverse: 7190 return TTI::CastContextHint::Reversed; 7191 case LoopVectorizationCostModel::CM_Unknown: 7192 llvm_unreachable("Instr did not go through cost modelling?"); 7193 } 7194 7195 llvm_unreachable("Unhandled case!"); 7196 }; 7197 7198 unsigned Opcode = I->getOpcode(); 7199 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7200 // For Trunc, the context is the only user, which must be a StoreInst. 
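    // (e.g. "%t = trunc i32 %x to i16" feeding "store i16 %t, ...": the
    // store's widening decision provides the cast context.)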
7201 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7202 if (I->hasOneUse()) 7203 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7204 CCH = ComputeCCH(Store); 7205 } 7206 // For Z/Sext, the context is the operand, which must be a LoadInst. 7207 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7208 Opcode == Instruction::FPExt) { 7209 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7210 CCH = ComputeCCH(Load); 7211 } 7212 7213 // We optimize the truncation of induction variables having constant 7214 // integer steps. The cost of these truncations is the same as the scalar 7215 // operation. 7216 if (isOptimizableIVTruncate(I, VF)) { 7217 auto *Trunc = cast<TruncInst>(I); 7218 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7219 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7220 } 7221 7222 // Detect reduction patterns 7223 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7224 return *RedCost; 7225 7226 Type *SrcScalarTy = I->getOperand(0)->getType(); 7227 Type *SrcVecTy = 7228 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7229 if (canTruncateToMinimalBitwidth(I, VF)) { 7230 // This cast is going to be shrunk. This may remove the cast or it might 7231 // turn it into slightly different cast. For example, if MinBW == 16, 7232 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7233 // 7234 // Calculate the modified src and dest types. 7235 Type *MinVecTy = VectorTy; 7236 if (Opcode == Instruction::Trunc) { 7237 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7238 VectorTy = 7239 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7240 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7241 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7242 VectorTy = 7243 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7244 } 7245 } 7246 7247 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7248 } 7249 case Instruction::Call: { 7250 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7251 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7252 return *RedCost; 7253 bool NeedToScalarize; 7254 CallInst *CI = cast<CallInst>(I); 7255 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7256 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7257 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7258 return std::min(CallCost, IntrinsicCost); 7259 } 7260 return CallCost; 7261 } 7262 case Instruction::ExtractValue: 7263 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7264 case Instruction::Alloca: 7265 // We cannot easily widen alloca to a scalable alloca, as 7266 // the result would need to be a vector of pointers. 7267 if (VF.isScalable()) 7268 return InstructionCost::getInvalid(); 7269 LLVM_FALLTHROUGH; 7270 default: 7271 // This opcode is unknown. Assume that it is the same as 'mul'. 7272 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7273 } // end of switch. 
}

char LoopVectorize::ID = 0;

static const char lv_name[] = "Loop Vectorization";

INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)

namespace llvm {

Pass *createLoopVectorizePass() { return new LoopVectorize(); }

Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
                              bool VectorizeOnlyWhenForced) {
  return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
}

} // end namespace llvm

bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
  // Check if the pointer operand of a load or store instruction is
  // consecutive.
  if (auto *Ptr = getLoadStorePointerOperand(Inst))
    return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
  return false;
}

void LoopVectorizationCostModel::collectValuesToIgnore() {
  // Ignore ephemeral values.
  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);

  // Find all stores to invariant variables. Since they are going to sink
  // outside the loop, we do not need to calculate the cost for them.
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      StoreInst *SI;
      if ((SI = dyn_cast<StoreInst>(&I)) &&
          Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
        ValuesToIgnore.insert(&I);
    }

  // Ignore type-promoting instructions we identified during reduction
  // detection.
  for (auto &Reduction : Legal->getReductionVars()) {
    const RecurrenceDescriptor &RedDes = Reduction.second;
    const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
  // Ignore type-casting instructions we identified during induction
  // detection.
  for (auto &Induction : Legal->getInductionVars()) {
    const InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
}

void LoopVectorizationCostModel::collectInLoopReductions() {
  for (auto &Reduction : Legal->getReductionVars()) {
    PHINode *Phi = Reduction.first;
    const RecurrenceDescriptor &RdxDesc = Reduction.second;

    // We don't collect reductions that are type promoted (yet).
    if (RdxDesc.getRecurrenceType() != Phi->getType())
      continue;

    // If the target would prefer this reduction to happen "in-loop", then we
    // want to record it as such.
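    // (An in-loop reduction performs the reduction step inside the vector
    // loop each iteration, accumulating into a scalar, rather than keeping a
    // vector accumulator that is reduced once after the loop.)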
7357 unsigned Opcode = RdxDesc.getOpcode(); 7358 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7359 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7360 TargetTransformInfo::ReductionFlags())) 7361 continue; 7362 7363 // Check that we can correctly put the reductions into the loop, by 7364 // finding the chain of operations that leads from the phi to the loop 7365 // exit value. 7366 SmallVector<Instruction *, 4> ReductionOperations = 7367 RdxDesc.getReductionOpChain(Phi, TheLoop); 7368 bool InLoop = !ReductionOperations.empty(); 7369 if (InLoop) { 7370 InLoopReductionChains[Phi] = ReductionOperations; 7371 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7372 Instruction *LastChain = Phi; 7373 for (auto *I : ReductionOperations) { 7374 InLoopReductionImmediateChains[I] = LastChain; 7375 LastChain = I; 7376 } 7377 } 7378 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7379 << " reduction for phi: " << *Phi << "\n"); 7380 } 7381 } 7382 7383 // TODO: we could return a pair of values that specify the max VF and 7384 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7385 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7386 // doesn't have a cost model that can choose which plan to execute if 7387 // more than one is generated. 7388 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7389 LoopVectorizationCostModel &CM) { 7390 unsigned WidestType; 7391 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7392 return WidestVectorRegBits / WidestType; 7393 } 7394 7395 VectorizationFactor 7396 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7397 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7398 ElementCount VF = UserVF; 7399 // Outer loop handling: They may require CFG and instruction level 7400 // transformations before even evaluating whether vectorization is profitable. 7401 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7402 // the vectorization pipeline. 7403 if (!OrigLoop->isInnermost()) { 7404 // If the user doesn't provide a vectorization factor, determine a 7405 // reasonable one. 7406 if (UserVF.isZero()) { 7407 VF = ElementCount::getFixed(determineVPlanVF( 7408 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7409 .getFixedSize(), 7410 CM)); 7411 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7412 7413 // Make sure we have a VF > 1 for stress testing. 7414 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7415 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7416 << "overriding computed VF.\n"); 7417 VF = ElementCount::getFixed(4); 7418 } 7419 } 7420 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7421 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7422 "VF needs to be a power of two"); 7423 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7424 << "VF " << VF << " to build VPlans.\n"); 7425 buildVPlans(VF, VF); 7426 7427 // For VPlan build stress testing, we bail out after VPlan construction. 7428 if (VPlanBuildStressTest) 7429 return VectorizationFactor::Disabled(); 7430 7431 return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; 7432 } 7433 7434 LLVM_DEBUG( 7435 dbgs() << "LV: Not vectorizing. 
Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized or interleaved.
    return None;

  // Invalidate interleave groups if all blocks of the loop will be predicated.
  if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  ElementCount MaxUserVF =
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
  if (!UserVF.isZero() && UserVFIsLegal) {
    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (CM.selectUserVectorizationFactor(UserVF)) {
      LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
      CM.collectInLoopReductions();
      buildVPlansWithVPRecipes(UserVF, UserVF);
      LLVM_DEBUG(printPlans(dbgs()));
      return {{UserVF, 0, 0}};
    } else
      reportVectorizationInfo("UserVF ignored because of invalid costs.",
                              "InvalidCost", ORE, OrigLoop);
  }

  // Populate the set of Vectorization Factor Candidates.
  ElementCountSet VFCandidates;
  for (auto VF = ElementCount::getFixed(1);
       ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
    VFCandidates.insert(VF);
  for (auto VF = ElementCount::getScalable(1);
       ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
    VFCandidates.insert(VF);

  for (const auto &VF : VFCandidates) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF.isVector())
      CM.collectInstsToScalarize(VF);
  }

  CM.collectInLoopReductions();
  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);

  LLVM_DEBUG(printPlans(dbgs()));
  if (!MaxFactors.hasVector())
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
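  // (Roughly: selectVectorizationFactor compares the expected cost of each
  // candidate VF, fixed and scalable, against the scalar baseline and picks
  // the most profitable one.)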
7508 VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates); 7509 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); 7510 return VF; 7511 } 7512 7513 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7514 assert(count_if(VPlans, 7515 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7516 1 && 7517 "Best VF has not a single VPlan."); 7518 7519 for (const VPlanPtr &Plan : VPlans) { 7520 if (Plan->hasVF(VF)) 7521 return *Plan.get(); 7522 } 7523 llvm_unreachable("No plan found!"); 7524 } 7525 7526 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7527 SmallVector<Metadata *, 4> MDs; 7528 // Reserve first location for self reference to the LoopID metadata node. 7529 MDs.push_back(nullptr); 7530 bool IsUnrollMetadata = false; 7531 MDNode *LoopID = L->getLoopID(); 7532 if (LoopID) { 7533 // First find existing loop unrolling disable metadata. 7534 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7535 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7536 if (MD) { 7537 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7538 IsUnrollMetadata = 7539 S && S->getString().startswith("llvm.loop.unroll.disable"); 7540 } 7541 MDs.push_back(LoopID->getOperand(i)); 7542 } 7543 } 7544 7545 if (!IsUnrollMetadata) { 7546 // Add runtime unroll disable metadata. 7547 LLVMContext &Context = L->getHeader()->getContext(); 7548 SmallVector<Metadata *, 1> DisableOperands; 7549 DisableOperands.push_back( 7550 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7551 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7552 MDs.push_back(DisableNode); 7553 MDNode *NewLoopID = MDNode::get(Context, MDs); 7554 // Set operand 0 to refer to the loop id itself. 7555 NewLoopID->replaceOperandWith(0, NewLoopID); 7556 L->setLoopID(NewLoopID); 7557 } 7558 } 7559 7560 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7561 VPlan &BestVPlan, 7562 InnerLoopVectorizer &ILV, 7563 DominatorTree *DT, 7564 bool IsEpilogueVectorization) { 7565 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7566 << '\n'); 7567 7568 // Perform the actual loop transformation. 7569 7570 // 1. Set up the skeleton for vectorization, including vector pre-header and 7571 // middle block. The vector loop is created during VPlan execution. 7572 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7573 Value *CanonicalIVStartValue; 7574 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7575 ILV.createVectorizedLoopSkeleton(); 7576 7577 // Only use noalias metadata when using memory checks guaranteeing no overlap 7578 // across all iterations. 7579 const LoopAccessInfo *LAI = ILV.Legal->getLAI(); 7580 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && 7581 !LAI->getRuntimePointerChecking()->getDiffChecks()) { 7582 7583 // We currently don't use LoopVersioning for the actual loop cloning but we 7584 // still use it to add the noalias metadata. 7585 // TODO: Find a better way to re-use LoopVersioning functionality to add 7586 // metadata. 
7587 State.LVer = std::make_unique<LoopVersioning>( 7588 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, 7589 PSE.getSE()); 7590 State.LVer->prepareNoAliasMetadata(); 7591 } 7592 7593 ILV.collectPoisonGeneratingRecipes(State); 7594 7595 ILV.printDebugTracesAtStart(); 7596 7597 //===------------------------------------------------===// 7598 // 7599 // Notice: any optimization or new instruction that go 7600 // into the code below should also be implemented in 7601 // the cost-model. 7602 // 7603 //===------------------------------------------------===// 7604 7605 // 2. Copy and widen instructions from the old loop into the new loop. 7606 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), 7607 ILV.getOrCreateVectorTripCount(nullptr), 7608 CanonicalIVStartValue, State, 7609 IsEpilogueVectorization); 7610 7611 BestVPlan.execute(&State); 7612 7613 // Keep all loop hints from the original loop on the vector loop (we'll 7614 // replace the vectorizer-specific hints below). 7615 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7616 7617 Optional<MDNode *> VectorizedLoopID = 7618 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7619 LLVMLoopVectorizeFollowupVectorized}); 7620 7621 VPBasicBlock *HeaderVPBB = 7622 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); 7623 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); 7624 if (VectorizedLoopID) 7625 L->setLoopID(VectorizedLoopID.getValue()); 7626 else { 7627 // Keep all loop hints from the original loop on the vector loop (we'll 7628 // replace the vectorizer-specific hints below). 7629 if (MDNode *LID = OrigLoop->getLoopID()) 7630 L->setLoopID(LID); 7631 7632 LoopVectorizeHints Hints(L, true, *ORE); 7633 Hints.setAlreadyVectorized(); 7634 } 7635 // Disable runtime unrolling when vectorizing the epilogue loop. 7636 if (CanonicalIVStartValue) 7637 AddRuntimeUnrollDisableMetaData(L); 7638 7639 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7640 // predication, updating analyses. 7641 ILV.fixVectorizedLoop(State, BestVPlan); 7642 7643 ILV.printDebugTracesAtEnd(); 7644 } 7645 7646 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7647 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7648 for (const auto &Plan : VPlans) 7649 if (PrintVPlansInDotFormat) 7650 Plan->printDOT(O); 7651 else 7652 Plan->print(O); 7653 } 7654 #endif 7655 7656 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7657 7658 //===--------------------------------------------------------------------===// 7659 // EpilogueVectorizerMainLoop 7660 //===--------------------------------------------------------------------===// 7661 7662 /// This function is partially responsible for generating the control flow 7663 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7664 std::pair<BasicBlock *, Value *> 7665 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7666 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7667 7668 // Workaround! Compute the trip count of the original loop and cache it 7669 // before we start modifying the CFG. This code has a systemic problem 7670 // wherein it tries to run analysis over partially constructed IR; this is 7671 // wrong, and not simply for SCEV. The trip count of the original loop 7672 // simply happens to be prone to hitting this in practice. In theory, we 7673 // can hit the same issue for any SCEV, or ValueTracking query done during 7674 // mutation. See PR49900. 
7675 getOrCreateTripCount(OrigLoop->getLoopPreheader()); 7676 createVectorLoopSkeleton(""); 7677 7678 // Generate the code to check the minimum iteration count of the vector 7679 // epilogue (see below). 7680 EPI.EpilogueIterationCountCheck = 7681 emitIterationCountCheck(LoopScalarPreHeader, true); 7682 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7683 7684 // Generate the code to check any assumptions that we've made for SCEV 7685 // expressions. 7686 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7687 7688 // Generate the code that checks at runtime if arrays overlap. We put the 7689 // checks into a separate block to make the more common case of few elements 7690 // faster. 7691 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7692 7693 // Generate the iteration count check for the main loop, *after* the check 7694 // for the epilogue loop, so that the path-length is shorter for the case 7695 // that goes directly through the vector epilogue. The longer-path length for 7696 // the main loop is compensated for, by the gain from vectorizing the larger 7697 // trip count. Note: the branch will get updated later on when we vectorize 7698 // the epilogue. 7699 EPI.MainLoopIterationCountCheck = 7700 emitIterationCountCheck(LoopScalarPreHeader, false); 7701 7702 // Generate the induction variable. 7703 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7704 7705 // Skip induction resume value creation here because they will be created in 7706 // the second pass. If we created them here, they wouldn't be used anyway, 7707 // because the vplan in the second pass still contains the inductions from the 7708 // original loop. 7709 7710 return {completeLoopSkeleton(OrigLoopID), nullptr}; 7711 } 7712 7713 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7714 LLVM_DEBUG({ 7715 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7716 << "Main Loop VF:" << EPI.MainLoopVF 7717 << ", Main Loop UF:" << EPI.MainLoopUF 7718 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7719 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7720 }); 7721 } 7722 7723 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7724 DEBUG_WITH_TYPE(VerboseDebug, { 7725 dbgs() << "intermediate fn:\n" 7726 << *OrigLoop->getHeader()->getParent() << "\n"; 7727 }); 7728 } 7729 7730 BasicBlock * 7731 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7732 bool ForEpilogue) { 7733 assert(Bypass && "Expected valid bypass basic block."); 7734 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7735 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7736 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 7737 // Reuse existing vector loop preheader for TC checks. 7738 // Note that new preheader block is generated for vector loop. 7739 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7740 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7741 7742 // Generate code to check if the loop's trip count is less than VF * UF of the 7743 // main vector loop. 7744 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 7745 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7746 7747 Value *CheckMinIters = Builder.CreateICmp( 7748 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7749 "min.iters.check"); 7750 7751 if (!ForEpilogue) 7752 TCCheckBlock->setName("vector.main.loop.iter.check"); 7753 7754 // Create new preheader for vector loop. 
7755 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7756 DT, LI, nullptr, "vector.ph"); 7757 7758 if (ForEpilogue) { 7759 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7760 DT->getNode(Bypass)->getIDom()) && 7761 "TC check is expected to dominate Bypass"); 7762 7763 // Update dominator for Bypass & LoopExit. 7764 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7765 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7766 // For loops with multiple exits, there's no edge from the middle block 7767 // to exit blocks (as the epilogue must run) and thus no need to update 7768 // the immediate dominator of the exit blocks. 7769 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7770 7771 LoopBypassBlocks.push_back(TCCheckBlock); 7772 7773 // Save the trip count so we don't have to regenerate it in the 7774 // vec.epilog.iter.check. This is safe to do because the trip count 7775 // generated here dominates the vector epilog iter check. 7776 EPI.TripCount = Count; 7777 } 7778 7779 ReplaceInstWithInst( 7780 TCCheckBlock->getTerminator(), 7781 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7782 7783 return TCCheckBlock; 7784 } 7785 7786 //===--------------------------------------------------------------------===// 7787 // EpilogueVectorizerEpilogueLoop 7788 //===--------------------------------------------------------------------===// 7789 7790 /// This function is partially responsible for generating the control flow 7791 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7792 std::pair<BasicBlock *, Value *> 7793 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7794 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7795 createVectorLoopSkeleton("vec.epilog."); 7796 7797 // Now, compare the remaining count and if there aren't enough iterations to 7798 // execute the vectorized epilogue skip to the scalar part. 7799 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7800 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7801 LoopVectorPreHeader = 7802 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7803 LI, nullptr, "vec.epilog.ph"); 7804 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, 7805 VecEpilogueIterationCountCheck); 7806 7807 // Adjust the control flow taking the state info from the main loop 7808 // vectorization into account. 
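  // (The iteration-count checks emitted by the first pass still branch to the
  // block that has just been repurposed as vec.epilog.iter.check, so their
  // branch targets and the dominator tree are rewired below.)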
7809 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7810 "expected this to be saved from the previous pass."); 7811 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7812 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7813 7814 DT->changeImmediateDominator(LoopVectorPreHeader, 7815 EPI.MainLoopIterationCountCheck); 7816 7817 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7818 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7819 7820 if (EPI.SCEVSafetyCheck) 7821 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7822 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7823 if (EPI.MemSafetyCheck) 7824 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7825 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7826 7827 DT->changeImmediateDominator( 7828 VecEpilogueIterationCountCheck, 7829 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7830 7831 DT->changeImmediateDominator(LoopScalarPreHeader, 7832 EPI.EpilogueIterationCountCheck); 7833 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7834 // If there is an epilogue which must run, there's no edge from the 7835 // middle block to exit blocks and thus no need to update the immediate 7836 // dominator of the exit blocks. 7837 DT->changeImmediateDominator(LoopExitBlock, 7838 EPI.EpilogueIterationCountCheck); 7839 7840 // Keep track of bypass blocks, as they feed start values to the induction 7841 // phis in the scalar loop preheader. 7842 if (EPI.SCEVSafetyCheck) 7843 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7844 if (EPI.MemSafetyCheck) 7845 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7846 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7847 7848 // The vec.epilog.iter.check block may contain Phi nodes from reductions which 7849 // merge control-flow from the latch block and the middle block. Update the 7850 // incoming values here and move the Phi into the preheader. 7851 SmallVector<PHINode *, 4> PhisInBlock; 7852 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) 7853 PhisInBlock.push_back(&Phi); 7854 7855 for (PHINode *Phi : PhisInBlock) { 7856 Phi->replaceIncomingBlockWith( 7857 VecEpilogueIterationCountCheck->getSinglePredecessor(), 7858 VecEpilogueIterationCountCheck); 7859 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); 7860 if (EPI.SCEVSafetyCheck) 7861 Phi->removeIncomingValue(EPI.SCEVSafetyCheck); 7862 if (EPI.MemSafetyCheck) 7863 Phi->removeIncomingValue(EPI.MemSafetyCheck); 7864 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); 7865 } 7866 7867 // Generate a resume induction for the vector epilogue and put it in the 7868 // vector epilogue preheader 7869 Type *IdxTy = Legal->getWidestInductionType(); 7870 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 7871 LoopVectorPreHeader->getFirstNonPHI()); 7872 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 7873 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 7874 EPI.MainLoopIterationCountCheck); 7875 7876 // Generate induction resume values. These variables save the new starting 7877 // indexes for the scalar loop. They are used to test if there are any tail 7878 // iterations left once the vector loop has completed. 
  // Note that when the vectorized epilogue is skipped due to the iteration
  // count check, the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues({VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  return {completeLoopSkeleton(OrigLoopID), EPResumeVal};
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of the
  // vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
      ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;

  Value *CheckMinIters =
      Builder.CreateICmp(P, Count,
                         createStepForVF(Builder, Count->getType(),
                                         EPI.EpilogueVF, EPI.EpilogueUF),
                         "min.epilog.iters.check");

  ReplaceInstWithInst(
      Insert->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));

  LoopBypassBlocks.push_back(Insert);
  return Insert;
}

void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
  LLVM_DEBUG({
    dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
           << "Epilogue Loop VF:" << EPI.EpilogueVF
           << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
  });
}

void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
  DEBUG_WITH_TYPE(VerboseDebug, {
    dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
  });
}

bool LoopVectorizationPlanner::getDecisionAndClampRange(
    const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
  assert(!Range.isEmpty() && "Trying to test an empty VF range.");
  bool PredicateAtRangeStart = Predicate(Range.Start);

  for (ElementCount TmpVF = Range.Start * 2;
       ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
    if (Predicate(TmpVF) != PredicateAtRangeStart) {
      Range.End = TmpVF;
      break;
    }

  return PredicateAtRangeStart;
}

/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VF's starting at a given VF and extending it as much as possible. Each
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
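/// For example, with MinVF = 2 and MaxVF = 16 this might produce one VPlan
/// covering {2, 4} and another covering {8, 16}, if some widening decision
/// differs between VF = 4 and VF = 8.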
7956 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 7957 ElementCount MaxVF) { 7958 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 7959 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 7960 VFRange SubRange = {VF, MaxVFPlusOne}; 7961 VPlans.push_back(buildVPlan(SubRange)); 7962 VF = SubRange.End; 7963 } 7964 } 7965 7966 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 7967 VPlanPtr &Plan) { 7968 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7969 7970 // Look for cached value. 7971 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 7972 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 7973 if (ECEntryIt != EdgeMaskCache.end()) 7974 return ECEntryIt->second; 7975 7976 VPValue *SrcMask = createBlockInMask(Src, Plan); 7977 7978 // The terminator has to be a branch inst! 7979 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 7980 assert(BI && "Unexpected terminator found"); 7981 7982 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 7983 return EdgeMaskCache[Edge] = SrcMask; 7984 7985 // If source is an exiting block, we know the exit edge is dynamically dead 7986 // in the vector loop, and thus we don't need to restrict the mask. Avoid 7987 // adding uses of an otherwise potentially dead instruction. 7988 if (OrigLoop->isLoopExiting(Src)) 7989 return EdgeMaskCache[Edge] = SrcMask; 7990 7991 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 7992 assert(EdgeMask && "No Edge Mask found for condition"); 7993 7994 if (BI->getSuccessor(0) != Dst) 7995 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 7996 7997 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 7998 // The condition is 'SrcMask && EdgeMask', which is equivalent to 7999 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8000 // The select version does not introduce new UB if SrcMask is false and 8001 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8002 VPValue *False = Plan->getOrAddVPValue( 8003 ConstantInt::getFalse(BI->getCondition()->getType())); 8004 EdgeMask = 8005 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8006 } 8007 8008 return EdgeMaskCache[Edge] = EdgeMask; 8009 } 8010 8011 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8012 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8013 8014 // Look for cached value. 8015 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8016 if (BCEntryIt != BlockMaskCache.end()) 8017 return BCEntryIt->second; 8018 8019 // All-one mask is modelled as no-mask following the convention for masked 8020 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8021 VPValue *BlockMask = nullptr; 8022 8023 if (OrigLoop->getHeader() == BB) { 8024 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8025 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8026 8027 assert(CM.foldTailByMasking() && "must fold the tail"); 8028 8029 // If we're using the active lane mask for control flow, then we get the 8030 // mask from the active lane mask PHI that is cached in the VPlan. 8031 PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask(); 8032 if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow) 8033 return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi(); 8034 8035 // Introduce the early-exit compare IV <= BTC to form header block mask. 
8036 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8037 // constructing the desired canonical IV in the header block as its first 8038 // non-phi instructions. 8039 8040 VPBasicBlock *HeaderVPBB = 8041 Plan->getVectorLoopRegion()->getEntryBasicBlock(); 8042 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8043 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); 8044 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); 8045 8046 VPBuilder::InsertPointGuard Guard(Builder); 8047 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8048 if (EmitGetActiveLaneMask != PredicationStyle::None) { 8049 VPValue *TC = Plan->getOrCreateTripCount(); 8050 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, 8051 nullptr, "active.lane.mask"); 8052 } else { 8053 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8054 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8055 } 8056 return BlockMaskCache[BB] = BlockMask; 8057 } 8058 8059 // This is the block mask. We OR all incoming edges. 8060 for (auto *Predecessor : predecessors(BB)) { 8061 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8062 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8063 return BlockMaskCache[BB] = EdgeMask; 8064 8065 if (!BlockMask) { // BlockMask has its initialized nullptr value. 8066 BlockMask = EdgeMask; 8067 continue; 8068 } 8069 8070 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8071 } 8072 8073 return BlockMaskCache[BB] = BlockMask; 8074 } 8075 8076 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8077 ArrayRef<VPValue *> Operands, 8078 VFRange &Range, 8079 VPlanPtr &Plan) { 8080 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8081 "Must be called with either a load or store"); 8082 8083 auto willWiden = [&](ElementCount VF) -> bool { 8084 LoopVectorizationCostModel::InstWidening Decision = 8085 CM.getWideningDecision(I, VF); 8086 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8087 "CM decision should be taken at this point."); 8088 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8089 return true; 8090 if (CM.isScalarAfterVectorization(I, VF) || 8091 CM.isProfitableToScalarize(I, VF)) 8092 return false; 8093 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8094 }; 8095 8096 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8097 return nullptr; 8098 8099 VPValue *Mask = nullptr; 8100 if (Legal->isMaskRequired(I)) 8101 Mask = createBlockInMask(I->getParent(), Plan); 8102 8103 // Determine if the pointer operand of the access is either consecutive or 8104 // reverse consecutive. 8105 LoopVectorizationCostModel::InstWidening Decision = 8106 CM.getWideningDecision(I, Range.Start); 8107 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8108 bool Consecutive = 8109 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8110 8111 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8112 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8113 Consecutive, Reverse); 8114 8115 StoreInst *Store = cast<StoreInst>(I); 8116 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8117 Mask, Consecutive, Reverse); 8118 } 8119 8120 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also 8121 /// insert a recipe to expand the step for the induction recipe. 
8122 static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes( 8123 PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, 8124 const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM, 8125 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) { 8126 // Returns true if an instruction \p I should be scalarized instead of 8127 // vectorized for the chosen vectorization factor. 8128 auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { 8129 return CM.isScalarAfterVectorization(I, VF) || 8130 CM.isProfitableToScalarize(I, VF); 8131 }; 8132 8133 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( 8134 [&](ElementCount VF) { 8135 return ShouldScalarizeInstruction(PhiOrTrunc, VF); 8136 }, 8137 Range); 8138 assert(IndDesc.getStartValue() == 8139 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8140 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && 8141 "step must be loop invariant"); 8142 8143 VPValue *Step = 8144 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); 8145 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8146 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI, 8147 !NeedsScalarIVOnly); 8148 } 8149 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8150 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, 8151 !NeedsScalarIVOnly); 8152 } 8153 8154 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( 8155 PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) { 8156 8157 // Check if this is an integer or fp induction. If so, build the recipe that 8158 // produces its scalar and vector values. 8159 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8160 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan, 8161 *PSE.getSE(), *OrigLoop, Range); 8162 8163 // Check if this is pointer induction. If so, build the recipe for it. 8164 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) 8165 return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II, 8166 *PSE.getSE()); 8167 return nullptr; 8168 } 8169 8170 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8171 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) { 8172 // Optimize the special case where the source is a constant integer 8173 // induction variable. Notice that we can only optimize the 'trunc' case 8174 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8175 // (c) other casts depend on pointer size. 8176 8177 // Determine whether \p K is a truncation based on an induction variable that 8178 // can be optimized. 
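  // (e.g. "%t = trunc i64 %iv to i32" where %iv is an integer induction with
  // a constant step: the truncation can be folded into a narrower induction
  // rather than widening the i64 IV and truncating every element.)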
8179 auto isOptimizableIVTruncate = 8180 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8181 return [=](ElementCount VF) -> bool { 8182 return CM.isOptimizableIVTruncate(K, VF); 8183 }; 8184 }; 8185 8186 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8187 isOptimizableIVTruncate(I), Range)) { 8188 8189 auto *Phi = cast<PHINode>(I->getOperand(0)); 8190 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8191 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8192 return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan, 8193 *PSE.getSE(), *OrigLoop, Range); 8194 } 8195 return nullptr; 8196 } 8197 8198 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8199 ArrayRef<VPValue *> Operands, 8200 VPlanPtr &Plan) { 8201 // If all incoming values are equal, the incoming VPValue can be used directly 8202 // instead of creating a new VPBlendRecipe. 8203 VPValue *FirstIncoming = Operands[0]; 8204 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8205 return FirstIncoming == Inc; 8206 })) { 8207 return Operands[0]; 8208 } 8209 8210 unsigned NumIncoming = Phi->getNumIncomingValues(); 8211 // For in-loop reductions, we do not need to create an additional select. 8212 VPValue *InLoopVal = nullptr; 8213 for (unsigned In = 0; In < NumIncoming; In++) { 8214 PHINode *PhiOp = 8215 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue()); 8216 if (PhiOp && CM.isInLoopReduction(PhiOp)) { 8217 assert(!InLoopVal && "Found more than one in-loop reduction!"); 8218 InLoopVal = Operands[In]; 8219 } 8220 } 8221 8222 assert((!InLoopVal || NumIncoming == 2) && 8223 "Found an in-loop reduction for PHI with unexpected number of " 8224 "incoming values"); 8225 if (InLoopVal) 8226 return Operands[Operands[0] == InLoopVal ? 1 : 0]; 8227 8228 // We know that all PHIs in non-header blocks are converted into selects, so 8229 // we don't have to worry about the insertion order and we can just use the 8230 // builder. At this point we generate the predication tree. There may be 8231 // duplications since this is a simple recursive scan, but future 8232 // optimizations will clean it up. 
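  // The blend operands below are laid out as interleaved (value, edge-mask)
  // pairs, e.g. (In0, Mask0, In1, Mask1) for two predecessors; Mask0 ends up
  // unused when the selects are generated (see VPBlendRecipe::execute).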
8233 SmallVector<VPValue *, 2> OperandsWithMask; 8234 8235 for (unsigned In = 0; In < NumIncoming; In++) { 8236 VPValue *EdgeMask = 8237 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8238 assert((EdgeMask || NumIncoming == 1) && 8239 "Multiple predecessors with one having a full mask"); 8240 OperandsWithMask.push_back(Operands[In]); 8241 if (EdgeMask) 8242 OperandsWithMask.push_back(EdgeMask); 8243 } 8244 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8245 } 8246 8247 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8248 ArrayRef<VPValue *> Operands, 8249 VFRange &Range) const { 8250 8251 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8252 [this, CI](ElementCount VF) { 8253 return CM.isScalarWithPredication(CI, VF); 8254 }, 8255 Range); 8256 8257 if (IsPredicated) 8258 return nullptr; 8259 8260 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8261 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8262 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8263 ID == Intrinsic::pseudoprobe || 8264 ID == Intrinsic::experimental_noalias_scope_decl)) 8265 return nullptr; 8266 8267 auto willWiden = [&](ElementCount VF) -> bool { 8268 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8269 // The following case may be scalarized depending on the VF. 8270 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8271 // version of the instruction. 8272 // Is it beneficial to perform intrinsic call compared to lib call? 8273 bool NeedToScalarize = false; 8274 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8275 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; 8276 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8277 return UseVectorIntrinsic || !NeedToScalarize; 8278 }; 8279 8280 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8281 return nullptr; 8282 8283 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8284 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8285 } 8286 8287 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8288 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8289 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8290 // Instruction should be widened, unless it is scalar after vectorization, 8291 // scalarization is profitable or it is predicated. 
8292 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8293 return CM.isScalarAfterVectorization(I, VF) || 8294 CM.isProfitableToScalarize(I, VF) || 8295 CM.isScalarWithPredication(I, VF); 8296 }; 8297 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8298 Range); 8299 } 8300 8301 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8302 ArrayRef<VPValue *> Operands) const { 8303 auto IsVectorizableOpcode = [](unsigned Opcode) { 8304 switch (Opcode) { 8305 case Instruction::Add: 8306 case Instruction::And: 8307 case Instruction::AShr: 8308 case Instruction::BitCast: 8309 case Instruction::FAdd: 8310 case Instruction::FCmp: 8311 case Instruction::FDiv: 8312 case Instruction::FMul: 8313 case Instruction::FNeg: 8314 case Instruction::FPExt: 8315 case Instruction::FPToSI: 8316 case Instruction::FPToUI: 8317 case Instruction::FPTrunc: 8318 case Instruction::FRem: 8319 case Instruction::FSub: 8320 case Instruction::ICmp: 8321 case Instruction::IntToPtr: 8322 case Instruction::LShr: 8323 case Instruction::Mul: 8324 case Instruction::Or: 8325 case Instruction::PtrToInt: 8326 case Instruction::SDiv: 8327 case Instruction::Select: 8328 case Instruction::SExt: 8329 case Instruction::Shl: 8330 case Instruction::SIToFP: 8331 case Instruction::SRem: 8332 case Instruction::Sub: 8333 case Instruction::Trunc: 8334 case Instruction::UDiv: 8335 case Instruction::UIToFP: 8336 case Instruction::URem: 8337 case Instruction::Xor: 8338 case Instruction::ZExt: 8339 case Instruction::Freeze: 8340 return true; 8341 } 8342 return false; 8343 }; 8344 8345 if (!IsVectorizableOpcode(I->getOpcode())) 8346 return nullptr; 8347 8348 // Success: widen this instruction. 8349 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8350 } 8351 8352 void VPRecipeBuilder::fixHeaderPhis() { 8353 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8354 for (VPHeaderPHIRecipe *R : PhisToFix) { 8355 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8356 VPRecipeBase *IncR = 8357 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8358 R->addOperand(IncR->getVPSingleValue()); 8359 } 8360 } 8361 8362 VPBasicBlock *VPRecipeBuilder::handleReplication( 8363 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8364 VPlanPtr &Plan) { 8365 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8366 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8367 Range); 8368 8369 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8370 [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, 8371 Range); 8372 8373 // Even if the instruction is not marked as uniform, there are certain 8374 // intrinsic calls that can be effectively treated as such, so we check for 8375 // them here. Conservatively, we only do this for scalable vectors, since 8376 // for fixed-width VFs we can always fall back on full scalarization. 8377 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8378 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8379 case Intrinsic::assume: 8380 case Intrinsic::lifetime_start: 8381 case Intrinsic::lifetime_end: 8382 // For scalable vectors if one of the operands is variant then we still 8383 // want to mark as uniform, which will generate one instruction for just 8384 // the first lane of the vector. We can't scalarize the call in the same 8385 // way as for fixed-width vectors because we don't know how many lanes 8386 // there are. 
8387 // 8388 // The reasons for doing it this way for scalable vectors are: 8389 // 1. For the assume intrinsic generating the instruction for the first 8390 // lane is still be better than not generating any at all. For 8391 // example, the input may be a splat across all lanes. 8392 // 2. For the lifetime start/end intrinsics the pointer operand only 8393 // does anything useful when the input comes from a stack object, 8394 // which suggests it should always be uniform. For non-stack objects 8395 // the effect is to poison the object, which still allows us to 8396 // remove the call. 8397 IsUniform = true; 8398 break; 8399 default: 8400 break; 8401 } 8402 } 8403 8404 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8405 IsUniform, IsPredicated); 8406 setRecipe(I, Recipe); 8407 Plan->addVPValue(I, Recipe); 8408 8409 // Find if I uses a predicated instruction. If so, it will use its scalar 8410 // value. Avoid hoisting the insert-element which packs the scalar value into 8411 // a vector value, as that happens iff all users use the vector value. 8412 for (VPValue *Op : Recipe->operands()) { 8413 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8414 if (!PredR) 8415 continue; 8416 auto *RepR = 8417 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8418 assert(RepR->isPredicated() && 8419 "expected Replicate recipe to be predicated"); 8420 RepR->setAlsoPack(false); 8421 } 8422 8423 // Finalize the recipe for Instr, first if it is not predicated. 8424 if (!IsPredicated) { 8425 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8426 VPBB->appendRecipe(Recipe); 8427 return VPBB; 8428 } 8429 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8430 8431 VPBlockBase *SingleSucc = VPBB->getSingleSuccessor(); 8432 assert(SingleSucc && "VPBB must have a single successor when handling " 8433 "predicated replication."); 8434 VPBlockUtils::disconnectBlocks(VPBB, SingleSucc); 8435 // Record predicated instructions for above packing optimizations. 8436 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8437 VPBlockUtils::insertBlockAfter(Region, VPBB); 8438 auto *RegSucc = new VPBasicBlock(); 8439 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8440 VPBlockUtils::connectBlocks(RegSucc, SingleSucc); 8441 return RegSucc; 8442 } 8443 8444 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8445 VPRecipeBase *PredRecipe, 8446 VPlanPtr &Plan) { 8447 // Instructions marked for predication are replicated and placed under an 8448 // if-then construct to prevent side-effects. 8449 8450 // Generate recipes to compute the block mask for this region. 8451 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8452 8453 // Build the triangular if-then region. 8454 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8455 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8456 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8457 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8458 auto *PHIRecipe = Instr->getType()->isVoidTy() 8459 ? 
nullptr 8460 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8461 if (PHIRecipe) { 8462 Plan->removeVPValueFor(Instr); 8463 Plan->addVPValue(Instr, PHIRecipe); 8464 } 8465 auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8466 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8467 VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true); 8468 8469 // Note: first set Entry as region entry and then connect successors starting 8470 // from it in order, to propagate the "parent" of each VPBasicBlock. 8471 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry); 8472 VPBlockUtils::connectBlocks(Pred, Exiting); 8473 8474 return Region; 8475 } 8476 8477 VPRecipeOrVPValueTy 8478 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8479 ArrayRef<VPValue *> Operands, 8480 VFRange &Range, VPlanPtr &Plan) { 8481 // First, check for specific widening recipes that deal with inductions, Phi 8482 // nodes, calls and memory operations. 8483 VPRecipeBase *Recipe; 8484 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8485 if (Phi->getParent() != OrigLoop->getHeader()) 8486 return tryToBlend(Phi, Operands, Plan); 8487 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) 8488 return toVPRecipeResult(Recipe); 8489 8490 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8491 assert((Legal->isReductionVariable(Phi) || 8492 Legal->isFirstOrderRecurrence(Phi)) && 8493 "can only widen reductions and first-order recurrences here"); 8494 VPValue *StartV = Operands[0]; 8495 if (Legal->isReductionVariable(Phi)) { 8496 const RecurrenceDescriptor &RdxDesc = 8497 Legal->getReductionVars().find(Phi)->second; 8498 assert(RdxDesc.getRecurrenceStartValue() == 8499 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8500 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8501 CM.isInLoopReduction(Phi), 8502 CM.useOrderedReductions(RdxDesc)); 8503 } else { 8504 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8505 } 8506 8507 // Record the incoming value from the backedge, so we can add the incoming 8508 // value from the backedge after all recipes have been created. 8509 recordRecipeOf(cast<Instruction>( 8510 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8511 PhisToFix.push_back(PhiRecipe); 8512 return toVPRecipeResult(PhiRecipe); 8513 } 8514 8515 if (isa<TruncInst>(Instr) && 8516 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8517 Range, *Plan))) 8518 return toVPRecipeResult(Recipe); 8519 8520 // All widen recipes below deal only with VF > 1. 
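  // (getDecisionAndClampRange returns the decision for Range.Start and clamps
  // Range.End so that every VF remaining in the range shares that decision;
  // e.g. a range [1, 16) is clamped to [1, 2) here when Range.Start is
  // scalar.)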
8521 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8522 [&](ElementCount VF) { return VF.isScalar(); }, Range)) 8523 return nullptr; 8524 8525 if (auto *CI = dyn_cast<CallInst>(Instr)) 8526 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8527 8528 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8529 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8530 8531 if (!shouldWiden(Instr, Range)) 8532 return nullptr; 8533 8534 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8535 return toVPRecipeResult(new VPWidenGEPRecipe( 8536 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 8537 8538 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8539 bool InvariantCond = 8540 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8541 return toVPRecipeResult(new VPWidenSelectRecipe( 8542 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 8543 } 8544 8545 return toVPRecipeResult(tryToWiden(Instr, Operands)); 8546 } 8547 8548 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8549 ElementCount MaxVF) { 8550 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8551 8552 // Add assume instructions we need to drop to DeadInstructions, to prevent 8553 // them from being added to the VPlan. 8554 // TODO: We only need to drop assumes in blocks that get flattend. If the 8555 // control flow is preserved, we should keep them. 8556 SmallPtrSet<Instruction *, 4> DeadInstructions; 8557 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 8558 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 8559 8560 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 8561 // Dead instructions do not need sinking. Remove them from SinkAfter. 8562 for (Instruction *I : DeadInstructions) 8563 SinkAfter.erase(I); 8564 8565 // Cannot sink instructions after dead instructions (there won't be any 8566 // recipes for them). Instead, find the first non-dead previous instruction. 8567 for (auto &P : Legal->getSinkAfter()) { 8568 Instruction *SinkTarget = P.second; 8569 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 8570 (void)FirstInst; 8571 while (DeadInstructions.contains(SinkTarget)) { 8572 assert( 8573 SinkTarget != FirstInst && 8574 "Must find a live instruction (at least the one feeding the " 8575 "first-order recurrence PHI) before reaching beginning of the block"); 8576 SinkTarget = SinkTarget->getPrevNode(); 8577 assert(SinkTarget != P.first && 8578 "sink source equals target, no sinking required"); 8579 } 8580 P.second = SinkTarget; 8581 } 8582 8583 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8584 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8585 VFRange SubRange = {VF, MaxVFPlusOne}; 8586 VPlans.push_back( 8587 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8588 VF = SubRange.End; 8589 } 8590 } 8591 8592 // Add the necessary canonical IV and branch recipes required to control the 8593 // loop. 8594 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, 8595 bool HasNUW, 8596 bool UseLaneMaskForLoopControlFlow) { 8597 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8598 auto *StartV = Plan.getOrAddVPValue(StartIdx); 8599 8600 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. 
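  // (Rough sketch of the control-flow backbone built here: the canonical IV
  // phi starts at 0, index.next increments it by VF * UF in the latch, and
  // the latch branches on BranchOnCount(index.next, vector-trip-count); with
  // an active lane mask the branch is instead taken on the negated lane mask
  // computed for the next iteration, see below.)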
8601 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8602 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8603 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8604 Header->insert(CanonicalIVPHI, Header->begin()); 8605 8606 // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar 8607 // IV by VF * UF. 8608 auto *CanonicalIVIncrement = 8609 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW 8610 : VPInstruction::CanonicalIVIncrement, 8611 {CanonicalIVPHI}, DL, "index.next"); 8612 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8613 8614 VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); 8615 EB->appendRecipe(CanonicalIVIncrement); 8616 8617 if (UseLaneMaskForLoopControlFlow) { 8618 // Create the active lane mask instruction in the vplan preheader. 8619 VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock(); 8620 8621 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since 8622 // we have to take unrolling into account. Each part needs to start at 8623 // Part * VF 8624 auto *CanonicalIVIncrementParts = 8625 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW 8626 : VPInstruction::CanonicalIVIncrementForPart, 8627 {StartV}, DL, "index.part.next"); 8628 Preheader->appendRecipe(CanonicalIVIncrementParts); 8629 8630 // Create the ActiveLaneMask instruction using the correct start values. 8631 VPValue *TC = Plan.getOrCreateTripCount(); 8632 auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask, 8633 {CanonicalIVIncrementParts, TC}, DL, 8634 "active.lane.mask.entry"); 8635 Preheader->appendRecipe(EntryALM); 8636 8637 // Now create the ActiveLaneMaskPhi recipe in the main loop using the 8638 // preheader ActiveLaneMask instruction. 8639 auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); 8640 Header->insert(LaneMaskPhi, Header->getFirstNonPhi()); 8641 8642 // Create the active lane mask for the next iteration of the loop. 8643 CanonicalIVIncrementParts = 8644 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW 8645 : VPInstruction::CanonicalIVIncrementForPart, 8646 {CanonicalIVIncrement}, DL); 8647 EB->appendRecipe(CanonicalIVIncrementParts); 8648 8649 auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, 8650 {CanonicalIVIncrementParts, TC}, DL, 8651 "active.lane.mask.next"); 8652 EB->appendRecipe(ALM); 8653 LaneMaskPhi->addOperand(ALM); 8654 8655 // We have to invert the mask here because a true condition means jumping 8656 // to the exit block. 8657 auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL); 8658 EB->appendRecipe(NotMask); 8659 8660 VPInstruction *BranchBack = 8661 new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL); 8662 EB->appendRecipe(BranchBack); 8663 } else { 8664 // Add the BranchOnCount VPInstruction to the latch. 8665 VPInstruction *BranchBack = new VPInstruction( 8666 VPInstruction::BranchOnCount, 8667 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8668 EB->appendRecipe(BranchBack); 8669 } 8670 } 8671 8672 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the 8673 // original exit block. 8674 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, 8675 VPBasicBlock *MiddleVPBB, Loop *OrigLoop, 8676 VPlan &Plan) { 8677 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); 8678 BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); 8679 // Only handle single-exit loops with unique exit blocks for now. 
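  // For example (illustrative names), an LCSSA phi in the exit block such as
  //   %res.lcssa = phi i32 [ %res, %loop.exiting ]
  // is modeled by adding a VPLiveOut for %res.lcssa that uses the VPValue
  // producing %res.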
8680 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) 8681 return; 8682 8683 // Introduce VPUsers modeling the exit values. 8684 for (PHINode &ExitPhi : ExitBB->phis()) { 8685 Value *IncomingValue = 8686 ExitPhi.getIncomingValueForBlock(ExitingBB); 8687 VPValue *V = Plan.getOrAddVPValue(IncomingValue, true); 8688 Plan.addLiveOut(&ExitPhi, V); 8689 } 8690 } 8691 8692 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8693 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8694 const MapVector<Instruction *, Instruction *> &SinkAfter) { 8695 8696 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8697 8698 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8699 8700 // --------------------------------------------------------------------------- 8701 // Pre-construction: record ingredients whose recipes we'll need to further 8702 // process after constructing the initial VPlan. 8703 // --------------------------------------------------------------------------- 8704 8705 // Mark instructions we'll need to sink later and their targets as 8706 // ingredients whose recipe we'll need to record. 8707 for (auto &Entry : SinkAfter) { 8708 RecipeBuilder.recordRecipeOf(Entry.first); 8709 RecipeBuilder.recordRecipeOf(Entry.second); 8710 } 8711 for (auto &Reduction : CM.getInLoopReductionChains()) { 8712 PHINode *Phi = Reduction.first; 8713 RecurKind Kind = 8714 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); 8715 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8716 8717 RecipeBuilder.recordRecipeOf(Phi); 8718 for (auto &R : ReductionOperations) { 8719 RecipeBuilder.recordRecipeOf(R); 8720 // For min/max reductions, where we have a pair of icmp/select, we also 8721 // need to record the ICmp recipe, so it can be removed later. 8722 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 8723 "Only min/max recurrences allowed for inloop reductions"); 8724 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 8725 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8726 } 8727 } 8728 8729 // For each interleave group which is relevant for this (possibly trimmed) 8730 // Range, add it to the set of groups to be later applied to the VPlan and add 8731 // placeholders for its members' Recipes which we'll be replacing with a 8732 // single VPInterleaveRecipe. 8733 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8734 auto applyIG = [IG, this](ElementCount VF) -> bool { 8735 return (VF.isVector() && // Query is illegal for VF == 1 8736 CM.getWideningDecision(IG->getInsertPos(), VF) == 8737 LoopVectorizationCostModel::CM_Interleave); 8738 }; 8739 if (!getDecisionAndClampRange(applyIG, Range)) 8740 continue; 8741 InterleaveGroups.insert(IG); 8742 for (unsigned i = 0; i < IG->getFactor(); i++) 8743 if (Instruction *Member = IG->getMember(i)) 8744 RecipeBuilder.recordRecipeOf(Member); 8745 }; 8746 8747 // --------------------------------------------------------------------------- 8748 // Build initial VPlan: Scan the body of the loop in a topological order to 8749 // visit each basic block after having visited its predecessor basic blocks. 8750 // --------------------------------------------------------------------------- 8751 8752 // Create initial VPlan skeleton, starting with a block for the pre-header, 8753 // followed by a region for the vector loop, followed by the middle block. 
The 8754 // skeleton vector loop region contains a header and latch block. 8755 VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); 8756 auto Plan = std::make_unique<VPlan>(Preheader); 8757 8758 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); 8759 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 8760 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 8761 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); 8762 VPBlockUtils::insertBlockAfter(TopRegion, Preheader); 8763 VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); 8764 VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); 8765 8766 Instruction *DLInst = 8767 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 8768 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), 8769 DLInst ? DLInst->getDebugLoc() : DebugLoc(), 8770 !CM.foldTailByMasking(), 8771 CM.useActiveLaneMaskForControlFlow()); 8772 8773 // Scan the body of the loop in a topological order to visit each basic block 8774 // after having visited its predecessor basic blocks. 8775 LoopBlocksDFS DFS(OrigLoop); 8776 DFS.perform(LI); 8777 8778 VPBasicBlock *VPBB = HeaderVPBB; 8779 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 8780 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8781 // Relevant instructions from basic block BB will be grouped into VPRecipe 8782 // ingredients and fill a new VPBasicBlock. 8783 unsigned VPBBsForBB = 0; 8784 if (VPBB != HeaderVPBB) 8785 VPBB->setName(BB->getName()); 8786 Builder.setInsertPoint(VPBB); 8787 8788 // Introduce each ingredient into VPlan. 8789 // TODO: Model and preserve debug intrinsics in VPlan. 8790 for (Instruction &I : BB->instructionsWithoutDebug()) { 8791 Instruction *Instr = &I; 8792 8793 // First filter out irrelevant instructions, to ensure no recipes are 8794 // built for them. 8795 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 8796 continue; 8797 8798 SmallVector<VPValue *, 4> Operands; 8799 auto *Phi = dyn_cast<PHINode>(Instr); 8800 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 8801 Operands.push_back(Plan->getOrAddVPValue( 8802 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 8803 } else { 8804 auto OpRange = Plan->mapToVPValues(Instr->operands()); 8805 Operands = {OpRange.begin(), OpRange.end()}; 8806 } 8807 8808 // Invariant stores inside loop will be deleted and a single store 8809 // with the final reduction value will be added to the exit block 8810 StoreInst *SI; 8811 if ((SI = dyn_cast<StoreInst>(&I)) && 8812 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 8813 continue; 8814 8815 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 8816 Instr, Operands, Range, Plan)) { 8817 // If Instr can be simplified to an existing VPValue, use it. 8818 if (RecipeOrValue.is<VPValue *>()) { 8819 auto *VPV = RecipeOrValue.get<VPValue *>(); 8820 Plan->addVPValue(Instr, VPV); 8821 // If the re-used value is a recipe, register the recipe for the 8822 // instruction, in case the recipe for Instr needs to be recorded. 8823 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 8824 RecipeBuilder.setRecipe(Instr, R); 8825 continue; 8826 } 8827 // Otherwise, add the new recipe. 
8828 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 8829 for (auto *Def : Recipe->definedValues()) { 8830 auto *UV = Def->getUnderlyingValue(); 8831 Plan->addVPValue(UV, Def); 8832 } 8833 8834 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 8835 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 8836 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 8837 // of the header block. That can happen for truncates of induction 8838 // variables. Those recipes are moved to the phi section of the header 8839 // block after applying SinkAfter, which relies on the original 8840 // position of the trunc. 8841 assert(isa<TruncInst>(Instr)); 8842 InductionsToMove.push_back( 8843 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 8844 } 8845 RecipeBuilder.setRecipe(Instr, Recipe); 8846 VPBB->appendRecipe(Recipe); 8847 continue; 8848 } 8849 8850 // Otherwise, if all widening options failed, Instruction is to be 8851 // replicated. This may create a successor for VPBB. 8852 VPBasicBlock *NextVPBB = 8853 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 8854 if (NextVPBB != VPBB) { 8855 VPBB = NextVPBB; 8856 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 8857 : ""); 8858 } 8859 } 8860 8861 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 8862 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 8863 } 8864 8865 HeaderVPBB->setName("vector.body"); 8866 8867 // Fold the last, empty block into its predecessor. 8868 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 8869 assert(VPBB && "expected to fold last (empty) block"); 8870 // After here, VPBB should not be used. 8871 VPBB = nullptr; 8872 8873 addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); 8874 8875 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 8876 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 8877 "entry block must be set to a VPRegionBlock having a non-empty entry " 8878 "VPBasicBlock"); 8879 RecipeBuilder.fixHeaderPhis(); 8880 8881 // --------------------------------------------------------------------------- 8882 // Transform initial VPlan: Apply previously taken decisions, in order, to 8883 // bring the VPlan to its final state. 8884 // --------------------------------------------------------------------------- 8885 8886 // Apply Sink-After legal constraints. 8887 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 8888 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 8889 if (Region && Region->isReplicator()) { 8890 assert(Region->getNumSuccessors() == 1 && 8891 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 8892 assert(R->getParent()->size() == 1 && 8893 "A recipe in an original replicator region must be the only " 8894 "recipe in its block"); 8895 return Region; 8896 } 8897 return nullptr; 8898 }; 8899 for (auto &Entry : SinkAfter) { 8900 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 8901 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 8902 8903 auto *TargetRegion = GetReplicateRegion(Target); 8904 auto *SinkRegion = GetReplicateRegion(Sink); 8905 if (!SinkRegion) { 8906 // If the sink source is not a replicate region, sink the recipe directly. 8907 if (TargetRegion) { 8908 // The target is in a replication region, make sure to move Sink to 8909 // the block after it, not into the replication region itself. 
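        // (A replicate region is the single-entry single-exit triangle
        // pred.X.entry -> pred.X.if -> pred.X.continue built by
        // createReplicateRegion; recipes must not be sunk into its blocks.)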
8910 VPBasicBlock *NextBlock = 8911 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 8912 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8913 } else 8914 Sink->moveAfter(Target); 8915 continue; 8916 } 8917 8918 // The sink source is in a replicate region. Unhook the region from the CFG. 8919 auto *SinkPred = SinkRegion->getSinglePredecessor(); 8920 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 8921 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 8922 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 8923 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 8924 8925 if (TargetRegion) { 8926 // The target recipe is also in a replicate region, move the sink region 8927 // after the target region. 8928 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 8929 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 8930 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 8931 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 8932 } else { 8933 // The sink source is in a replicate region, we need to move the whole 8934 // replicate region, which should only contain a single recipe in the 8935 // main block. 8936 auto *SplitBlock = 8937 Target->getParent()->splitAt(std::next(Target->getIterator())); 8938 8939 auto *SplitPred = SplitBlock->getSinglePredecessor(); 8940 8941 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 8942 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 8943 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 8944 } 8945 } 8946 8947 VPlanTransforms::removeRedundantCanonicalIVs(*Plan); 8948 VPlanTransforms::removeRedundantInductionCasts(*Plan); 8949 8950 // Now that sink-after is done, move induction recipes for optimized truncates 8951 // to the phi section of the header block. 8952 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 8953 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 8954 8955 // Adjust the recipes for any inloop reductions. 8956 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan, 8957 RecipeBuilder, Range.Start); 8958 8959 // Introduce a recipe to combine the incoming and previous values of a 8960 // first-order recurrence. 8961 for (VPRecipeBase &R : 8962 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 8963 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 8964 if (!RecurPhi) 8965 continue; 8966 8967 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 8968 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 8969 auto *Region = GetReplicateRegion(PrevRecipe); 8970 if (Region) 8971 InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor()); 8972 if (!InsertBlock) { 8973 InsertBlock = new VPBasicBlock(Region->getName() + ".succ"); 8974 VPBlockUtils::insertBlockAfter(InsertBlock, Region); 8975 } 8976 if (Region || PrevRecipe->isPhi()) 8977 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 8978 else 8979 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 8980 8981 auto *RecurSplice = cast<VPInstruction>( 8982 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 8983 {RecurPhi, RecurPhi->getBackedgeValue()})); 8984 8985 RecurPhi->replaceAllUsesWith(RecurSplice); 8986 // Set the first operand of RecurSplice to RecurPhi again, after replacing 8987 // all users. 
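  // (replaceAllUsesWith above also rewrote RecurSplice's own use of the phi,
  // leaving its first operand pointing at itself, so it must be restored to
  // RecurPhi.)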
8988 RecurSplice->setOperand(0, RecurPhi); 8989 } 8990 8991 // Interleave memory: for each Interleave Group we marked earlier as relevant 8992 // for this VPlan, replace the Recipes widening its memory instructions with a 8993 // single VPInterleaveRecipe at its insertion point. 8994 for (auto IG : InterleaveGroups) { 8995 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8996 RecipeBuilder.getRecipe(IG->getInsertPos())); 8997 SmallVector<VPValue *, 4> StoredValues; 8998 for (unsigned i = 0; i < IG->getFactor(); ++i) 8999 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9000 auto *StoreR = 9001 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9002 StoredValues.push_back(StoreR->getStoredValue()); 9003 } 9004 9005 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9006 Recipe->getMask()); 9007 VPIG->insertBefore(Recipe); 9008 unsigned J = 0; 9009 for (unsigned i = 0; i < IG->getFactor(); ++i) 9010 if (Instruction *Member = IG->getMember(i)) { 9011 if (!Member->getType()->isVoidTy()) { 9012 VPValue *OriginalV = Plan->getVPValue(Member); 9013 Plan->removeVPValueFor(Member); 9014 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9015 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9016 J++; 9017 } 9018 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9019 } 9020 } 9021 9022 std::string PlanName; 9023 raw_string_ostream RSO(PlanName); 9024 ElementCount VF = Range.Start; 9025 Plan->addVF(VF); 9026 RSO << "Initial VPlan for VF={" << VF; 9027 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9028 Plan->addVF(VF); 9029 RSO << "," << VF; 9030 } 9031 RSO << "},UF>=1"; 9032 RSO.flush(); 9033 Plan->setName(PlanName); 9034 9035 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9036 // in ways that accessing values using original IR values is incorrect. 9037 Plan->disableValue2VPValue(); 9038 9039 VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); 9040 VPlanTransforms::sinkScalarOperands(*Plan); 9041 VPlanTransforms::removeDeadRecipes(*Plan); 9042 VPlanTransforms::mergeReplicateRegions(*Plan); 9043 VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); 9044 9045 // Fold Exit block into its predecessor if possible. 9046 // TODO: Fold block earlier once all VPlan transforms properly maintain a 9047 // VPBasicBlock as exit. 9048 VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting()); 9049 9050 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 9051 return Plan; 9052 } 9053 9054 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9055 // Outer loop handling: They may require CFG and instruction level 9056 // transformations before even evaluating whether vectorization is profitable. 9057 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9058 // the vectorization pipeline. 
9059   assert(!OrigLoop->isInnermost());
9060   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9061
9062   // Create new empty VPlan
9063   auto Plan = std::make_unique<VPlan>();
9064
9065   // Build hierarchical CFG
9066   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9067   HCFGBuilder.buildHierarchicalCFG();
9068
9069   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9070        VF *= 2)
9071     Plan->addVF(VF);
9072
9073   SmallPtrSet<Instruction *, 1> DeadInstructions;
9074   VPlanTransforms::VPInstructionsToVPRecipes(
9075       OrigLoop, Plan,
9076       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9077       DeadInstructions, *PSE.getSE());
9078
9079   // Remove the existing terminator of the exiting block of the top-most region.
9080   // A BranchOnCount will be added instead when adding the canonical IV recipes.
9081   auto *Term =
9082       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9083   Term->eraseFromParent();
9084
9085   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9086                         true, CM.useActiveLaneMaskForControlFlow());
9087   return Plan;
9088 }
9089
9090 // Adjust the recipes for reductions. For in-loop reductions the chain of
9091 // instructions leading from the loop exit instr to the phi needs to be converted
9092 // to reductions, with one operand being vector and the other being the scalar
9093 // reduction chain. For other reductions, a select is introduced between the phi
9094 // and live-out recipes when folding the tail.
9095 void LoopVectorizationPlanner::adjustRecipesForReductions(
9096     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9097     ElementCount MinVF) {
9098   for (auto &Reduction : CM.getInLoopReductionChains()) {
9099     PHINode *Phi = Reduction.first;
9100     const RecurrenceDescriptor &RdxDesc =
9101         Legal->getReductionVars().find(Phi)->second;
9102     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9103
9104     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9105       continue;
9106
9107     // ReductionOperations are ordered top-down from the phi's use to the
9108     // LoopExitValue. We keep track of the previous item (the Chain) to tell
9109     // which of the two operands will remain scalar and which will be reduced.
9110     // For minmax the chain will be the select instructions.
9111     Instruction *Chain = Phi;
9112     for (Instruction *R : ReductionOperations) {
9113       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9114       RecurKind Kind = RdxDesc.getRecurrenceKind();
9115
9116       VPValue *ChainOp = Plan->getVPValue(Chain);
9117       unsigned FirstOpId;
9118       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9119              "Only min/max recurrences allowed for inloop reductions");
9120       // Recognize a call to the llvm.fmuladd intrinsic.
9121       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9122       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9123              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9124       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9125         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9126                "Expected to replace a VPWidenSelectSC");
9127         FirstOpId = 1;
9128       } else {
9129         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9130                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9131                "Expected to replace a VPWidenSC");
9132         FirstOpId = 0;
9133       }
9134       unsigned VecOpId =
9135           R->getOperand(FirstOpId) == Chain ?
FirstOpId + 1 : FirstOpId; 9136 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9137 9138 auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent()) 9139 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9140 : nullptr; 9141 9142 if (IsFMulAdd) { 9143 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9144 // need to create an fmul recipe to use as the vector operand for the 9145 // fadd reduction. 9146 VPInstruction *FMulRecipe = new VPInstruction( 9147 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9148 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9149 WidenRecipe->getParent()->insert(FMulRecipe, 9150 WidenRecipe->getIterator()); 9151 VecOp = FMulRecipe; 9152 } 9153 VPReductionRecipe *RedRecipe = 9154 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9155 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9156 Plan->removeVPValueFor(R); 9157 Plan->addVPValue(R, RedRecipe); 9158 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9159 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9160 WidenRecipe->eraseFromParent(); 9161 9162 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9163 VPRecipeBase *CompareRecipe = 9164 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9165 assert(isa<VPWidenRecipe>(CompareRecipe) && 9166 "Expected to replace a VPWidenSC"); 9167 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9168 "Expected no remaining users"); 9169 CompareRecipe->eraseFromParent(); 9170 } 9171 Chain = R; 9172 } 9173 } 9174 9175 // If tail is folded by masking, introduce selects between the phi 9176 // and the live-out instruction of each reduction, at the beginning of the 9177 // dedicated latch block. 
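  // With tail folding this emits, per part, roughly (illustrative names):
  //   %sel = select <VF x i1> %header.mask, <VF x ty> %red.next, <VF x ty> %red.phi
  // so lanes that are masked off keep the value of the reduction phi.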
9178 if (CM.foldTailByMasking()) { 9179 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9180 for (VPRecipeBase &R : 9181 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9182 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9183 if (!PhiR || PhiR->isInLoop()) 9184 continue; 9185 VPValue *Cond = 9186 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9187 VPValue *Red = PhiR->getBackedgeValue(); 9188 assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB && 9189 "reduction recipe must be defined before latch"); 9190 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9191 } 9192 } 9193 } 9194 9195 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9196 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9197 VPSlotTracker &SlotTracker) const { 9198 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9199 IG->getInsertPos()->printAsOperand(O, false); 9200 O << ", "; 9201 getAddr()->printAsOperand(O, SlotTracker); 9202 VPValue *Mask = getMask(); 9203 if (Mask) { 9204 O << ", "; 9205 Mask->printAsOperand(O, SlotTracker); 9206 } 9207 9208 unsigned OpIdx = 0; 9209 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9210 if (!IG->getMember(i)) 9211 continue; 9212 if (getNumStoreOperands() > 0) { 9213 O << "\n" << Indent << " store "; 9214 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9215 O << " to index " << i; 9216 } else { 9217 O << "\n" << Indent << " "; 9218 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9219 O << " = load from index " << i; 9220 } 9221 ++OpIdx; 9222 } 9223 } 9224 #endif 9225 9226 void VPWidenCallRecipe::execute(VPTransformState &State) { 9227 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9228 *this, State); 9229 } 9230 9231 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9232 assert(!State.Instance && "Int or FP induction being replicated."); 9233 9234 Value *Start = getStartValue()->getLiveInIRValue(); 9235 const InductionDescriptor &ID = getInductionDescriptor(); 9236 TruncInst *Trunc = getTruncInst(); 9237 IRBuilderBase &Builder = State.Builder; 9238 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 9239 assert(State.VF.isVector() && "must have vector VF"); 9240 9241 // The value from the original loop to which we are mapping the new induction 9242 // variable. 9243 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 9244 9245 // Fast-math-flags propagate from the original induction instruction. 9246 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9247 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 9248 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 9249 9250 // Now do the actual transformations, and start with fetching the step value. 
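  // (In outline: the vector IV is seeded in the preheader with
  //  <Start, Start+Step, ..., Start+(VF-1)*Step> and each unrolled part then
  //  advances the previous value by a splat of VF * Step.)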
9251 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9252 9253 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 9254 "Expected either an induction phi-node or a truncate of it!"); 9255 9256 // Construct the initial value of the vector IV in the vector loop preheader 9257 auto CurrIP = Builder.saveIP(); 9258 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9259 Builder.SetInsertPoint(VectorPH->getTerminator()); 9260 if (isa<TruncInst>(EntryVal)) { 9261 assert(Start->getType()->isIntegerTy() && 9262 "Truncation requires an integer type"); 9263 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 9264 Step = Builder.CreateTrunc(Step, TruncType); 9265 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 9266 } 9267 9268 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 9269 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 9270 Value *SteppedStart = getStepVector( 9271 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); 9272 9273 // We create vector phi nodes for both integer and floating-point induction 9274 // variables. Here, we determine the kind of arithmetic we will perform. 9275 Instruction::BinaryOps AddOp; 9276 Instruction::BinaryOps MulOp; 9277 if (Step->getType()->isIntegerTy()) { 9278 AddOp = Instruction::Add; 9279 MulOp = Instruction::Mul; 9280 } else { 9281 AddOp = ID.getInductionOpcode(); 9282 MulOp = Instruction::FMul; 9283 } 9284 9285 // Multiply the vectorization factor by the step using integer or 9286 // floating-point arithmetic as appropriate. 9287 Type *StepType = Step->getType(); 9288 Value *RuntimeVF; 9289 if (Step->getType()->isFloatingPointTy()) 9290 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 9291 else 9292 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 9293 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 9294 9295 // Create a vector splat to use in the induction update. 9296 // 9297 // FIXME: If the step is non-constant, we create the vector splat with 9298 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 9299 // handle a constant vector splat. 9300 Value *SplatVF = isa<Constant>(Mul) 9301 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 9302 : Builder.CreateVectorSplat(State.VF, Mul); 9303 Builder.restoreIP(CurrIP); 9304 9305 // We may need to add the step a number of times, depending on the unroll 9306 // factor. The last of those goes into the PHI. 9307 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 9308 &*State.CFG.PrevBB->getFirstInsertionPt()); 9309 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 9310 Instruction *LastInduction = VecInd; 9311 for (unsigned Part = 0; Part < State.UF; ++Part) { 9312 State.set(this, LastInduction, Part); 9313 9314 if (isa<TruncInst>(EntryVal)) 9315 State.addMetadata(LastInduction, EntryVal); 9316 9317 LastInduction = cast<Instruction>( 9318 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 9319 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 9320 } 9321 9322 LastInduction->setName("vec.ind.next"); 9323 VecInd->addIncoming(SteppedStart, VectorPH); 9324 // Add induction update using an incorrect block temporarily. The phi node 9325 // will be fixed after VPlan execution. Note that at this point the latch 9326 // block cannot be used, as it does not exist yet. 9327 // TODO: Model increment value in VPlan, by turning the recipe into a 9328 // multi-def and a subclass of VPHeaderPHIRecipe. 
9329 VecInd->addIncoming(LastInduction, VectorPH); 9330 } 9331 9332 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 9333 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 9334 "Not a pointer induction according to InductionDescriptor!"); 9335 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 9336 "Unexpected type."); 9337 9338 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 9339 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 9340 9341 if (onlyScalarsGenerated(State.VF)) { 9342 // This is the normalized GEP that starts counting at zero. 9343 Value *PtrInd = State.Builder.CreateSExtOrTrunc( 9344 CanonicalIV, IndDesc.getStep()->getType()); 9345 // Determine the number of scalars we need to generate for each unroll 9346 // iteration. If the instruction is uniform, we only need to generate the 9347 // first lane. Otherwise, we generate all VF values. 9348 bool IsUniform = vputils::onlyFirstLaneUsed(this); 9349 assert((IsUniform || !State.VF.isScalable()) && 9350 "Cannot scalarize a scalable VF"); 9351 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 9352 9353 for (unsigned Part = 0; Part < State.UF; ++Part) { 9354 Value *PartStart = 9355 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); 9356 9357 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 9358 Value *Idx = State.Builder.CreateAdd( 9359 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 9360 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); 9361 9362 Value *Step = CreateStepValue(IndDesc.getStep(), SE, 9363 State.CFG.PrevBB->getTerminator()); 9364 Value *SclrGep = emitTransformedIndex( 9365 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); 9366 SclrGep->setName("next.gep"); 9367 State.set(this, SclrGep, VPIteration(Part, Lane)); 9368 } 9369 } 9370 return; 9371 } 9372 9373 assert(isa<SCEVConstant>(IndDesc.getStep()) && 9374 "Induction step not a SCEV constant!"); 9375 Type *PhiType = IndDesc.getStep()->getType(); 9376 9377 // Build a pointer phi 9378 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 9379 Type *ScStValueType = ScalarStartValue->getType(); 9380 PHINode *NewPointerPhi = 9381 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 9382 9383 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9384 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 9385 9386 // A pointer induction, performed by using a gep 9387 const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout(); 9388 Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); 9389 9390 const SCEV *ScalarStep = IndDesc.getStep(); 9391 SCEVExpander Exp(SE, DL, "induction"); 9392 Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 9393 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 9394 Value *NumUnrolledElems = 9395 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 9396 Value *InductionGEP = GetElementPtrInst::Create( 9397 IndDesc.getElementType(), NewPointerPhi, 9398 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 9399 InductionLoc); 9400 // Add induction update using an incorrect block temporarily. The phi node 9401 // will be fixed after VPlan execution. Note that at this point the latch 9402 // block cannot be used, as it does not exist yet. 9403 // TODO: Model increment value in VPlan, by turning the recipe into a 9404 // multi-def and a subclass of VPHeaderPHIRecipe. 
9405 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 9406 9407 // Create UF many actual address geps that use the pointer 9408 // phi as base and a vectorized version of the step value 9409 // (<step*0, ..., step*N>) as offset. 9410 for (unsigned Part = 0; Part < State.UF; ++Part) { 9411 Type *VecPhiType = VectorType::get(PhiType, State.VF); 9412 Value *StartOffsetScalar = 9413 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 9414 Value *StartOffset = 9415 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 9416 // Create a vector of consecutive numbers from zero to VF. 9417 StartOffset = State.Builder.CreateAdd( 9418 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 9419 9420 Value *GEP = State.Builder.CreateGEP( 9421 IndDesc.getElementType(), NewPointerPhi, 9422 State.Builder.CreateMul( 9423 StartOffset, 9424 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), 9425 "vector.gep")); 9426 State.set(this, GEP, Part); 9427 } 9428 } 9429 9430 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 9431 assert(!State.Instance && "VPScalarIVStepsRecipe being replicated."); 9432 9433 // Fast-math-flags propagate from the original induction instruction. 9434 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9435 if (IndDesc.getInductionBinOp() && 9436 isa<FPMathOperator>(IndDesc.getInductionBinOp())) 9437 State.Builder.setFastMathFlags( 9438 IndDesc.getInductionBinOp()->getFastMathFlags()); 9439 9440 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9441 auto CreateScalarIV = [&](Value *&Step) -> Value * { 9442 Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0)); 9443 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); 9444 if (!isCanonical() || CanonicalIV->getType() != Ty) { 9445 ScalarIV = 9446 Ty->isIntegerTy() 9447 ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty) 9448 : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty); 9449 ScalarIV = emitTransformedIndex(State.Builder, ScalarIV, 9450 getStartValue()->getLiveInIRValue(), Step, 9451 IndDesc); 9452 ScalarIV->setName("offset.idx"); 9453 } 9454 if (TruncToTy) { 9455 assert(Step->getType()->isIntegerTy() && 9456 "Truncation requires an integer step"); 9457 ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy); 9458 Step = State.Builder.CreateTrunc(Step, TruncToTy); 9459 } 9460 return ScalarIV; 9461 }; 9462 9463 Value *ScalarIV = CreateScalarIV(Step); 9464 if (State.VF.isVector()) { 9465 buildScalarSteps(ScalarIV, Step, IndDesc, this, State); 9466 return; 9467 } 9468 9469 for (unsigned Part = 0; Part < State.UF; ++Part) { 9470 assert(!State.VF.isScalable() && "scalable vectors not yet supported."); 9471 Value *EntryPart; 9472 if (Step->getType()->isFloatingPointTy()) { 9473 Value *StartIdx = 9474 getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part); 9475 // Floating-point operations inherit FMF via the builder's flags. 
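      // i.e. the per-part start is roughly ScalarIV + (Part * VF) * Step,
      // with the FP opcode taken from the induction descriptor here and
      // plain add/mul used in the integer branch below.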
9476 Value *MulOp = State.Builder.CreateFMul(StartIdx, Step); 9477 EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(), 9478 ScalarIV, MulOp); 9479 } else { 9480 Value *StartIdx = 9481 getRuntimeVF(State.Builder, Step->getType(), State.VF * Part); 9482 EntryPart = State.Builder.CreateAdd( 9483 ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction"); 9484 } 9485 State.set(this, EntryPart, Part); 9486 } 9487 } 9488 9489 void VPBlendRecipe::execute(VPTransformState &State) { 9490 State.setDebugLocFromInst(Phi); 9491 // We know that all PHIs in non-header blocks are converted into 9492 // selects, so we don't have to worry about the insertion order and we 9493 // can just use the builder. 9494 // At this point we generate the predication tree. There may be 9495 // duplications since this is a simple recursive scan, but future 9496 // optimizations will clean it up. 9497 9498 unsigned NumIncoming = getNumIncomingValues(); 9499 9500 // Generate a sequence of selects of the form: 9501 // SELECT(Mask3, In3, 9502 // SELECT(Mask2, In2, 9503 // SELECT(Mask1, In1, 9504 // In0))) 9505 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9506 // are essentially undef are taken from In0. 9507 InnerLoopVectorizer::VectorParts Entry(State.UF); 9508 for (unsigned In = 0; In < NumIncoming; ++In) { 9509 for (unsigned Part = 0; Part < State.UF; ++Part) { 9510 // We might have single edge PHIs (blocks) - use an identity 9511 // 'select' for the first PHI operand. 9512 Value *In0 = State.get(getIncomingValue(In), Part); 9513 if (In == 0) 9514 Entry[Part] = In0; // Initialize with the first incoming value. 9515 else { 9516 // Select between the current value and the previous incoming edge 9517 // based on the incoming mask. 9518 Value *Cond = State.get(getMask(In), Part); 9519 Entry[Part] = 9520 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9521 } 9522 } 9523 } 9524 for (unsigned Part = 0; Part < State.UF; ++Part) 9525 State.set(this, Entry[Part], Part); 9526 } 9527 9528 void VPInterleaveRecipe::execute(VPTransformState &State) { 9529 assert(!State.Instance && "Interleave group being replicated."); 9530 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9531 getStoredValues(), getMask()); 9532 } 9533 9534 void VPReductionRecipe::execute(VPTransformState &State) { 9535 assert(!State.Instance && "Reduction being replicated."); 9536 Value *PrevInChain = State.get(getChainOp(), 0); 9537 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9538 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9539 // Propagate the fast-math flags carried by the underlying instruction. 
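  // Per unrolled part this reduces the (possibly masked) vector operand to a
  // scalar and folds it into the chain; when a condition is present, the
  // masked-off lanes are first replaced with the recurrence identity (e.g. 0
  // for an add reduction) so they cannot affect the result.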
9540   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9541   State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9542   for (unsigned Part = 0; Part < State.UF; ++Part) {
9543     Value *NewVecOp = State.get(getVecOp(), Part);
9544     if (VPValue *Cond = getCondOp()) {
9545       Value *NewCond = State.get(Cond, Part);
9546       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9547       Value *Iden = RdxDesc->getRecurrenceIdentity(
9548           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9549       Value *IdenVec =
9550           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9551       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9552       NewVecOp = Select;
9553     }
9554     Value *NewRed;
9555     Value *NextInChain;
9556     if (IsOrdered) {
9557       if (State.VF.isVector())
9558         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9559                                         PrevInChain);
9560       else
9561         NewRed = State.Builder.CreateBinOp(
9562             (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9563             NewVecOp);
9564       PrevInChain = NewRed;
9565     } else {
9566       PrevInChain = State.get(getChainOp(), Part);
9567       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9568     }
9569     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9570       NextInChain =
9571           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9572                          NewRed, PrevInChain);
9573     } else if (IsOrdered)
9574       NextInChain = NewRed;
9575     else
9576       NextInChain = State.Builder.CreateBinOp(
9577           (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9578           PrevInChain);
9579     State.set(this, NextInChain, Part);
9580   }
9581 }
9582 
9583 void VPReplicateRecipe::execute(VPTransformState &State) {
9584   if (State.Instance) { // Generate a single instance.
9585     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9586     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9587                                     IsPredicated, State);
9588     // Insert scalar instance packing it into a vector.
9589     if (AlsoPack && State.VF.isVector()) {
9590       // If we're constructing lane 0, initialize to start from poison.
9591       if (State.Instance->Lane.isFirstLane()) {
9592         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9593         Value *Poison = PoisonValue::get(
9594             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9595         State.set(this, Poison, State.Instance->Part);
9596       }
9597       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9598     }
9599     return;
9600   }
9601 
9602   // Generate scalar instances for all VF lanes of all UF parts, unless the
9603   // instruction is uniform in which case generate only the first lane for each
9604   // of the UF parts.
9605   unsigned EndLane = IsUniform ?
1 : State.VF.getKnownMinValue(); 9606 assert((!State.VF.isScalable() || IsUniform) && 9607 "Can't scalarize a scalable vector"); 9608 for (unsigned Part = 0; Part < State.UF; ++Part) 9609 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9610 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, 9611 VPIteration(Part, Lane), IsPredicated, 9612 State); 9613 } 9614 9615 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 9616 assert(State.Instance && "Branch on Mask works only on single instance."); 9617 9618 unsigned Part = State.Instance->Part; 9619 unsigned Lane = State.Instance->Lane.getKnownLane(); 9620 9621 Value *ConditionBit = nullptr; 9622 VPValue *BlockInMask = getMask(); 9623 if (BlockInMask) { 9624 ConditionBit = State.get(BlockInMask, Part); 9625 if (ConditionBit->getType()->isVectorTy()) 9626 ConditionBit = State.Builder.CreateExtractElement( 9627 ConditionBit, State.Builder.getInt32(Lane)); 9628 } else // Block in mask is all-one. 9629 ConditionBit = State.Builder.getTrue(); 9630 9631 // Replace the temporary unreachable terminator with a new conditional branch, 9632 // whose two destinations will be set later when they are created. 9633 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 9634 assert(isa<UnreachableInst>(CurrentTerminator) && 9635 "Expected to replace unreachable terminator with conditional branch."); 9636 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 9637 CondBr->setSuccessor(0, nullptr); 9638 ReplaceInstWithInst(CurrentTerminator, CondBr); 9639 } 9640 9641 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 9642 assert(State.Instance && "Predicated instruction PHI works per instance."); 9643 Instruction *ScalarPredInst = 9644 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 9645 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 9646 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 9647 assert(PredicatingBB && "Predicated block has no single predecessor."); 9648 assert(isa<VPReplicateRecipe>(getOperand(0)) && 9649 "operand must be VPReplicateRecipe"); 9650 9651 // By current pack/unpack logic we need to generate only a single phi node: if 9652 // a vector value for the predicated instruction exists at this point it means 9653 // the instruction has vector users only, and a phi for the vector value is 9654 // needed. In this case the recipe of the predicated instruction is marked to 9655 // also do that packing, thereby "hoisting" the insert-element sequence. 9656 // Otherwise, a phi node for the scalar value is needed. 9657 unsigned Part = State.Instance->Part; 9658 if (State.hasVectorValue(getOperand(0), Part)) { 9659 Value *VectorValue = State.get(getOperand(0), Part); 9660 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9661 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9662 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9663 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9664 if (State.hasVectorValue(this, Part)) 9665 State.reset(this, VPhi, Part); 9666 else 9667 State.set(this, VPhi, Part); 9668 // NOTE: Currently we need to update the value of the operand, so the next 9669 // predicated iteration inserts its generated value in the correct vector. 
9670 State.reset(getOperand(0), VPhi, Part); 9671 } else { 9672 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9673 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9674 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9675 PredicatingBB); 9676 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9677 if (State.hasScalarValue(this, *State.Instance)) 9678 State.reset(this, Phi, *State.Instance); 9679 else 9680 State.set(this, Phi, *State.Instance); 9681 // NOTE: Currently we need to update the value of the operand, so the next 9682 // predicated iteration inserts its generated value in the correct vector. 9683 State.reset(getOperand(0), Phi, *State.Instance); 9684 } 9685 } 9686 9687 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9688 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9689 9690 // Attempt to issue a wide load. 9691 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9692 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9693 9694 assert((LI || SI) && "Invalid Load/Store instruction"); 9695 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9696 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9697 9698 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9699 9700 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9701 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9702 bool CreateGatherScatter = !Consecutive; 9703 9704 auto &Builder = State.Builder; 9705 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9706 bool isMaskRequired = getMask(); 9707 if (isMaskRequired) 9708 for (unsigned Part = 0; Part < State.UF; ++Part) 9709 BlockInMaskParts[Part] = State.get(getMask(), Part); 9710 9711 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9712 // Calculate the pointer for the specific unroll-part. 9713 GetElementPtrInst *PartPtr = nullptr; 9714 9715 bool InBounds = false; 9716 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9717 InBounds = gep->isInBounds(); 9718 if (Reverse) { 9719 // If the address is consecutive but reversed, then the 9720 // wide store needs to start at the last vector element. 9721 // RunTimeVF = VScale * VF.getKnownMinValue() 9722 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9723 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 9724 // NumElt = -Part * RunTimeVF 9725 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 9726 // LastLane = 1 - RunTimeVF 9727 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 9728 PartPtr = 9729 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 9730 PartPtr->setIsInBounds(InBounds); 9731 PartPtr = cast<GetElementPtrInst>( 9732 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 9733 PartPtr->setIsInBounds(InBounds); 9734 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
9735 BlockInMaskParts[Part] = 9736 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9737 } else { 9738 Value *Increment = 9739 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 9740 PartPtr = cast<GetElementPtrInst>( 9741 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 9742 PartPtr->setIsInBounds(InBounds); 9743 } 9744 9745 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 9746 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 9747 }; 9748 9749 // Handle Stores: 9750 if (SI) { 9751 State.setDebugLocFromInst(SI); 9752 9753 for (unsigned Part = 0; Part < State.UF; ++Part) { 9754 Instruction *NewSI = nullptr; 9755 Value *StoredVal = State.get(StoredValue, Part); 9756 if (CreateGatherScatter) { 9757 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9758 Value *VectorGep = State.get(getAddr(), Part); 9759 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9760 MaskPart); 9761 } else { 9762 if (Reverse) { 9763 // If we store to reverse consecutive memory locations, then we need 9764 // to reverse the order of elements in the stored value. 9765 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9766 // We don't want to update the value in the map as it might be used in 9767 // another expression. So don't call resetVectorValue(StoredVal). 9768 } 9769 auto *VecPtr = 9770 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9771 if (isMaskRequired) 9772 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9773 BlockInMaskParts[Part]); 9774 else 9775 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9776 } 9777 State.addMetadata(NewSI, SI); 9778 } 9779 return; 9780 } 9781 9782 // Handle loads. 9783 assert(LI && "Must have a load instruction"); 9784 State.setDebugLocFromInst(LI); 9785 for (unsigned Part = 0; Part < State.UF; ++Part) { 9786 Value *NewLI; 9787 if (CreateGatherScatter) { 9788 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9789 Value *VectorGep = State.get(getAddr(), Part); 9790 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 9791 nullptr, "wide.masked.gather"); 9792 State.addMetadata(NewLI, LI); 9793 } else { 9794 auto *VecPtr = 9795 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9796 if (isMaskRequired) 9797 NewLI = Builder.CreateMaskedLoad( 9798 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 9799 PoisonValue::get(DataTy), "wide.masked.load"); 9800 else 9801 NewLI = 9802 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 9803 9804 // Add metadata to the load, but setVectorValue to the reverse shuffle. 9805 State.addMetadata(NewLI, LI); 9806 if (Reverse) 9807 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 9808 } 9809 9810 State.set(getVPSingleValue(), NewLI, Part); 9811 } 9812 } 9813 9814 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9815 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9816 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9817 // for predication. 
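// For example, a function optimized for size (or a cold loop under PGSO) maps
// to CM_ScalarEpilogueNotAllowedOptSize, while an explicit
// PreferPredicateTy::PredicateOrDontVectorize request maps to
// CM_ScalarEpilogueNotAllowedUsePredicate; see the numbered checks below.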
9818 static ScalarEpilogueLowering getScalarEpilogueLowering( 9819 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9820 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9821 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9822 LoopVectorizationLegality &LVL) { 9823 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9824 // don't look at hints or options, and don't request a scalar epilogue. 9825 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9826 // LoopAccessInfo (due to code dependency and not being able to reliably get 9827 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9828 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9829 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9830 // back to the old way and vectorize with versioning when forced. See D81345.) 9831 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9832 PGSOQueryType::IRPass) && 9833 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9834 return CM_ScalarEpilogueNotAllowedOptSize; 9835 9836 // 2) If set, obey the directives 9837 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9838 switch (PreferPredicateOverEpilogue) { 9839 case PreferPredicateTy::ScalarEpilogue: 9840 return CM_ScalarEpilogueAllowed; 9841 case PreferPredicateTy::PredicateElseScalarEpilogue: 9842 return CM_ScalarEpilogueNotNeededUsePredicate; 9843 case PreferPredicateTy::PredicateOrDontVectorize: 9844 return CM_ScalarEpilogueNotAllowedUsePredicate; 9845 }; 9846 } 9847 9848 // 3) If set, obey the hints 9849 switch (Hints.getPredicate()) { 9850 case LoopVectorizeHints::FK_Enabled: 9851 return CM_ScalarEpilogueNotNeededUsePredicate; 9852 case LoopVectorizeHints::FK_Disabled: 9853 return CM_ScalarEpilogueAllowed; 9854 }; 9855 9856 // 4) if the TTI hook indicates this is profitable, request predication. 9857 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9858 LVL.getLAI())) 9859 return CM_ScalarEpilogueNotNeededUsePredicate; 9860 9861 return CM_ScalarEpilogueAllowed; 9862 } 9863 9864 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9865 // If Values have been set for this Def return the one relevant for \p Part. 9866 if (hasVectorValue(Def, Part)) 9867 return Data.PerPartOutput[Def][Part]; 9868 9869 if (!hasScalarValue(Def, {Part, 0})) { 9870 Value *IRV = Def->getLiveInIRValue(); 9871 Value *B = ILV->getBroadcastInstrs(IRV); 9872 set(Def, B, Part); 9873 return B; 9874 } 9875 9876 Value *ScalarValue = get(Def, {Part, 0}); 9877 // If we aren't vectorizing, we can just copy the scalar map values over 9878 // to the vector map. 9879 if (VF.isScalar()) { 9880 set(Def, ScalarValue, Part); 9881 return ScalarValue; 9882 } 9883 9884 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 9885 bool IsUniform = RepR && RepR->isUniform(); 9886 9887 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9888 // Check if there is a scalar value for the selected lane. 9889 if (!hasScalarValue(Def, {Part, LastLane})) { 9890 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 
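    // In that case only lane 0 of the recipe has been generated, so treat the
    // value as uniform and broadcast that single lane below.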
9891     assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) ||
9892             isa<VPScalarIVStepsRecipe>(Def->getDef())) &&
9893            "unexpected recipe found to be invariant");
9894     IsUniform = true;
9895     LastLane = 0;
9896   }
9897 
9898   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
9899   // Set the insert point after the last scalarized instruction or after the
9900   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
9901   // will directly follow the scalar definitions.
9902   auto OldIP = Builder.saveIP();
9903   auto NewIP =
9904       isa<PHINode>(LastInst)
9905           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
9906           : std::next(BasicBlock::iterator(LastInst));
9907   Builder.SetInsertPoint(&*NewIP);
9908 
9909   // However, if we are vectorizing, we need to construct the vector values.
9910   // If the value is known to be uniform after vectorization, we can just
9911   // broadcast the scalar value corresponding to lane zero for each unroll
9912   // iteration. Otherwise, we construct the vector values using
9913   // insertelement instructions. Since the resulting vectors are stored in
9914   // State, we will only generate the insertelements once.
9915   Value *VectorValue = nullptr;
9916   if (IsUniform) {
9917     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
9918     set(Def, VectorValue, Part);
9919   } else {
9920     // Initialize packing with insertelements to start from poison.
9921     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
9922     Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
9923     set(Def, Poison, Part);
9924     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
9925       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
9926     VectorValue = get(Def, Part);
9927   }
9928   Builder.restoreIP(OldIP);
9929   return VectorValue;
9930 }
9931 
9932 // Process the loop in the VPlan-native vectorization path. This path builds
9933 // VPlan upfront in the vectorization pipeline, which allows applying
9934 // VPlan-to-VPlan transformations from the very beginning without modifying the
9935 // input LLVM IR.
9936 static bool processLoopInVPlanNativePath(
9937     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9938     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9939     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9940     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9941     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9942     LoopVectorizationRequirements &Requirements) {
9943 
9944   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9945     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9946     return false;
9947   }
9948   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9949   Function *F = L->getHeader()->getParent();
9950   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9951 
9952   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9953       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
9954 
9955   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9956                                 &Hints, IAI);
9957   // Use the planner for outer loop vectorization.
9958   // TODO: CM is not used at this point inside the planner. Turn CM into an
9959   // optional argument if we don't need it in the future.
9960   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE);
9961 
9962   // Get user vectorization factor.
9963   ElementCount UserVF = Hints.getWidth();
9964 
9965   CM.collectElementTypesForWidening();
9966 
9967   // Plan how to best vectorize, return the best VF and its cost.
9968   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9969 
9970   // If we are stress testing VPlan builds, do not attempt to generate vector
9971   // code. Masked vector code generation support will follow soon.
9972   // Also, do not attempt to vectorize if no vector code will be produced.
9973   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9974     return false;
9975 
9976   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9977 
9978   {
9979     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9980                              F->getParent()->getDataLayout());
9981     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9982                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9983     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9984                       << L->getHeader()->getParent()->getName() << "\"\n");
9985     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9986   }
9987 
9988   // Mark the loop as already vectorized to avoid vectorizing again.
9989   Hints.setAlreadyVectorized();
9990   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9991   return true;
9992 }
9993 
9994 // Emit a remark if there are stores to floats that required a floating point
9995 // extension. If the vectorized loop was generated with double precision there
9996 // will be a performance penalty from the conversion overhead and the change in
9997 // the vector width.
9998 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9999   SmallVector<Instruction *, 4> Worklist;
10000   for (BasicBlock *BB : L->getBlocks()) {
10001     for (Instruction &Inst : *BB) {
10002       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10003         if (S->getValueOperand()->getType()->isFloatTy())
10004           Worklist.push_back(S);
10005       }
10006     }
10007   }
10008 
10009   // Traverse the floating point stores upwards, searching for floating point
10010   // conversions.
10011   SmallPtrSet<const Instruction *, 4> Visited;
10012   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10013   while (!Worklist.empty()) {
10014     auto *I = Worklist.pop_back_val();
10015     if (!L->contains(I))
10016       continue;
10017     if (!Visited.insert(I).second)
10018       continue;
10019 
10020     // Emit a remark if the floating point store required a floating
10021     // point conversion.
10022     // TODO: More work could be done to identify the root cause such as a
10023     // constant or a function return type and point the user to it.
10024     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10025       ORE->emit([&]() {
10026         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10027                                           I->getDebugLoc(), L->getHeader())
10028                << "floating point conversion changes vector width. "
10029                << "Mixed floating point precision requires an up/down "
10030                << "cast that will negatively impact performance.";
10031       });
10032 
10033     for (Use &Op : I->operands())
10034       if (auto *OpI = dyn_cast<Instruction>(Op))
10035         Worklist.push_back(OpI);
10036   }
10037 }
10038 
10039 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
10040                                        VectorizationFactor &VF,
10041                                        Optional<unsigned> VScale, Loop *L,
10042                                        ScalarEvolution &SE) {
10043   InstructionCost CheckCost = Checks.getCost();
10044   if (!CheckCost.isValid())
10045     return false;
10046 
10047   // When interleaving only, the scalar and vector cost will be equal, which in turn
10048   // would lead to a divide by 0. Fall back to hard threshold.
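  // (With VF == 1 the denominator ScalarC - VecC / VF in the formula below
  // would be zero, so the cost of the checks is compared against
  // VectorizeMemoryCheckThreshold instead.)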
10049   if (VF.Width.isScalar()) {
10050     if (CheckCost > VectorizeMemoryCheckThreshold) {
10051       LLVM_DEBUG(
10052           dbgs()
10053           << "LV: Interleaving only is not profitable due to runtime checks\n");
10054       return false;
10055     }
10056     return true;
10057   }
10058 
10059   // The scalar cost should only be 0 when vectorizing with a user specified
        // VF/IC. In those cases, runtime checks should always be generated.
10060   double ScalarC = *VF.ScalarCost.getValue();
10061   if (ScalarC == 0)
10062     return true;
10063 
10064   // First, compute the minimum iteration count required so that the vector
10065   // loop outperforms the scalar loop.
10066   // The total cost of the scalar loop is
10067   //   ScalarC * TC
10068   // where
10069   // * TC is the actual trip count of the loop.
10070   // * ScalarC is the cost of a single scalar iteration.
10071   //
10072   // The total cost of the vector loop is
10073   //   RtC + VecC * (TC / VF) + EpiC
10074   // where
10075   // * RtC is the cost of the generated runtime checks
10076   // * VecC is the cost of a single vector iteration.
10077   // * TC is the actual trip count of the loop
10078   // * VF is the vectorization factor
10079   // * EpiC is the cost of the generated epilogue, including the cost
10080   //   of the remaining scalar operations.
10081   //
10082   // Vectorization is profitable once the total vector cost is less than the
10083   // total scalar cost:
10084   //   RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
10085   //
10086   // Now we can compute the minimum required trip count TC as
10087   //   (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
10088   //
10089   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
10090   // the computations are performed on doubles, not integers and the result
10091   // is rounded up, hence we get an upper estimate of the TC.
10092   unsigned IntVF = VF.Width.getKnownMinValue();
10093   if (VF.Width.isScalable()) {
10094     unsigned AssumedMinimumVscale = 1;
10095     if (VScale)
10096       AssumedMinimumVscale = *VScale;
10097     IntVF *= AssumedMinimumVscale;
10098   }
10099   double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
10100   double RtC = *CheckCost.getValue();
10101   double MinTC1 = RtC / (ScalarC - VecCOverVF);
10102 
10103   // Second, compute a minimum iteration count so that the cost of the
10104   // runtime checks is only a fraction of the total scalar loop cost. This
10105   // adds a loop-dependent bound on the overhead incurred if the runtime
10106   // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
10107   // * TC. To bound the runtime check to be a fraction 1/X of the scalar
10108   // cost, compute
10109   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
10110   double MinTC2 = RtC * 10 / ScalarC;
10111 
10112   // Now pick the larger minimum. If it is not a multiple of VF, choose the
10113   // next closest multiple of VF. This should partly compensate for ignoring
10114   // the epilogue cost.
10115   uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
10116   VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF));
10117 
10118   LLVM_DEBUG(
10119       dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10120              << VF.MinProfitableTripCount << "\n");
10121 
10122   // Skip vectorization if the expected trip count is less than the minimum
10123   // required trip count.
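  // For example, with purely illustrative costs ScalarC = 4, VecC = 10,
  // VF = 4 and RtC = 30: MinTC1 = 30 / (4 - 10/4) = 20 and
  // MinTC2 = 30 * 10 / 4 = 75, so the minimum profitable trip count is 75
  // rounded up to the next multiple of VF, i.e. 76.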
10124 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) { 10125 if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC), 10126 VF.MinProfitableTripCount)) { 10127 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " 10128 "trip count < minimum profitable VF (" 10129 << *ExpectedTC << " < " << VF.MinProfitableTripCount 10130 << ")\n"); 10131 10132 return false; 10133 } 10134 } 10135 return true; 10136 } 10137 10138 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10139 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10140 !EnableLoopInterleaving), 10141 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10142 !EnableLoopVectorization) {} 10143 10144 bool LoopVectorizePass::processLoop(Loop *L) { 10145 assert((EnableVPlanNativePath || L->isInnermost()) && 10146 "VPlan-native path is not enabled. Only process inner loops."); 10147 10148 #ifndef NDEBUG 10149 const std::string DebugLocStr = getDebugLocString(L); 10150 #endif /* NDEBUG */ 10151 10152 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 10153 << L->getHeader()->getParent()->getName() << "' from " 10154 << DebugLocStr << "\n"); 10155 10156 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10157 10158 LLVM_DEBUG( 10159 dbgs() << "LV: Loop hints:" 10160 << " force=" 10161 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10162 ? "disabled" 10163 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10164 ? "enabled" 10165 : "?")) 10166 << " width=" << Hints.getWidth() 10167 << " interleave=" << Hints.getInterleave() << "\n"); 10168 10169 // Function containing loop 10170 Function *F = L->getHeader()->getParent(); 10171 10172 // Looking at the diagnostic output is the only way to determine if a loop 10173 // was vectorized (other than looking at the IR or machine code), so it 10174 // is important to generate an optimization remark for each loop. Most of 10175 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10176 // generated as OptimizationRemark and OptimizationRemarkMissed are 10177 // less verbose reporting vectorized loops and unvectorized loops that may 10178 // benefit from vectorization, respectively. 10179 10180 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10181 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10182 return false; 10183 } 10184 10185 PredicatedScalarEvolution PSE(*SE, *L); 10186 10187 // Check if it is legal to vectorize the loop. 10188 LoopVectorizationRequirements Requirements; 10189 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10190 &Requirements, &Hints, DB, AC, BFI, PSI); 10191 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10192 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10193 Hints.emitRemarkWithHints(); 10194 return false; 10195 } 10196 10197 // Check the function attributes and profiles to find out if this function 10198 // should be optimized for size. 10199 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10200 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10201 10202 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10203 // here. They may require CFG and instruction level transformations before 10204 // even evaluating whether vectorization is profitable. Since we cannot modify 10205 // the incoming IR, we need to build VPlan upfront in the vectorization 10206 // pipeline. 
10207 if (!L->isInnermost()) 10208 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10209 ORE, BFI, PSI, Hints, Requirements); 10210 10211 assert(L->isInnermost() && "Inner loop expected."); 10212 10213 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10214 // count by optimizing for size, to minimize overheads. 10215 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10216 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10217 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10218 << "This loop is worth vectorizing only if no scalar " 10219 << "iteration overheads are incurred."); 10220 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10221 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10222 else { 10223 LLVM_DEBUG(dbgs() << "\n"); 10224 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10225 } 10226 } 10227 10228 // Check the function attributes to see if implicit floats are allowed. 10229 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10230 // an integer loop and the vector instructions selected are purely integer 10231 // vector instructions? 10232 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10233 reportVectorizationFailure( 10234 "Can't vectorize when the NoImplicitFloat attribute is used", 10235 "loop not vectorized due to NoImplicitFloat attribute", 10236 "NoImplicitFloat", ORE, L); 10237 Hints.emitRemarkWithHints(); 10238 return false; 10239 } 10240 10241 // Check if the target supports potentially unsafe FP vectorization. 10242 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10243 // for the target we're vectorizing for, to make sure none of the 10244 // additional fp-math flags can help. 10245 if (Hints.isPotentiallyUnsafe() && 10246 TTI->isFPVectorizationPotentiallyUnsafe()) { 10247 reportVectorizationFailure( 10248 "Potentially unsafe FP op prevents vectorization", 10249 "loop not vectorized due to unsafe FP support.", 10250 "UnsafeFP", ORE, L); 10251 Hints.emitRemarkWithHints(); 10252 return false; 10253 } 10254 10255 bool AllowOrderedReductions; 10256 // If the flag is set, use that instead and override the TTI behaviour. 10257 if (ForceOrderedReductions.getNumOccurrences() > 0) 10258 AllowOrderedReductions = ForceOrderedReductions; 10259 else 10260 AllowOrderedReductions = TTI->enableOrderedReductions(); 10261 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10262 ORE->emit([&]() { 10263 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10264 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10265 ExactFPMathInst->getDebugLoc(), 10266 ExactFPMathInst->getParent()) 10267 << "loop not vectorized: cannot prove it is safe to reorder " 10268 "floating-point operations"; 10269 }); 10270 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10271 "reorder floating-point operations\n"); 10272 Hints.emitRemarkWithHints(); 10273 return false; 10274 } 10275 10276 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10277 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10278 10279 // If an override option has been passed in for interleaved accesses, use it. 10280 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10281 UseInterleaved = EnableInterleavedMemAccesses; 10282 10283 // Analyze interleaved memory accesses. 
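  // For example, a loop that stores to both A[2*i] and A[2*i+1] has two
  // strided accesses that form an interleave group, which can later be
  // emitted as a single wide store plus shuffles instead of two separate
  // strided operations.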
10284 if (UseInterleaved) { 10285 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10286 } 10287 10288 // Use the cost model. 10289 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10290 F, &Hints, IAI); 10291 CM.collectValuesToIgnore(); 10292 CM.collectElementTypesForWidening(); 10293 10294 // Use the planner for vectorization. 10295 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE); 10296 10297 // Get user vectorization factor and interleave count. 10298 ElementCount UserVF = Hints.getWidth(); 10299 unsigned UserIC = Hints.getInterleave(); 10300 10301 // Plan how to best vectorize, return the best VF and its cost. 10302 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10303 10304 VectorizationFactor VF = VectorizationFactor::Disabled(); 10305 unsigned IC = 1; 10306 10307 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, 10308 F->getParent()->getDataLayout()); 10309 if (MaybeVF) { 10310 VF = *MaybeVF; 10311 // Select the interleave count. 10312 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); 10313 10314 unsigned SelectedIC = std::max(IC, UserIC); 10315 // Optimistically generate runtime checks if they are needed. Drop them if 10316 // they turn out to not be profitable. 10317 if (VF.Width.isVector() || SelectedIC > 1) 10318 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); 10319 10320 // Check if it is profitable to vectorize with runtime checks. 10321 bool ForceVectorization = 10322 Hints.getForce() == LoopVectorizeHints::FK_Enabled; 10323 if (!ForceVectorization && 10324 !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L, 10325 *PSE.getSE())) { 10326 ORE->emit([&]() { 10327 return OptimizationRemarkAnalysisAliasing( 10328 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), 10329 L->getHeader()) 10330 << "loop not vectorized: cannot prove it is safe to reorder " 10331 "memory operations"; 10332 }); 10333 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 10334 Hints.emitRemarkWithHints(); 10335 return false; 10336 } 10337 } 10338 10339 // Identify the diagnostic messages that should be produced. 10340 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10341 bool VectorizeLoop = true, InterleaveLoop = true; 10342 if (VF.Width.isScalar()) { 10343 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10344 VecDiagMsg = std::make_pair( 10345 "VectorizationNotBeneficial", 10346 "the cost-model indicates that vectorization is not beneficial"); 10347 VectorizeLoop = false; 10348 } 10349 10350 if (!MaybeVF && UserIC > 1) { 10351 // Tell the user interleaving was avoided up-front, despite being explicitly 10352 // requested. 10353 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 10354 "interleaving should be avoided up front\n"); 10355 IntDiagMsg = std::make_pair( 10356 "InterleavingAvoided", 10357 "Ignoring UserIC, because interleaving was avoided up front"); 10358 InterleaveLoop = false; 10359 } else if (IC == 1 && UserIC <= 1) { 10360 // Tell the user interleaving is not beneficial. 
10361     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10362     IntDiagMsg = std::make_pair(
10363         "InterleavingNotBeneficial",
10364         "the cost-model indicates that interleaving is not beneficial");
10365     InterleaveLoop = false;
10366     if (UserIC == 1) {
10367       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10368       IntDiagMsg.second +=
10369           " and is explicitly disabled or interleave count is set to 1";
10370     }
10371   } else if (IC > 1 && UserIC == 1) {
10372     // Tell the user interleaving is beneficial, but it is explicitly disabled.
10373     LLVM_DEBUG(
10374         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10375     IntDiagMsg = std::make_pair(
10376         "InterleavingBeneficialButDisabled",
10377         "the cost-model indicates that interleaving is beneficial "
10378         "but is explicitly disabled or interleave count is set to 1");
10379     InterleaveLoop = false;
10380   }
10381 
10382   // Override IC if user provided an interleave count.
10383   IC = UserIC > 0 ? UserIC : IC;
10384 
10385   // Emit diagnostic messages, if any.
10386   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10387   if (!VectorizeLoop && !InterleaveLoop) {
10388     // Do not vectorize or interleave the loop.
10389     ORE->emit([&]() {
10390       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10391                                       L->getStartLoc(), L->getHeader())
10392              << VecDiagMsg.second;
10393     });
10394     ORE->emit([&]() {
10395       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10396                                       L->getStartLoc(), L->getHeader())
10397              << IntDiagMsg.second;
10398     });
10399     return false;
10400   } else if (!VectorizeLoop && InterleaveLoop) {
10401     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10402     ORE->emit([&]() {
10403       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10404                                         L->getStartLoc(), L->getHeader())
10405              << VecDiagMsg.second;
10406     });
10407   } else if (VectorizeLoop && !InterleaveLoop) {
10408     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10409                       << ") in " << DebugLocStr << '\n');
10410     ORE->emit([&]() {
10411       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10412                                         L->getStartLoc(), L->getHeader())
10413              << IntDiagMsg.second;
10414     });
10415   } else if (VectorizeLoop && InterleaveLoop) {
10416     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10417                       << ") in " << DebugLocStr << '\n');
10418     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10419   }
10420 
10421   bool DisableRuntimeUnroll = false;
10422   MDNode *OrigLoopID = L->getLoopID();
10423   {
10424     using namespace ore;
10425     if (!VectorizeLoop) {
10426       assert(IC > 1 && "interleave count should not be 1 or 0");
10427       // If we decided that it is not profitable to vectorize the loop, then
10428       // interleave it.
10429       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10430                                  &CM, BFI, PSI, Checks);
10431 
10432       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10433       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10434 
10435       ORE->emit([&]() {
10436         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10437                                   L->getHeader())
10438                << "interleaved loop (interleaved count: "
10439                << NV("InterleaveCount", IC) << ")";
10440       });
10441     } else {
10442       // If we decided that it is *legal* to vectorize the loop, then do it.
10443 
10444       // Consider vectorizing the epilogue too if it's profitable.
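      // With epilogue vectorization, the iterations left over by the main
      // vector loop are executed by a second, narrower vector loop
      // (EpilogueVF.Width below) rather than only by a scalar remainder loop.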
10445 VectorizationFactor EpilogueVF = 10446 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10447 if (EpilogueVF.Width.isVector()) { 10448 10449 // The first pass vectorizes the main loop and creates a scalar epilogue 10450 // to be vectorized by executing the plan (potentially with a different 10451 // factor) again shortly afterwards. 10452 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10453 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10454 EPI, &LVL, &CM, BFI, PSI, Checks); 10455 10456 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10457 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10458 DT, true); 10459 ++LoopsVectorized; 10460 10461 // Second pass vectorizes the epilogue and adjusts the control flow 10462 // edges from the first pass. 10463 EPI.MainLoopVF = EPI.EpilogueVF; 10464 EPI.MainLoopUF = EPI.EpilogueUF; 10465 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10466 ORE, EPI, &LVL, &CM, BFI, PSI, 10467 Checks); 10468 10469 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10470 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); 10471 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); 10472 Header->setName("vec.epilog.vector.body"); 10473 10474 // Ensure that the start values for any VPReductionPHIRecipes are 10475 // updated before vectorising the epilogue loop. 10476 for (VPRecipeBase &R : Header->phis()) { 10477 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10478 if (auto *Resume = MainILV.getReductionResumeValue( 10479 ReductionPhi->getRecurrenceDescriptor())) { 10480 VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume); 10481 ReductionPhi->setOperand(0, StartVal); 10482 } 10483 } 10484 } 10485 10486 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10487 DT, true); 10488 ++LoopsEpilogueVectorized; 10489 10490 if (!MainILV.areSafetyChecksAdded()) 10491 DisableRuntimeUnroll = true; 10492 } else { 10493 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 10494 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, 10495 PSI, Checks); 10496 10497 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10498 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); 10499 ++LoopsVectorized; 10500 10501 // Add metadata to disable runtime unrolling a scalar loop when there 10502 // are no runtime checks about strides and memory. A scalar loop that is 10503 // rarely used is not worth unrolling. 10504 if (!LB.areSafetyChecksAdded()) 10505 DisableRuntimeUnroll = true; 10506 } 10507 // Report the vectorization decision. 10508 ORE->emit([&]() { 10509 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10510 L->getHeader()) 10511 << "vectorized loop (vectorization width: " 10512 << NV("VectorizationFactor", VF.Width) 10513 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10514 }); 10515 } 10516 10517 if (ORE->allowExtraAnalysis(LV_NAME)) 10518 checkMixedPrecision(L, ORE); 10519 } 10520 10521 Optional<MDNode *> RemainderLoopID = 10522 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10523 LLVMLoopVectorizeFollowupEpilogue}); 10524 if (RemainderLoopID) { 10525 L->setLoopID(RemainderLoopID.getValue()); 10526 } else { 10527 if (DisableRuntimeUnroll) 10528 AddRuntimeUnrollDisableMetaData(L); 10529 10530 // Mark the loop as already vectorized to avoid vectorizing again. 
10531     Hints.setAlreadyVectorized();
10532   }
10533 
10534   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10535   return true;
10536 }
10537 
10538 LoopVectorizeResult LoopVectorizePass::runImpl(
10539     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10540     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10541     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10542     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10543     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10544   SE = &SE_;
10545   LI = &LI_;
10546   TTI = &TTI_;
10547   DT = &DT_;
10548   BFI = &BFI_;
10549   TLI = TLI_;
10550   AA = &AA_;
10551   AC = &AC_;
10552   GetLAA = &GetLAA_;
10553   DB = &DB_;
10554   ORE = &ORE_;
10555   PSI = PSI_;
10556 
10557   // Don't attempt if
10558   // 1. the target claims to have no vector registers, and
10559   // 2. interleaving won't help ILP.
10560   //
10561   // The second condition is necessary because, even if the target has no
10562   // vector registers, loop vectorization may still enable scalar
10563   // interleaving.
10564   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10565       TTI->getMaxInterleaveFactor(1) < 2)
10566     return LoopVectorizeResult(false, false);
10567 
10568   bool Changed = false, CFGChanged = false;
10569 
10570   // The vectorizer requires loops to be in simplified form.
10571   // Since simplification may add new inner loops, it has to run before the
10572   // legality and profitability checks. This means running the loop vectorizer
10573   // will simplify all loops, regardless of whether anything ends up being
10574   // vectorized.
10575   for (auto &L : *LI)
10576     Changed |= CFGChanged |=
10577         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10578 
10579   // Build up a worklist of inner-loops to vectorize. This is necessary as
10580   // the act of vectorizing or partially unrolling a loop creates new loops
10581   // and can invalidate iterators across the loops.
10582   SmallVector<Loop *, 8> Worklist;
10583 
10584   for (Loop *L : *LI)
10585     collectSupportedLoops(*L, LI, ORE, Worklist);
10586 
10587   LoopsAnalyzed += Worklist.size();
10588 
10589   // Now walk the identified inner loops.
10590   while (!Worklist.empty()) {
10591     Loop *L = Worklist.pop_back_val();
10592 
10593     // For the inner loops we actually process, form LCSSA to simplify the
10594     // transform.
10595     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10596 
10597     Changed |= CFGChanged |= processLoop(L);
10598   }
10599 
10600   // Process each loop nest in the function.
10601   return LoopVectorizeResult(Changed, CFGChanged);
10602 }
10603 
10604 PreservedAnalyses LoopVectorizePass::run(Function &F,
10605                                          FunctionAnalysisManager &AM) {
10606   auto &LI = AM.getResult<LoopAnalysis>(F);
10607   // There are no loops in the function. Return before computing other expensive
10608   // analyses.
10609   if (LI.empty())
10610     return PreservedAnalyses::all();
10611   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10612   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10613   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10614   auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10615   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10616   auto &AA = AM.getResult<AAManager>(F);
10617   auto &AC = AM.getResult<AssumptionAnalysis>(F);
10618   auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10619   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10620 
10621   auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10622   std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10623       [&](Loop &L) -> const LoopAccessInfo & {
10624     LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
10625                                       TLI, TTI, nullptr, nullptr, nullptr};
10626     return LAM.getResult<LoopAccessAnalysis>(L, AR);
10627   };
10628   auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10629   ProfileSummaryInfo *PSI =
10630       MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10631   LoopVectorizeResult Result =
10632       runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10633   if (!Result.MadeAnyChange)
10634     return PreservedAnalyses::all();
10635   PreservedAnalyses PA;
10636 
10637   // We currently do not preserve loopinfo/dominator analyses with outer loop
10638   // vectorization. Until this is addressed, mark these analyses as preserved
10639   // only for non-VPlan-native path.
10640   // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10641   if (!EnableVPlanNativePath) {
10642     PA.preserve<LoopAnalysis>();
10643     PA.preserve<DominatorTreeAnalysis>();
10644   }
10645 
10646   if (Result.MadeCFGChange) {
10647     // Making CFG changes likely means a loop got vectorized. Indicate that
10648     // extra simplification passes should be run.
10649     // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10650     // be run if runtime checks have been added.
10651     AM.getResult<ShouldRunExtraVectorPasses>(F);
10652     PA.preserve<ShouldRunExtraVectorPasses>();
10653   } else {
10654     PA.preserveSet<CFGAnalyses>();
10655   }
10656   return PA;
10657 }
10658 
10659 void LoopVectorizePass::printPipeline(
10660     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10661   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10662       OS, MapClassName2PassName);
10663 
10664   OS << "<";
10665   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10666   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10667   OS << ">";
10668 }
10669 