//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC - two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
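
// A conceptual sketch of the transformation this pass performs (illustrative
// only; the real skeleton, including run-time checks and the remainder loop,
// is built by createVectorizedLoopSkeleton below):
//
//   // Original scalar loop:          // Widened loop with VF = 4:
//   for (i = 0; i < n; ++i)           for (i = 0; i + 4 <= n; i += 4)
//     A[i] = B[i] + 42;                 A[i..i+3] = B[i..i+3] + <42,42,42,42>;
//                                     // remaining iterations execute in a
//                                     // scalar (or vectorized) epilogue loop.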

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));
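
// For reference, these thresholds can be overridden when running the pass in
// isolation. An illustrative invocation (flag values chosen arbitrarily):
//
//   opt -passes=loop-vectorize -vectorizer-min-trip-count=4 \
//       -epilogue-vectorization-minimum-VF=8 -S input.ll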

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
// and that predication is preferred; the enum below lists all options. I.e.,
// the vectorizer will try to fold the tail loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));
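
// Tail folding versus a scalar epilogue, conceptually (illustrative; the
// actual strategy is chosen by the cost model together with
// LoopVectorizationLegality):
//
//   Scalar-epilogue strategy: the vector body executes n / VF wide
//   iterations and the remaining n % VF iterations run in a scalar loop.
//
//   Tail-folding strategy: the vector body covers all n iterations; the last
//   wide iteration runs under a mask so that only the n % VF live lanes take
//   effect, and no remainder loop is emitted.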

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
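
// Illustrative example of an interleaved access group of factor 2 that the
// two flags above apply to:
//
//   for (int i = 0; i < N; ++i) {
//     Sum  += A[2 * i];     // even elements
//     Prod *= A[2 * i + 1]; // odd elements
//   }
//
// The two strided loads can be combined into one wide load of 2 * VF elements
// followed by shuffles that de-interleave the even and odd lanes.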

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
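
// Illustrative example: with a typical x86-64 data layout, x86_fp80 has a type
// size of 80 bits but an alloc size of 128 bits, so an array of x86_fp80 is
// not bitcast-compatible with a vector of x86_fp80 and hasIrregularType
// returns true. The exact sizes depend on the module's DataLayout.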

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops.
  virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single vector PHINode in a block in the VPlan-native path
  /// only.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize the interleaved access group \p Group with the base
  /// address given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Set the debug location in the builder using the debug location in \p V.
  /// If \p CustomBuilder is None then the class member's Builder is used.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilderBase *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  // Returns the resume value (bc.merge.rdx) for a reduction as
  // generated by fixReduction.
  PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
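
  // For example (illustrative): with UF = 2 and VF = 4, a vectorized value is
  // held as 2 IR vectors of 4 lanes each (VectorParts), whereas a scalarized
  // value is held as 2 x 4 individual scalar Values (ScalarParts).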

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *VectorTripCount, Value *EndValue,
                    BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
                    VPlan &Plan);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
                               VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(BasicBlock *InsertBlock);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);
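
  // The blocks produced by createVectorLoopSkeleton and the emit*Checks
  // helpers above are arranged roughly as follows (simplified sketch; each
  // check block can also bypass directly to the scalar preheader):
  //
  //   iteration-count check -> SCEV checks -> memory checks
  //     -> vector preheader -> vector loop -> middle block
  //     -> scalar preheader -> scalar (epilogue) loop -> exit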

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile-guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};
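
// When both stages of epilogue vectorization run, the resulting control flow
// looks roughly like this (simplified sketch; the block pointers above let the
// second stage reuse the checks created by the first):
//
//   iteration-count check -> main vector loop (MainLoopVF x MainLoopUF)
//     -> epilogue iteration-count check -> vector epilogue loop (EpilogueVF)
//     -> scalar remainder loop -> exit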

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *>
  createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilderBase *> CustomBuilder) {
  IRBuilderBase *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When a FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}
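
// For example (illustrative): with a fixed VF of 4 and Step == 2 this returns
// the constant 8; with a scalable VF of <vscale x 4> it returns vscale * 8.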

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect the recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction &UnderlyingInstr = WidenRec->getIngredient();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        if (AddrDef && WidenRec->isConsecutive() &&
            Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

PHINode *InnerLoopVectorizer::getReductionResumeValue(
    const RecurrenceDescriptor &RdxDesc) {
  auto It = ReductionResumeValues.find(&RdxDesc);
  assert(It != ReductionResumeValues.end() &&
         "Expected to find a resume value for the reduction.");
  return It->second;
}

namespace llvm {

// Hints for the loop vectorization cost model on how the scalar epilogue loop
// should be lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Set up cost-based decisions for the user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
1367 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1368 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1369 !isProfitableToScalarize(I, VF) && 1370 !isScalarAfterVectorization(I, VF); 1371 } 1372 1373 /// Decision that was taken during cost calculation for memory instruction. 1374 enum InstWidening { 1375 CM_Unknown, 1376 CM_Widen, // For consecutive accesses with stride +1. 1377 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1378 CM_Interleave, 1379 CM_GatherScatter, 1380 CM_Scalarize 1381 }; 1382 1383 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1384 /// instruction \p I and vector width \p VF. 1385 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, 1386 InstructionCost Cost) { 1387 assert(VF.isVector() && "Expected VF >=2"); 1388 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1389 } 1390 1391 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1392 /// interleaving group \p Grp and vector width \p VF. 1393 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, 1394 ElementCount VF, InstWidening W, 1395 InstructionCost Cost) { 1396 assert(VF.isVector() && "Expected VF >=2"); 1397 /// Broadcast this decicion to all instructions inside the group. 1398 /// But the cost will be assigned to one instruction only. 1399 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1400 if (auto *I = Grp->getMember(i)) { 1401 if (Grp->getInsertPos() == I) 1402 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1403 else 1404 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1405 } 1406 } 1407 } 1408 1409 /// Return the cost model decision for the given instruction \p I and vector 1410 /// width \p VF. Return CM_Unknown if this instruction did not pass 1411 /// through the cost modeling. 1412 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1413 assert(VF.isVector() && "Expected VF to be a vector VF"); 1414 // Cost model is not run in the VPlan-native path - return conservative 1415 // result until this changes. 1416 if (EnableVPlanNativePath) 1417 return CM_GatherScatter; 1418 1419 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1420 auto Itr = WideningDecisions.find(InstOnVF); 1421 if (Itr == WideningDecisions.end()) 1422 return CM_Unknown; 1423 return Itr->second.first; 1424 } 1425 1426 /// Return the vectorization cost for the given instruction \p I and vector 1427 /// width \p VF. 1428 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1429 assert(VF.isVector() && "Expected VF >=2"); 1430 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1431 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1432 "The cost is not calculated"); 1433 return WideningDecisions[InstOnVF].second; 1434 } 1435 1436 /// Return True if instruction \p I is an optimizable truncate whose operand 1437 /// is an induction variable. Such a truncate will be removed by adding a new 1438 /// induction variable with the destination type. 1439 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1440 // If the instruction is not a truncate, return false. 1441 auto *Trunc = dyn_cast<TruncInst>(I); 1442 if (!Trunc) 1443 return false; 1444 1445 // Get the source and destination types of the truncate. 
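    // Illustrative sketch (assuming a fixed VF of 4): for
    //   %t = trunc i64 %iv to i32
    // this gives SrcTy = <4 x i64> and DestTy = <4 x i32>.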
1446 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1447 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1448 1449 // If the truncate is free for the given types, return false. Replacing a 1450 // free truncate with an induction variable would add an induction variable 1451 // update instruction to each iteration of the loop. We exclude from this 1452 // check the primary induction variable since it will need an update 1453 // instruction regardless. 1454 Value *Op = Trunc->getOperand(0); 1455 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1456 return false; 1457 1458 // If the truncated value is not an induction variable, return false. 1459 return Legal->isInductionPhi(Op); 1460 } 1461 1462 /// Collects the instructions to scalarize for each predicated instruction in 1463 /// the loop. 1464 void collectInstsToScalarize(ElementCount VF); 1465 1466 /// Collect Uniform and Scalar values for the given \p VF. 1467 /// The sets depend on CM decision for Load/Store instructions 1468 /// that may be vectorized as interleave, gather-scatter or scalarized. 1469 void collectUniformsAndScalars(ElementCount VF) { 1470 // Do the analysis once. 1471 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1472 return; 1473 setCostBasedWideningDecision(VF); 1474 collectLoopUniforms(VF); 1475 collectLoopScalars(VF); 1476 } 1477 1478 /// Returns true if the target machine supports masked store operation 1479 /// for the given \p DataType and kind of access to \p Ptr. 1480 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1481 return Legal->isConsecutivePtr(DataType, Ptr) && 1482 TTI.isLegalMaskedStore(DataType, Alignment); 1483 } 1484 1485 /// Returns true if the target machine supports masked load operation 1486 /// for the given \p DataType and kind of access to \p Ptr. 1487 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1488 return Legal->isConsecutivePtr(DataType, Ptr) && 1489 TTI.isLegalMaskedLoad(DataType, Alignment); 1490 } 1491 1492 /// Returns true if the target machine can represent \p V as a masked gather 1493 /// or scatter operation. 1494 bool isLegalGatherOrScatter(Value *V, 1495 ElementCount VF = ElementCount::getFixed(1)) { 1496 bool LI = isa<LoadInst>(V); 1497 bool SI = isa<StoreInst>(V); 1498 if (!LI && !SI) 1499 return false; 1500 auto *Ty = getLoadStoreType(V); 1501 Align Align = getLoadStoreAlignment(V); 1502 if (VF.isVector()) 1503 Ty = VectorType::get(Ty, VF); 1504 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1505 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1506 } 1507 1508 /// Returns true if the target machine supports all of the reduction 1509 /// variables found for the given VF. 1510 bool canVectorizeReductions(ElementCount VF) const { 1511 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1512 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1513 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1514 })); 1515 } 1516 1517 /// Returns true if \p I is an instruction that will be scalarized with 1518 /// predication when vectorizing \p I with vectorization factor \p VF. Such 1519 /// instructions include conditional stores and instructions that may divide 1520 /// by zero. 1521 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1522 1523 // Returns true if \p I is an instruction that will be predicated either 1524 // through scalar predication or masked load/store or masked gather/scatter. 
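  // For example (illustrative): under tail folding, a store such as
  //   if (c[i]) A[i] = x;
  // is predicated because it needs a mask, and a udiv in the same block is
  // predicated because a masked-off lane could divide by zero.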
1525 // \p VF is the vectorization factor that will be used to vectorize \p I. 1526 // Superset of instructions that return true for isScalarWithPredication. 1527 bool isPredicatedInst(Instruction *I, ElementCount VF, 1528 bool IsKnownUniform = false) { 1529 // When we know the load is uniform and the original scalar loop was not 1530 // predicated we don't need to mark it as a predicated instruction. Any 1531 // vectorised blocks created when tail-folding are something artificial we 1532 // have introduced and we know there is always at least one active lane. 1533 // That's why we call Legal->blockNeedsPredication here because it doesn't 1534 // query tail-folding. 1535 if (IsKnownUniform && isa<LoadInst>(I) && 1536 !Legal->blockNeedsPredication(I->getParent())) 1537 return false; 1538 if (!blockNeedsPredicationForAnyReason(I->getParent())) 1539 return false; 1540 // Loads and stores that need some form of masked operation are predicated 1541 // instructions. 1542 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1543 return Legal->isMaskRequired(I); 1544 return isScalarWithPredication(I, VF); 1545 } 1546 1547 /// Returns true if \p I is a memory instruction with consecutive memory 1548 /// access that can be widened. 1549 bool 1550 memoryInstructionCanBeWidened(Instruction *I, 1551 ElementCount VF = ElementCount::getFixed(1)); 1552 1553 /// Returns true if \p I is a memory instruction in an interleaved-group 1554 /// of memory accesses that can be vectorized with wide vector loads/stores 1555 /// and shuffles. 1556 bool 1557 interleavedAccessCanBeWidened(Instruction *I, 1558 ElementCount VF = ElementCount::getFixed(1)); 1559 1560 /// Check if \p Instr belongs to any interleaved access group. 1561 bool isAccessInterleaved(Instruction *Instr) { 1562 return InterleaveInfo.isInterleaved(Instr); 1563 } 1564 1565 /// Get the interleaved access group that \p Instr belongs to. 1566 const InterleaveGroup<Instruction> * 1567 getInterleavedAccessGroup(Instruction *Instr) { 1568 return InterleaveInfo.getInterleaveGroup(Instr); 1569 } 1570 1571 /// Returns true if we're required to use a scalar epilogue for at least 1572 /// the final iteration of the original loop. 1573 bool requiresScalarEpilogue(ElementCount VF) const { 1574 if (!isScalarEpilogueAllowed()) 1575 return false; 1576 // If we might exit from anywhere but the latch, must run the exiting 1577 // iteration in scalar form. 1578 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1579 return true; 1580 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); 1581 } 1582 1583 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1584 /// loop hint annotation. 1585 bool isScalarEpilogueAllowed() const { 1586 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1587 } 1588 1589 /// Returns true if all loop blocks should be masked to fold tail loop. 1590 bool foldTailByMasking() const { return FoldTailByMasking; } 1591 1592 /// Returns true if the instructions in this block requires predication 1593 /// for any reason, e.g. because tail folding now requires a predicate 1594 /// or because the block in the original loop was predicated. 1595 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1596 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1597 } 1598 1599 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1600 /// nodes to the chain of instructions representing the reductions. Uses a 1601 /// MapVector to ensure deterministic iteration order. 
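  /// For illustration (assuming a simple sum reduction): for
  ///   for (i = 0; i < n; ++i) s += a[i];
  /// the map would contain a single entry, from the phi of 's' to the chain
  /// consisting of the one 'add' that updates it.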
1602 using ReductionChainMap = 1603 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1604 1605 /// Return the chain of instructions representing an inloop reduction. 1606 const ReductionChainMap &getInLoopReductionChains() const { 1607 return InLoopReductionChains; 1608 } 1609 1610 /// Returns true if the Phi is part of an inloop reduction. 1611 bool isInLoopReduction(PHINode *Phi) const { 1612 return InLoopReductionChains.count(Phi); 1613 } 1614 1615 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1616 /// with factor VF. Return the cost of the instruction, including 1617 /// scalarization overhead if it's needed. 1618 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1619 1620 /// Estimate cost of a call instruction CI if it were vectorized with factor 1621 /// VF. Return the cost of the instruction, including scalarization overhead 1622 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1623 /// scalarized - 1624 /// i.e. either vector version isn't available, or is too expensive. 1625 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1626 bool &NeedToScalarize) const; 1627 1628 /// Returns true if the per-lane cost of VectorizationFactor A is lower than 1629 /// that of B. 1630 bool isMoreProfitable(const VectorizationFactor &A, 1631 const VectorizationFactor &B) const; 1632 1633 /// Invalidates decisions already taken by the cost model. 1634 void invalidateCostModelingDecisions() { 1635 WideningDecisions.clear(); 1636 Uniforms.clear(); 1637 Scalars.clear(); 1638 } 1639 1640 private: 1641 unsigned NumPredStores = 0; 1642 1643 /// Convenience function that returns the value of vscale_range iff 1644 /// vscale_range.min == vscale_range.max or otherwise returns the value 1645 /// returned by the corresponding TLI method. 1646 Optional<unsigned> getVScaleForTuning() const; 1647 1648 /// \return An upper bound for the vectorization factors for both 1649 /// fixed and scalable vectorization, where the minimum-known number of 1650 /// elements is a power-of-2 larger than zero. If scalable vectorization is 1651 /// disabled or unsupported, then the scalable part will be equal to 1652 /// ElementCount::getScalable(0). 1653 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, 1654 ElementCount UserVF, 1655 bool FoldTailByMasking); 1656 1657 /// \return the maximized element count based on the targets vector 1658 /// registers and the loop trip-count, but limited to a maximum safe VF. 1659 /// This is a helper function of computeFeasibleMaxVF. 1660 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure 1661 /// issue that occurred on one of the buildbots which cannot be reproduced 1662 /// without having access to the properietary compiler (see comments on 1663 /// D98509). The issue is currently under investigation and this workaround 1664 /// will be removed as soon as possible. 1665 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, 1666 unsigned SmallestType, 1667 unsigned WidestType, 1668 const ElementCount &MaxSafeVF, 1669 bool FoldTailByMasking); 1670 1671 /// \return the maximum legal scalable VF, based on the safe max number 1672 /// of elements. 
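  /// Worked example (illustrative numbers only): if dependence analysis
  /// allows at most 32 elements per vector iteration and the target's
  /// maximum vscale is 4, the largest legal scalable VF is vscale x 8,
  /// since 8 * vscale can never exceed 32.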
1673 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1674 1675 /// The vectorization cost is a combination of the cost itself and a boolean 1676 /// indicating whether any of the contributing operations will actually 1677 /// operate on vector values after type legalization in the backend. If this 1678 /// latter value is false, then all operations will be scalarized (i.e. no 1679 /// vectorization has actually taken place). 1680 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1681 1682 /// Returns the expected execution cost. The unit of the cost does 1683 /// not matter because we use the 'cost' units to compare different 1684 /// vector widths. The cost that is returned is *not* normalized by 1685 /// the factor width. If \p Invalid is not nullptr, this function 1686 /// will add a pair(Instruction*, ElementCount) to \p Invalid for 1687 /// each instruction that has an Invalid cost for the given VF. 1688 using InstructionVFPair = std::pair<Instruction *, ElementCount>; 1689 VectorizationCostTy 1690 expectedCost(ElementCount VF, 1691 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); 1692 1693 /// Returns the execution time cost of an instruction for a given vector 1694 /// width. Vector width of one means scalar. 1695 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1696 1697 /// The cost-computation logic from getInstructionCost which provides 1698 /// the vector type as an output parameter. 1699 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1700 Type *&VectorTy); 1701 1702 /// Return the cost of instructions in an inloop reduction pattern, if I is 1703 /// part of that pattern. 1704 Optional<InstructionCost> 1705 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, 1706 TTI::TargetCostKind CostKind); 1707 1708 /// Calculate vectorization cost of memory instruction \p I. 1709 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1710 1711 /// The cost computation for scalarized memory instruction. 1712 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1713 1714 /// The cost computation for interleaving group of memory instructions. 1715 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1716 1717 /// The cost computation for Gather/Scatter instruction. 1718 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1719 1720 /// The cost computation for widening instruction \p I with consecutive 1721 /// memory access. 1722 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1723 1724 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1725 /// Load: scalar load + broadcast. 1726 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1727 /// element) 1728 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1729 1730 /// Estimate the overhead of scalarizing an instruction. This is a 1731 /// convenience wrapper for the type-based getScalarizationOverhead API. 1732 InstructionCost getScalarizationOverhead(Instruction *I, 1733 ElementCount VF) const; 1734 1735 /// Returns whether the instruction is a load or store and will be a emitted 1736 /// as a vector operation. 1737 bool isConsecutiveLoadOrStore(Instruction *I); 1738 1739 /// Returns true if an artificially high cost for emulated masked memrefs 1740 /// should be used. 
1741 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); 1742 1743 /// Map of scalar integer values to the smallest bitwidth they can be legally 1744 /// represented as. The vector equivalents of these values should be truncated 1745 /// to this type. 1746 MapVector<Instruction *, uint64_t> MinBWs; 1747 1748 /// A type representing the costs for instructions if they were to be 1749 /// scalarized rather than vectorized. The entries are Instruction-Cost 1750 /// pairs. 1751 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; 1752 1753 /// A set containing all BasicBlocks that are known to present after 1754 /// vectorization as a predicated block. 1755 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; 1756 1757 /// Records whether it is allowed to have the original scalar loop execute at 1758 /// least once. This may be needed as a fallback loop in case runtime 1759 /// aliasing/dependence checks fail, or to handle the tail/remainder 1760 /// iterations when the trip count is unknown or doesn't divide by the VF, 1761 /// or as a peel-loop to handle gaps in interleave-groups. 1762 /// Under optsize and when the trip count is very small we don't allow any 1763 /// iterations to execute in the scalar loop. 1764 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1765 1766 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1767 bool FoldTailByMasking = false; 1768 1769 /// A map holding scalar costs for different vectorization factors. The 1770 /// presence of a cost for an instruction in the mapping indicates that the 1771 /// instruction will be scalarized when vectorizing with the associated 1772 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1773 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1774 1775 /// Holds the instructions known to be uniform after vectorization. 1776 /// The data is collected per VF. 1777 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1778 1779 /// Holds the instructions known to be scalar after vectorization. 1780 /// The data is collected per VF. 1781 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1782 1783 /// Holds the instructions (address computations) that are forced to be 1784 /// scalarized. 1785 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1786 1787 /// PHINodes of the reductions that should be expanded in-loop along with 1788 /// their associated chains of reduction operations, in program order from top 1789 /// (PHI) to bottom 1790 ReductionChainMap InLoopReductionChains; 1791 1792 /// A Map of inloop reduction operations and their immediate chain operand. 1793 /// FIXME: This can be removed once reductions can be costed correctly in 1794 /// vplan. This was added to allow quick lookup to the inloop operations, 1795 /// without having to loop through InLoopReductionChains. 1796 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1797 1798 /// Returns the expected difference in cost from scalarizing the expression 1799 /// feeding a predicated instruction \p PredInst. The instructions to 1800 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1801 /// non-negative return value implies the expression will be scalarized. 1802 /// Currently, only single-use chains are considered for scalarization. 
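  /// For illustration (assumed costs, not from any real target): if a
  /// predicated store is fed by a single-use 'add', scalarizing the 'add'
  /// together with the store avoids the vector 'add' plus the extracts that
  /// would otherwise be needed to feed the scalarized store; that saving is
  /// what the returned discount expresses.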
1803 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1804 ElementCount VF); 1805 1806 /// Collect the instructions that are uniform after vectorization. An 1807 /// instruction is uniform if we represent it with a single scalar value in 1808 /// the vectorized loop corresponding to each vector iteration. Examples of 1809 /// uniform instructions include pointer operands of consecutive or 1810 /// interleaved memory accesses. Note that although uniformity implies an 1811 /// instruction will be scalar, the reverse is not true. In general, a 1812 /// scalarized instruction will be represented by VF scalar values in the 1813 /// vectorized loop, each corresponding to an iteration of the original 1814 /// scalar loop. 1815 void collectLoopUniforms(ElementCount VF); 1816 1817 /// Collect the instructions that are scalar after vectorization. An 1818 /// instruction is scalar if it is known to be uniform or will be scalarized 1819 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1820 /// to the list if they are used by a load/store instruction that is marked as 1821 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1822 /// VF values in the vectorized loop, each corresponding to an iteration of 1823 /// the original scalar loop. 1824 void collectLoopScalars(ElementCount VF); 1825 1826 /// Keeps cost model vectorization decision and cost for instructions. 1827 /// Right now it is used for memory instructions only. 1828 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1829 std::pair<InstWidening, InstructionCost>>; 1830 1831 DecisionList WideningDecisions; 1832 1833 /// Returns true if \p V is expected to be vectorized and it needs to be 1834 /// extracted. 1835 bool needsExtract(Value *V, ElementCount VF) const { 1836 Instruction *I = dyn_cast<Instruction>(V); 1837 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1838 TheLoop->isLoopInvariant(I)) 1839 return false; 1840 1841 // Assume we can vectorize V (and hence we need extraction) if the 1842 // scalars are not computed yet. This can happen, because it is called 1843 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1844 // the scalars are collected. That should be a safe assumption in most 1845 // cases, because we check if the operands have vectorizable types 1846 // beforehand in LoopVectorizationLegality. 1847 return Scalars.find(VF) == Scalars.end() || 1848 !isScalarAfterVectorization(I, VF); 1849 }; 1850 1851 /// Returns a range containing only operands needing to be extracted. 1852 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1853 ElementCount VF) const { 1854 return SmallVector<Value *, 4>(make_filter_range( 1855 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1856 } 1857 1858 /// Determines if we have the infrastructure to vectorize loop \p L and its 1859 /// epilogue, assuming the main loop is vectorized by \p VF. 1860 bool isCandidateForEpilogueVectorization(const Loop &L, 1861 const ElementCount VF) const; 1862 1863 /// Returns true if epilogue vectorization is considered profitable, and 1864 /// false otherwise. 1865 /// \p VF is the vectorization factor chosen for the original loop. 1866 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1867 1868 public: 1869 /// The loop that we evaluate. 1870 Loop *TheLoop; 1871 1872 /// Predicated scalar evolution analysis. 1873 PredicatedScalarEvolution &PSE; 1874 1875 /// Loop Info analysis. 
1876 LoopInfo *LI; 1877 1878 /// Vectorization legality. 1879 LoopVectorizationLegality *Legal; 1880 1881 /// Vector target information. 1882 const TargetTransformInfo &TTI; 1883 1884 /// Target Library Info. 1885 const TargetLibraryInfo *TLI; 1886 1887 /// Demanded bits analysis. 1888 DemandedBits *DB; 1889 1890 /// Assumption cache. 1891 AssumptionCache *AC; 1892 1893 /// Interface to emit optimization remarks. 1894 OptimizationRemarkEmitter *ORE; 1895 1896 const Function *TheFunction; 1897 1898 /// Loop Vectorize Hint. 1899 const LoopVectorizeHints *Hints; 1900 1901 /// The interleave access information contains groups of interleaved accesses 1902 /// with the same stride and close to each other. 1903 InterleavedAccessInfo &InterleaveInfo; 1904 1905 /// Values to ignore in the cost model. 1906 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1907 1908 /// Values to ignore in the cost model when VF > 1. 1909 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1910 1911 /// All element types found in the loop. 1912 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1913 1914 /// Profitable vector factors. 1915 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1916 }; 1917 } // end namespace llvm 1918 1919 /// Helper struct to manage generating runtime checks for vectorization. 1920 /// 1921 /// The runtime checks are created up-front in temporary blocks to allow better 1922 /// estimating the cost and un-linked from the existing IR. After deciding to 1923 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1924 /// temporary blocks are completely removed. 1925 class GeneratedRTChecks { 1926 /// Basic block which contains the generated SCEV checks, if any. 1927 BasicBlock *SCEVCheckBlock = nullptr; 1928 1929 /// The value representing the result of the generated SCEV checks. If it is 1930 /// nullptr, either no SCEV checks have been generated or they have been used. 1931 Value *SCEVCheckCond = nullptr; 1932 1933 /// Basic block which contains the generated memory runtime checks, if any. 1934 BasicBlock *MemCheckBlock = nullptr; 1935 1936 /// The value representing the result of the generated memory runtime checks. 1937 /// If it is nullptr, either no memory runtime checks have been generated or 1938 /// they have been used. 1939 Value *MemRuntimeCheckCond = nullptr; 1940 1941 DominatorTree *DT; 1942 LoopInfo *LI; 1943 1944 SCEVExpander SCEVExp; 1945 SCEVExpander MemCheckExp; 1946 1947 public: 1948 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1949 const DataLayout &DL) 1950 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), 1951 MemCheckExp(SE, DL, "scev.check") {} 1952 1953 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1954 /// accurately estimate the cost of the runtime checks. The blocks are 1955 /// un-linked from the IR and is added back during vector code generation. If 1956 /// there is no vector code generation, the check blocks are removed 1957 /// completely. 1958 void Create(Loop *L, const LoopAccessInfo &LAI, 1959 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { 1960 1961 BasicBlock *LoopHeader = L->getHeader(); 1962 BasicBlock *Preheader = L->getLoopPreheader(); 1963 1964 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1965 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1966 // may be used by SCEVExpander. The blocks will be un-linked from their 1967 // predecessors and removed from LI & DT at the end of the function. 
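    // Roughly, the temporary shape produced here (before the blocks are
    // unhooked again below) is:
    //   preheader -> vector.scevcheck -> vector.memcheck -> loop header
    // with either check block missing when it is not needed.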
1968 if (!UnionPred.isAlwaysTrue()) { 1969 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1970 nullptr, "vector.scevcheck"); 1971 1972 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1973 &UnionPred, SCEVCheckBlock->getTerminator()); 1974 } 1975 1976 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1977 if (RtPtrChecking.Need) { 1978 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1979 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1980 "vector.memcheck"); 1981 1982 auto DiffChecks = RtPtrChecking.getDiffChecks(); 1983 if (DiffChecks) { 1984 MemRuntimeCheckCond = addDiffRuntimeChecks( 1985 MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp, 1986 [VF](IRBuilderBase &B, unsigned Bits) { 1987 return getRuntimeVF(B, B.getIntNTy(Bits), VF); 1988 }, 1989 IC); 1990 } else { 1991 MemRuntimeCheckCond = 1992 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1993 RtPtrChecking.getChecks(), MemCheckExp); 1994 } 1995 assert(MemRuntimeCheckCond && 1996 "no RT checks generated although RtPtrChecking " 1997 "claimed checks are required"); 1998 } 1999 2000 if (!MemCheckBlock && !SCEVCheckBlock) 2001 return; 2002 2003 // Unhook the temporary block with the checks, update various places 2004 // accordingly. 2005 if (SCEVCheckBlock) 2006 SCEVCheckBlock->replaceAllUsesWith(Preheader); 2007 if (MemCheckBlock) 2008 MemCheckBlock->replaceAllUsesWith(Preheader); 2009 2010 if (SCEVCheckBlock) { 2011 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2012 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 2013 Preheader->getTerminator()->eraseFromParent(); 2014 } 2015 if (MemCheckBlock) { 2016 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2017 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2018 Preheader->getTerminator()->eraseFromParent(); 2019 } 2020 2021 DT->changeImmediateDominator(LoopHeader, Preheader); 2022 if (MemCheckBlock) { 2023 DT->eraseNode(MemCheckBlock); 2024 LI->removeBlock(MemCheckBlock); 2025 } 2026 if (SCEVCheckBlock) { 2027 DT->eraseNode(SCEVCheckBlock); 2028 LI->removeBlock(SCEVCheckBlock); 2029 } 2030 } 2031 2032 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2033 /// unused. 2034 ~GeneratedRTChecks() { 2035 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2036 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2037 if (!SCEVCheckCond) 2038 SCEVCleaner.markResultUsed(); 2039 2040 if (!MemRuntimeCheckCond) 2041 MemCheckCleaner.markResultUsed(); 2042 2043 if (MemRuntimeCheckCond) { 2044 auto &SE = *MemCheckExp.getSE(); 2045 // Memory runtime check generation creates compares that use expanded 2046 // values. Remove them before running the SCEVExpanderCleaners. 2047 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2048 if (MemCheckExp.isInsertedInstruction(&I)) 2049 continue; 2050 SE.forgetValue(&I); 2051 I.eraseFromParent(); 2052 } 2053 } 2054 MemCheckCleaner.cleanup(); 2055 SCEVCleaner.cleanup(); 2056 2057 if (SCEVCheckCond) 2058 SCEVCheckBlock->eraseFromParent(); 2059 if (MemRuntimeCheckCond) 2060 MemCheckBlock->eraseFromParent(); 2061 } 2062 2063 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2064 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2065 /// depending on the generated condition. 
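  /// For illustration, after this runs the relevant part of the CFG looks
  /// roughly like:
  ///   pred -> vector.scevcheck --(cond true)--> \p Bypass
  ///                             \-(cond false)-> vector preheader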
2066 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2067 BasicBlock *LoopVectorPreHeader, 2068 BasicBlock *LoopExitBlock) { 2069 if (!SCEVCheckCond) 2070 return nullptr; 2071 2072 Value *Cond = SCEVCheckCond; 2073 // Mark the check as used, to prevent it from being removed during cleanup. 2074 SCEVCheckCond = nullptr; 2075 if (auto *C = dyn_cast<ConstantInt>(Cond)) 2076 if (C->isZero()) 2077 return nullptr; 2078 2079 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2080 2081 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2082 // Create new preheader for vector loop. 2083 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2084 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2085 2086 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2087 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2088 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2089 SCEVCheckBlock); 2090 2091 DT->addNewBlock(SCEVCheckBlock, Pred); 2092 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2093 2094 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), 2095 BranchInst::Create(Bypass, LoopVectorPreHeader, Cond)); 2096 return SCEVCheckBlock; 2097 } 2098 2099 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2100 /// the branches to branch to the vector preheader or \p Bypass, depending on 2101 /// the generated condition. 2102 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2103 BasicBlock *LoopVectorPreHeader) { 2104 // Check if we generated code that checks in runtime if arrays overlap. 2105 if (!MemRuntimeCheckCond) 2106 return nullptr; 2107 2108 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2109 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2110 MemCheckBlock); 2111 2112 DT->addNewBlock(MemCheckBlock, Pred); 2113 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2114 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2115 2116 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2117 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2118 2119 ReplaceInstWithInst( 2120 MemCheckBlock->getTerminator(), 2121 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2122 MemCheckBlock->getTerminator()->setDebugLoc( 2123 Pred->getTerminator()->getDebugLoc()); 2124 2125 // Mark the check as used, to prevent it from being removed during cleanup. 2126 MemRuntimeCheckCond = nullptr; 2127 return MemCheckBlock; 2128 } 2129 }; 2130 2131 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2132 // vectorization. The loop needs to be annotated with #pragma omp simd 2133 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2134 // vector length information is not provided, vectorization is not considered 2135 // explicit. Interleave hints are not allowed either. These limitations will be 2136 // relaxed in the future. 2137 // Please, note that we are currently forced to abuse the pragma 'clang 2138 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2139 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2140 // provides *explicit vectorization hints* (LV can bypass legal checks and 2141 // assume that vectorization is legal). However, both hints are implemented 2142 // using the same metadata (llvm.loop.vectorize, processed by 2143 // LoopVectorizeHints). This will be fixed in the future when the native IR 2144 // representation for pragma 'omp simd' is introduced. 
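// For illustration, an outer loop that this function would accept looks
// roughly like:
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < M; ++i)     // outer loop, explicitly annotated
//     for (int j = 0; j < N; ++j)
//       A[i][j] += B[i][j];
// where the interleave count is left at its default.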
2145 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2146 OptimizationRemarkEmitter *ORE) { 2147 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2148 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2149 2150 // Only outer loops with an explicit vectorization hint are supported. 2151 // Unannotated outer loops are ignored. 2152 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2153 return false; 2154 2155 Function *Fn = OuterLp->getHeader()->getParent(); 2156 if (!Hints.allowVectorization(Fn, OuterLp, 2157 true /*VectorizeOnlyWhenForced*/)) { 2158 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2159 return false; 2160 } 2161 2162 if (Hints.getInterleave() > 1) { 2163 // TODO: Interleave support is future work. 2164 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2165 "outer loops.\n"); 2166 Hints.emitRemarkWithHints(); 2167 return false; 2168 } 2169 2170 return true; 2171 } 2172 2173 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2174 OptimizationRemarkEmitter *ORE, 2175 SmallVectorImpl<Loop *> &V) { 2176 // Collect inner loops and outer loops without irreducible control flow. For 2177 // now, only collect outer loops that have explicit vectorization hints. If we 2178 // are stress testing the VPlan H-CFG construction, we collect the outermost 2179 // loop of every loop nest. 2180 if (L.isInnermost() || VPlanBuildStressTest || 2181 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2182 LoopBlocksRPO RPOT(&L); 2183 RPOT.perform(LI); 2184 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2185 V.push_back(&L); 2186 // TODO: Collect inner loops inside marked outer loops in case 2187 // vectorization fails for the outer loop. Do not invoke 2188 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2189 // already known to be reducible. We can use an inherited attribute for 2190 // that. 2191 return; 2192 } 2193 } 2194 for (Loop *InnerL : L) 2195 collectSupportedLoops(*InnerL, LI, ORE, V); 2196 } 2197 2198 namespace { 2199 2200 /// The LoopVectorize Pass. 2201 struct LoopVectorize : public FunctionPass { 2202 /// Pass identification, replacement for typeid 2203 static char ID; 2204 2205 LoopVectorizePass Impl; 2206 2207 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2208 bool VectorizeOnlyWhenForced = false) 2209 : FunctionPass(ID), 2210 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2211 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2212 } 2213 2214 bool runOnFunction(Function &F) override { 2215 if (skipFunction(F)) 2216 return false; 2217 2218 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2219 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2220 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2221 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2222 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2223 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2224 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2225 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2226 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2227 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2228 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2229 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2230 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2231 2232 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2233 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2234 2235 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2236 GetLAA, *ORE, PSI).MadeAnyChange; 2237 } 2238 2239 void getAnalysisUsage(AnalysisUsage &AU) const override { 2240 AU.addRequired<AssumptionCacheTracker>(); 2241 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2242 AU.addRequired<DominatorTreeWrapperPass>(); 2243 AU.addRequired<LoopInfoWrapperPass>(); 2244 AU.addRequired<ScalarEvolutionWrapperPass>(); 2245 AU.addRequired<TargetTransformInfoWrapperPass>(); 2246 AU.addRequired<AAResultsWrapperPass>(); 2247 AU.addRequired<LoopAccessLegacyAnalysis>(); 2248 AU.addRequired<DemandedBitsWrapperPass>(); 2249 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2250 AU.addRequired<InjectTLIMappingsLegacy>(); 2251 2252 // We currently do not preserve loopinfo/dominator analyses with outer loop 2253 // vectorization. Until this is addressed, mark these analyses as preserved 2254 // only for non-VPlan-native path. 2255 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2256 if (!EnableVPlanNativePath) { 2257 AU.addPreserved<LoopInfoWrapperPass>(); 2258 AU.addPreserved<DominatorTreeWrapperPass>(); 2259 } 2260 2261 AU.addPreserved<BasicAAWrapperPass>(); 2262 AU.addPreserved<GlobalsAAWrapperPass>(); 2263 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2264 } 2265 }; 2266 2267 } // end anonymous namespace 2268 2269 //===----------------------------------------------------------------------===// 2270 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2271 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2272 //===----------------------------------------------------------------------===// 2273 2274 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2275 // We need to place the broadcast of invariant variables outside the loop, 2276 // but only if it's proven safe to do so. Else, broadcast will be inside 2277 // vector loop body. 2278 Instruction *Instr = dyn_cast<Instruction>(V); 2279 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2280 (!Instr || 2281 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2282 // Place the code for broadcasting invariant variables in the new preheader. 2283 IRBuilder<>::InsertPointGuard Guard(Builder); 2284 if (SafeToHoist) 2285 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2286 2287 // Broadcast the scalar into all locations in the vector. 2288 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2289 2290 return Shuf; 2291 } 2292 2293 /// This function adds 2294 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2295 /// to each vector element of Val. The sequence starts at StartIndex. 2296 /// \p Opcode is relevant for FP induction variable. 
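/// For example (illustrative, assuming a fixed VF of 4 and an integer IV):
/// with Val = <%iv, %iv, %iv, %iv>, StartIdx = 0 and Step = 2, the result is
/// <%iv + 0, %iv + 2, %iv + 4, %iv + 6>.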
2297 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2298 Instruction::BinaryOps BinOp, ElementCount VF, 2299 IRBuilderBase &Builder) { 2300 assert(VF.isVector() && "only vector VFs are supported"); 2301 2302 // Create and check the types. 2303 auto *ValVTy = cast<VectorType>(Val->getType()); 2304 ElementCount VLen = ValVTy->getElementCount(); 2305 2306 Type *STy = Val->getType()->getScalarType(); 2307 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2308 "Induction Step must be an integer or FP"); 2309 assert(Step->getType() == STy && "Step has wrong type"); 2310 2311 SmallVector<Constant *, 8> Indices; 2312 2313 // Create a vector of consecutive numbers from zero to VF. 2314 VectorType *InitVecValVTy = ValVTy; 2315 if (STy->isFloatingPointTy()) { 2316 Type *InitVecValSTy = 2317 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2318 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2319 } 2320 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2321 2322 // Splat the StartIdx 2323 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2324 2325 if (STy->isIntegerTy()) { 2326 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2327 Step = Builder.CreateVectorSplat(VLen, Step); 2328 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2329 // FIXME: The newly created binary instructions should contain nsw/nuw 2330 // flags, which can be found from the original scalar operations. 2331 Step = Builder.CreateMul(InitVec, Step); 2332 return Builder.CreateAdd(Val, Step, "induction"); 2333 } 2334 2335 // Floating point induction. 2336 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2337 "Binary Opcode should be specified for FP induction"); 2338 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2339 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2340 2341 Step = Builder.CreateVectorSplat(VLen, Step); 2342 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2343 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2344 } 2345 2346 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 2347 /// variable on which to base the steps, \p Step is the size of the step. 2348 static void buildScalarSteps(Value *ScalarIV, Value *Step, 2349 const InductionDescriptor &ID, VPValue *Def, 2350 VPTransformState &State) { 2351 IRBuilderBase &Builder = State.Builder; 2352 // We shouldn't have to build scalar steps if we aren't vectorizing. 2353 assert(State.VF.isVector() && "VF should be greater than one"); 2354 // Get the value type and ensure it and the step have the same integer type. 2355 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2356 assert(ScalarIVTy == Step->getType() && 2357 "Val and Step should have the same type"); 2358 2359 // We build scalar steps for both integer and floating-point induction 2360 // variables. Here, we determine the kind of arithmetic we will perform. 2361 Instruction::BinaryOps AddOp; 2362 Instruction::BinaryOps MulOp; 2363 if (ScalarIVTy->isIntegerTy()) { 2364 AddOp = Instruction::Add; 2365 MulOp = Instruction::Mul; 2366 } else { 2367 AddOp = ID.getInductionOpcode(); 2368 MulOp = Instruction::FMul; 2369 } 2370 2371 // Determine the number of scalars we need to generate for each unroll 2372 // iteration. 2373 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); 2374 unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 2375 // Compute the scalar steps and save the results in State. 
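  // Illustrative sketch (assuming a fixed VF of 4, UF = 2, an integer step S
  // and all lanes being needed): part 0 produces the lane values
  // ScalarIV + {0,1,2,3} * S and part 1 produces ScalarIV + {4,5,6,7} * S.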
2376 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2377 ScalarIVTy->getScalarSizeInBits()); 2378 Type *VecIVTy = nullptr; 2379 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2380 if (!FirstLaneOnly && State.VF.isScalable()) { 2381 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2382 UnitStepVec = 2383 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2384 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2385 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2386 } 2387 2388 for (unsigned Part = 0; Part < State.UF; ++Part) { 2389 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2390 2391 if (!FirstLaneOnly && State.VF.isScalable()) { 2392 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2393 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2394 if (ScalarIVTy->isFloatingPointTy()) 2395 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2396 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2397 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2398 State.set(Def, Add, Part); 2399 // It's useful to record the lane values too for the known minimum number 2400 // of elements so we do those below. This improves the code quality when 2401 // trying to extract the first element, for example. 2402 } 2403 2404 if (ScalarIVTy->isFloatingPointTy()) 2405 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2406 2407 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2408 Value *StartIdx = Builder.CreateBinOp( 2409 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2410 // The step returned by `createStepForVF` is a runtime-evaluated value 2411 // when VF is scalable. Otherwise, it should be folded into a Constant. 2412 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2413 "Expected StartIdx to be folded to a constant when VF is not " 2414 "scalable"); 2415 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2416 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2417 State.set(Def, Add, VPIteration(Part, Lane)); 2418 } 2419 } 2420 } 2421 2422 // Generate code for the induction step. Note that induction steps are 2423 // required to be loop-invariant 2424 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, 2425 Instruction *InsertBefore, 2426 Loop *OrigLoop = nullptr) { 2427 const DataLayout &DL = SE.getDataLayout(); 2428 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && 2429 "Induction step should be loop invariant"); 2430 if (auto *E = dyn_cast<SCEVUnknown>(Step)) 2431 return E->getValue(); 2432 2433 SCEVExpander Exp(SE, DL, "induction"); 2434 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); 2435 } 2436 2437 /// Compute the transformed value of Index at offset StartValue using step 2438 /// StepValue. 2439 /// For integer induction, returns StartValue + Index * StepValue. 2440 /// For pointer induction, returns StartValue[Index * StepValue]. 2441 /// FIXME: The newly created binary instructions should contain nsw/nuw 2442 /// flags, which can be found from the original scalar operations. 2443 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, 2444 Value *StartValue, Value *Step, 2445 const InductionDescriptor &ID) { 2446 assert(Index->getType()->getScalarType() == Step->getType() && 2447 "Index scalar type does not match StepValue type"); 2448 2449 // Note: the IR at this point is broken. 
We cannot use SE to create any new 2450 // SCEV and then expand it, hoping that SCEV's simplification will give us 2451 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2452 // lead to various SCEV crashes. So all we can do is to use builder and rely 2453 // on InstCombine for future simplifications. Here we handle some trivial 2454 // cases only. 2455 auto CreateAdd = [&B](Value *X, Value *Y) { 2456 assert(X->getType() == Y->getType() && "Types don't match!"); 2457 if (auto *CX = dyn_cast<ConstantInt>(X)) 2458 if (CX->isZero()) 2459 return Y; 2460 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2461 if (CY->isZero()) 2462 return X; 2463 return B.CreateAdd(X, Y); 2464 }; 2465 2466 // We allow X to be a vector type, in which case Y will potentially be 2467 // splatted into a vector with the same element count. 2468 auto CreateMul = [&B](Value *X, Value *Y) { 2469 assert(X->getType()->getScalarType() == Y->getType() && 2470 "Types don't match!"); 2471 if (auto *CX = dyn_cast<ConstantInt>(X)) 2472 if (CX->isOne()) 2473 return Y; 2474 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2475 if (CY->isOne()) 2476 return X; 2477 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2478 if (XVTy && !isa<VectorType>(Y->getType())) 2479 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2480 return B.CreateMul(X, Y); 2481 }; 2482 2483 switch (ID.getKind()) { 2484 case InductionDescriptor::IK_IntInduction: { 2485 assert(!isa<VectorType>(Index->getType()) && 2486 "Vector indices not supported for integer inductions yet"); 2487 assert(Index->getType() == StartValue->getType() && 2488 "Index type does not match StartValue type"); 2489 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2490 return B.CreateSub(StartValue, Index); 2491 auto *Offset = CreateMul(Index, Step); 2492 return CreateAdd(StartValue, Offset); 2493 } 2494 case InductionDescriptor::IK_PtrInduction: { 2495 assert(isa<Constant>(Step) && 2496 "Expected constant step for pointer induction"); 2497 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); 2498 } 2499 case InductionDescriptor::IK_FpInduction: { 2500 assert(!isa<VectorType>(Index->getType()) && 2501 "Vector indices not supported for FP inductions yet"); 2502 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2503 auto InductionBinOp = ID.getInductionBinOp(); 2504 assert(InductionBinOp && 2505 (InductionBinOp->getOpcode() == Instruction::FAdd || 2506 InductionBinOp->getOpcode() == Instruction::FSub) && 2507 "Original bin op should be defined for FP induction"); 2508 2509 Value *MulExp = B.CreateFMul(Step, Index); 2510 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2511 "induction"); 2512 } 2513 case InductionDescriptor::IK_NoInduction: 2514 return nullptr; 2515 } 2516 llvm_unreachable("invalid enum"); 2517 } 2518 2519 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2520 const VPIteration &Instance, 2521 VPTransformState &State) { 2522 Value *ScalarInst = State.get(Def, Instance); 2523 Value *VectorValue = State.get(Def, Instance.Part); 2524 VectorValue = Builder.CreateInsertElement( 2525 VectorValue, ScalarInst, 2526 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2527 State.set(Def, VectorValue, Instance.Part); 2528 } 2529 2530 // Return whether we allow using masked interleave-groups (for dealing with 2531 // strided loads/stores that reside in predicated blocks, or for dealing 2532 // with gaps). 
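// For illustration (made-up access pattern): a group that loads A[3*i] and
// A[3*i+2] but has no member for A[3*i+1] has a gap, so the wide load has to
// be masked so the lanes of the missing member are not accessed; similarly,
// members sitting in a predicated block need the block mask applied to the
// wide access.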
2533 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2534 // If an override option has been passed in for interleaved accesses, use it. 2535 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2536 return EnableMaskedInterleavedMemAccesses; 2537 2538 return TTI.enableMaskedInterleavedAccessVectorization(); 2539 } 2540 2541 // Try to vectorize the interleave group that \p Instr belongs to. 2542 // 2543 // E.g. Translate following interleaved load group (factor = 3): 2544 // for (i = 0; i < N; i+=3) { 2545 // R = Pic[i]; // Member of index 0 2546 // G = Pic[i+1]; // Member of index 1 2547 // B = Pic[i+2]; // Member of index 2 2548 // ... // do something to R, G, B 2549 // } 2550 // To: 2551 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2552 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2553 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2554 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2555 // 2556 // Or translate following interleaved store group (factor = 3): 2557 // for (i = 0; i < N; i+=3) { 2558 // ... do something to R, G, B 2559 // Pic[i] = R; // Member of index 0 2560 // Pic[i+1] = G; // Member of index 1 2561 // Pic[i+2] = B; // Member of index 2 2562 // } 2563 // To: 2564 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2565 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2566 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2567 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2568 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2569 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2570 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2571 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2572 VPValue *BlockInMask) { 2573 Instruction *Instr = Group->getInsertPos(); 2574 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2575 2576 // Prepare for the vector type of the interleaved load/store. 2577 Type *ScalarTy = getLoadStoreType(Instr); 2578 unsigned InterleaveFactor = Group->getFactor(); 2579 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2580 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2581 2582 // Prepare for the new pointers. 2583 SmallVector<Value *, 2> AddrParts; 2584 unsigned Index = Group->getIndex(Instr); 2585 2586 // TODO: extend the masked interleaved-group support to reversed access. 2587 assert((!BlockInMask || !Group->isReverse()) && 2588 "Reversed masked interleave-group not supported."); 2589 2590 // If the group is reverse, adjust the index to refer to the last vector lane 2591 // instead of the first. We adjust the index from the first vector lane, 2592 // rather than directly getting the pointer for lane VF - 1, because the 2593 // pointer operand of the interleaved access is supposed to be uniform. For 2594 // uniform instructions, we're only required to generate a value for the 2595 // first vector lane in each unroll iteration. 2596 if (Group->isReverse()) 2597 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2598 2599 for (unsigned Part = 0; Part < UF; Part++) { 2600 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2601 setDebugLocFromInst(AddrPart); 2602 2603 // Notice current instruction could be any index. Need to adjust the address 2604 // to the member of index 0. 2605 // 2606 // E.g. 
a = A[i+1]; // Member of index 1 (Current instruction) 2607 // b = A[i]; // Member of index 0 2608 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2609 // 2610 // E.g. A[i+1] = a; // Member of index 1 2611 // A[i] = b; // Member of index 0 2612 // A[i+2] = c; // Member of index 2 (Current instruction) 2613 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2614 2615 bool InBounds = false; 2616 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2617 InBounds = gep->isInBounds(); 2618 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2619 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2620 2621 // Cast to the vector pointer type. 2622 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2623 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2624 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2625 } 2626 2627 setDebugLocFromInst(Instr); 2628 Value *PoisonVec = PoisonValue::get(VecTy); 2629 2630 Value *MaskForGaps = nullptr; 2631 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2632 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2633 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2634 } 2635 2636 // Vectorize the interleaved load group. 2637 if (isa<LoadInst>(Instr)) { 2638 // For each unroll part, create a wide load for the group. 2639 SmallVector<Value *, 2> NewLoads; 2640 for (unsigned Part = 0; Part < UF; Part++) { 2641 Instruction *NewLoad; 2642 if (BlockInMask || MaskForGaps) { 2643 assert(useMaskedInterleavedAccesses(*TTI) && 2644 "masked interleaved groups are not allowed."); 2645 Value *GroupMask = MaskForGaps; 2646 if (BlockInMask) { 2647 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2648 Value *ShuffledMask = Builder.CreateShuffleVector( 2649 BlockInMaskPart, 2650 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2651 "interleaved.mask"); 2652 GroupMask = MaskForGaps 2653 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2654 MaskForGaps) 2655 : ShuffledMask; 2656 } 2657 NewLoad = 2658 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2659 GroupMask, PoisonVec, "wide.masked.vec"); 2660 } 2661 else 2662 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2663 Group->getAlign(), "wide.vec"); 2664 Group->addMetadata(NewLoad); 2665 NewLoads.push_back(NewLoad); 2666 } 2667 2668 // For each member in the group, shuffle out the appropriate data from the 2669 // wide loads. 2670 unsigned J = 0; 2671 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2672 Instruction *Member = Group->getMember(I); 2673 2674 // Skip the gaps in the group. 2675 if (!Member) 2676 continue; 2677 2678 auto StrideMask = 2679 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2680 for (unsigned Part = 0; Part < UF; Part++) { 2681 Value *StridedVec = Builder.CreateShuffleVector( 2682 NewLoads[Part], StrideMask, "strided.vec"); 2683 2684 // If this member has different type, cast the result type. 
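        // (Illustrative: if the insert position loads i32 but this member
        // loads float, then with VF = 4 the strided vector is re-cast from
        // <4 x i32> to <4 x float> here.)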
2685 if (Member->getType() != ScalarTy) { 2686 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2687 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2688 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2689 } 2690 2691 if (Group->isReverse()) 2692 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2693 2694 State.set(VPDefs[J], StridedVec, Part); 2695 } 2696 ++J; 2697 } 2698 return; 2699 } 2700 2701 // The sub vector type for current instruction. 2702 auto *SubVT = VectorType::get(ScalarTy, VF); 2703 2704 // Vectorize the interleaved store group. 2705 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2706 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2707 "masked interleaved groups are not allowed."); 2708 assert((!MaskForGaps || !VF.isScalable()) && 2709 "masking gaps for scalable vectors is not yet supported."); 2710 for (unsigned Part = 0; Part < UF; Part++) { 2711 // Collect the stored vector from each member. 2712 SmallVector<Value *, 4> StoredVecs; 2713 for (unsigned i = 0; i < InterleaveFactor; i++) { 2714 assert((Group->getMember(i) || MaskForGaps) && 2715 "Fail to get a member from an interleaved store group"); 2716 Instruction *Member = Group->getMember(i); 2717 2718 // Skip the gaps in the group. 2719 if (!Member) { 2720 Value *Undef = PoisonValue::get(SubVT); 2721 StoredVecs.push_back(Undef); 2722 continue; 2723 } 2724 2725 Value *StoredVec = State.get(StoredValues[i], Part); 2726 2727 if (Group->isReverse()) 2728 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2729 2730 // If this member has different type, cast it to a unified type. 2731 2732 if (StoredVec->getType() != SubVT) 2733 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2734 2735 StoredVecs.push_back(StoredVec); 2736 } 2737 2738 // Concatenate all vectors into a wide vector. 2739 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2740 2741 // Interleave the elements in the wide vector. 2742 Value *IVec = Builder.CreateShuffleVector( 2743 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2744 "interleaved.vec"); 2745 2746 Instruction *NewStoreInstr; 2747 if (BlockInMask || MaskForGaps) { 2748 Value *GroupMask = MaskForGaps; 2749 if (BlockInMask) { 2750 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2751 Value *ShuffledMask = Builder.CreateShuffleVector( 2752 BlockInMaskPart, 2753 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2754 "interleaved.mask"); 2755 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2756 ShuffledMask, MaskForGaps) 2757 : ShuffledMask; 2758 } 2759 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2760 Group->getAlign(), GroupMask); 2761 } else 2762 NewStoreInstr = 2763 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2764 2765 Group->addMetadata(NewStoreInstr); 2766 } 2767 } 2768 2769 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2770 VPReplicateRecipe *RepRecipe, 2771 const VPIteration &Instance, 2772 bool IfPredicateInstr, 2773 VPTransformState &State) { 2774 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2775 2776 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2777 // the first lane and part. 2778 if (isa<NoAliasScopeDeclInst>(Instr)) 2779 if (!Instance.isFirstIteration()) 2780 return; 2781 2782 // Does this instruction return a value ? 
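  // (If it does, the clone created below keeps the original name plus a
  // ".cloned" suffix; e.g. a scalarized %idx becomes %idx.cloned for the
  // requested lane and part -- the value name here is purely illustrative.)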
2783 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2784 2785 Instruction *Cloned = Instr->clone(); 2786 if (!IsVoidRetTy) 2787 Cloned->setName(Instr->getName() + ".cloned"); 2788 2789 // If the scalarized instruction contributes to the address computation of a 2790 // widen masked load/store which was in a basic block that needed predication 2791 // and is not predicated after vectorization, we can't propagate 2792 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2793 // instruction could feed a poison value to the base address of the widen 2794 // load/store. 2795 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 2796 Cloned->dropPoisonGeneratingFlags(); 2797 2798 if (Instr->getDebugLoc()) 2799 setDebugLocFromInst(Instr); 2800 2801 // Replace the operands of the cloned instructions with their scalar 2802 // equivalents in the new loop. 2803 for (auto &I : enumerate(RepRecipe->operands())) { 2804 auto InputInstance = Instance; 2805 VPValue *Operand = I.value(); 2806 VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand); 2807 if (OperandR && OperandR->isUniform()) 2808 InputInstance.Lane = VPLane::getFirstLane(); 2809 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2810 } 2811 addNewMetadata(Cloned, Instr); 2812 2813 // Place the cloned scalar in the new loop. 2814 State.Builder.Insert(Cloned); 2815 2816 State.set(RepRecipe, Cloned, Instance); 2817 2818 // If we just cloned a new assumption, add it the assumption cache. 2819 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2820 AC->registerAssumption(II); 2821 2822 // End if-block. 2823 if (IfPredicateInstr) 2824 PredicatedInstructions.push_back(Cloned); 2825 } 2826 2827 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) { 2828 if (TripCount) 2829 return TripCount; 2830 2831 assert(InsertBlock); 2832 IRBuilder<> Builder(InsertBlock->getTerminator()); 2833 // Find the loop boundaries. 2834 ScalarEvolution *SE = PSE.getSE(); 2835 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2836 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 2837 "Invalid loop count"); 2838 2839 Type *IdxTy = Legal->getWidestInductionType(); 2840 assert(IdxTy && "No type for induction"); 2841 2842 // The exit count might have the type of i64 while the phi is i32. This can 2843 // happen if we have an induction variable that is sign extended before the 2844 // compare. The only way that we get a backedge taken count is that the 2845 // induction variable was signed and as such will not overflow. In such a case 2846 // truncation is legal. 2847 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2848 IdxTy->getPrimitiveSizeInBits()) 2849 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2850 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2851 2852 // Get the total trip count from the count by adding 1. 2853 const SCEV *ExitCount = SE->getAddExpr( 2854 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2855 2856 const DataLayout &DL = InsertBlock->getModule()->getDataLayout(); 2857 2858 // Expand the trip count and place the new instructions in the preheader. 2859 // Notice that the pre-header does not change, only the loop body. 2860 SCEVExpander Exp(*SE, DL, "induction"); 2861 2862 // Count holds the overall loop count (N). 
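  // Illustrative example: for a simple counted loop "for (i = 0; i < n; ++i)"
  // the backedge-taken count is n - 1, so the trip count expanded below is
  // just n, materialized in the preheader (e.g. as the original %n or as a
  // "+ 1" of the expanded backedge-taken count).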
2863 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2864 InsertBlock->getTerminator()); 2865 2866 if (TripCount->getType()->isPointerTy()) 2867 TripCount = 2868 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2869 InsertBlock->getTerminator()); 2870 2871 return TripCount; 2872 } 2873 2874 Value * 2875 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2876 if (VectorTripCount) 2877 return VectorTripCount; 2878 2879 Value *TC = getOrCreateTripCount(InsertBlock); 2880 IRBuilder<> Builder(InsertBlock->getTerminator()); 2881 2882 Type *Ty = TC->getType(); 2883 // This is where we can make the step a runtime constant. 2884 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2885 2886 // If the tail is to be folded by masking, round the number of iterations N 2887 // up to a multiple of Step instead of rounding down. This is done by first 2888 // adding Step-1 and then rounding down. Note that it's ok if this addition 2889 // overflows: the vector induction variable will eventually wrap to zero given 2890 // that it starts at zero and its Step is a power of two; the loop will then 2891 // exit, with the last early-exit vector comparison also producing all-true. 2892 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2893 // is accounted for in emitIterationCountCheck that adds an overflow check. 2894 if (Cost->foldTailByMasking()) { 2895 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2896 "VF*UF must be a power of 2 when folding tail by masking"); 2897 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 2898 TC = Builder.CreateAdd( 2899 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 2900 } 2901 2902 // Now we need to generate the expression for the part of the loop that the 2903 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2904 // iterations are not required for correctness, or N - Step, otherwise. Step 2905 // is equal to the vectorization factor (number of SIMD elements) times the 2906 // unroll factor (number of SIMD instructions). 2907 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2908 2909 // There are cases where we *must* run at least one iteration in the remainder 2910 // loop. See the cost model for when this can happen. If the step evenly 2911 // divides the trip count, we set the remainder to be equal to the step. If 2912 // the step does not evenly divide the trip count, no adjustment is necessary 2913 // since there will already be scalar iterations. Note that the minimum 2914 // iterations check ensures that N >= Step. 2915 if (Cost->requiresScalarEpilogue(VF)) { 2916 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2917 R = Builder.CreateSelect(IsZero, Step, R); 2918 } 2919 2920 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2921 2922 return VectorTripCount; 2923 } 2924 2925 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2926 const DataLayout &DL) { 2927 // Verify that V is a vector type with same number of elements as DstVTy. 
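  // Illustrative sketch of the two cases handled below (element types are
  // examples only): <4 x i32> -> <4 x float> is a single bitcast, whereas
  // casting <4 x double> to a vector of 64-bit pointers cannot be bitcast
  // directly and goes through an intermediate <4 x i64> (bitcast, then
  // inttoptr).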
2928 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 2929 unsigned VF = DstFVTy->getNumElements(); 2930 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 2931 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2932 Type *SrcElemTy = SrcVecTy->getElementType(); 2933 Type *DstElemTy = DstFVTy->getElementType(); 2934 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2935 "Vector elements must have same size"); 2936 2937 // Do a direct cast if element types are castable. 2938 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2939 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2940 } 2941 // V cannot be directly casted to desired vector type. 2942 // May happen when V is a floating point vector but DstVTy is a vector of 2943 // pointers or vice-versa. Handle this using a two-step bitcast using an 2944 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2945 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2946 "Only one type should be a pointer type"); 2947 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2948 "Only one type should be a floating point type"); 2949 Type *IntTy = 2950 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2951 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2952 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2953 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2954 } 2955 2956 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2957 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 2958 // Reuse existing vector loop preheader for TC checks. 2959 // Note that new preheader block is generated for vector loop. 2960 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2961 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2962 2963 // Generate code to check if the loop's trip count is less than VF * UF, or 2964 // equal to it in case a scalar epilogue is required; this implies that the 2965 // vector trip count is zero. This check also covers the case where adding one 2966 // to the backedge-taken count overflowed leading to an incorrect trip count 2967 // of zero. In this case we will also jump to the scalar loop. 2968 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 2969 : ICmpInst::ICMP_ULT; 2970 2971 // If tail is to be folded, vector loop takes care of all iterations. 2972 Type *CountTy = Count->getType(); 2973 Value *CheckMinIters = Builder.getFalse(); 2974 Value *Step = createStepForVF(Builder, CountTy, VF, UF); 2975 if (!Cost->foldTailByMasking()) 2976 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 2977 else if (VF.isScalable()) { 2978 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2979 // an overflow to zero when updating induction variables and so an 2980 // additional overflow check is required before entering the vector loop. 2981 2982 // Get the maximum unsigned value for the type. 2983 Value *MaxUIntTripCount = 2984 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 2985 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 2986 2987 // Don't execute the vector loop if (UMax - n) < (VF * UF). 2988 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step); 2989 } 2990 // Create new preheader for vector loop. 
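  // A sketch of the check built above for a fixed VF=4, UF=2 (the block and
  // comparison names are the ones used here; the operand name and constant
  // are illustrative):
  //   %min.iters.check = icmp ult i64 %trip.count, 8   ; ule if a scalar
  //                                                    ; epilogue is required
  //   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
  // The SplitBlock below creates that new "vector.ph" block, and the branch
  // replacing the old terminator is emitted at the end of this function.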
2991 LoopVectorPreHeader = 2992 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2993 "vector.ph"); 2994 2995 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2996 DT->getNode(Bypass)->getIDom()) && 2997 "TC check is expected to dominate Bypass"); 2998 2999 // Update dominator for Bypass & LoopExit (if needed). 3000 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3001 if (!Cost->requiresScalarEpilogue(VF)) 3002 // If there is an epilogue which must run, there's no edge from the 3003 // middle block to exit blocks and thus no need to update the immediate 3004 // dominator of the exit blocks. 3005 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3006 3007 ReplaceInstWithInst( 3008 TCCheckBlock->getTerminator(), 3009 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3010 LoopBypassBlocks.push_back(TCCheckBlock); 3011 } 3012 3013 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 3014 3015 BasicBlock *const SCEVCheckBlock = 3016 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); 3017 if (!SCEVCheckBlock) 3018 return nullptr; 3019 3020 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3021 (OptForSizeBasedOnProfile && 3022 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3023 "Cannot SCEV check stride or overflow when optimizing for size"); 3024 3025 3026 // Update dominator only if this is first RT check. 3027 if (LoopBypassBlocks.empty()) { 3028 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3029 if (!Cost->requiresScalarEpilogue(VF)) 3030 // If there is an epilogue which must run, there's no edge from the 3031 // middle block to exit blocks and thus no need to update the immediate 3032 // dominator of the exit blocks. 3033 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3034 } 3035 3036 LoopBypassBlocks.push_back(SCEVCheckBlock); 3037 AddedSafetyChecks = true; 3038 return SCEVCheckBlock; 3039 } 3040 3041 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { 3042 // VPlan-native path does not do any analysis for runtime checks currently. 3043 if (EnableVPlanNativePath) 3044 return nullptr; 3045 3046 BasicBlock *const MemCheckBlock = 3047 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); 3048 3049 // Check if we generated code that checks in runtime if arrays overlap. We put 3050 // the checks into a separate block to make the more common case of few 3051 // elements faster. 3052 if (!MemCheckBlock) 3053 return nullptr; 3054 3055 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3056 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3057 "Cannot emit memory checks when optimizing for size, unless forced " 3058 "to vectorize."); 3059 ORE->emit([&]() { 3060 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3061 OrigLoop->getStartLoc(), 3062 OrigLoop->getHeader()) 3063 << "Code-size may be reduced by not forcing " 3064 "vectorization, or by source-code modifications " 3065 "eliminating the need for runtime checks " 3066 "(e.g., adding 'restrict')."; 3067 }); 3068 } 3069 3070 LoopBypassBlocks.push_back(MemCheckBlock); 3071 3072 AddedSafetyChecks = true; 3073 3074 // Only use noalias metadata when using memory checks guaranteeing no overlap 3075 // across all iterations. 
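  // A note on the condition below: "diff checks" are the cheaper
  // SCEV-difference based runtime checks, which only guarantee independence
  // within a single vector iteration rather than across the whole loop, so
  // the stronger per-pointer-group checks are a prerequisite for attaching
  // the noalias metadata via LoopVersioning here.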
3076   if (!Legal->getLAI()->getRuntimePointerChecking()->getDiffChecks()) {
3077     // We currently don't use LoopVersioning for the actual loop cloning but we
3078     // still use it to add the noalias metadata.
3079     LVer = std::make_unique<LoopVersioning>(
3080         *Legal->getLAI(),
3081         Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3082         DT, PSE.getSE());
3083     LVer->prepareNoAliasMetadata();
3084   }
3085   return MemCheckBlock;
3086 }
3087
3088 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3089   LoopScalarBody = OrigLoop->getHeader();
3090   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3091   assert(LoopVectorPreHeader && "Invalid loop structure");
3092   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3093   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3094          "multiple exit loop without required epilogue?");
3095
3096   LoopMiddleBlock =
3097       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3098                  LI, nullptr, Twine(Prefix) + "middle.block");
3099   LoopScalarPreHeader =
3100       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3101                  nullptr, Twine(Prefix) + "scalar.ph");
3102
3103   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3104
3105   // Set up the middle block terminator. Two cases:
3106   // 1) If we know that we must execute the scalar epilogue, emit an
3107   //    unconditional branch.
3108   // 2) Otherwise, we must have a single unique exit block (due to how we
3109   //    implement the multiple exit case). In this case, set up a conditional
3110   //    branch from the middle block to the loop scalar preheader, and the
3111   //    exit block. completeLoopSkeleton will update the condition to use an
3112   //    iteration check, if required to decide whether to execute the remainder.
3113   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3114     BranchInst::Create(LoopScalarPreHeader) :
3115     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3116                        Builder.getTrue());
3117   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3118   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3119
3120   // Update dominator for loop exit. During skeleton creation, only the vector
3121   // pre-header and the middle block are created. The vector loop is entirely
3122   // created during VPlan execution.
3123   if (!Cost->requiresScalarEpilogue(VF))
3124     // If there is an epilogue which must run, there's no edge from the
3125     // middle block to exit blocks and thus no need to update the immediate
3126     // dominator of the exit blocks.
3127     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3128 }
3129
3130 void InnerLoopVectorizer::createInductionResumeValues(
3131     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3132   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3133           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3134          "Inconsistent information about additional bypass.");
3135
3136   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3137   assert(VectorTripCount && "Expected valid arguments");
3138   // We are going to resume the execution of the scalar loop.
3139   // Go over all of the induction variables that we found and fix the
3140   // PHIs that are left in the scalar version of the loop.
3141   // The starting values of PHI nodes depend on the counter of the last
3142   // iteration in the vectorized loop.
3143   // If we come from a bypass edge then we need to start from the original
3144   // start value.
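  // Illustrative sketch of a resume phi for the primary induction (block and
  // value names follow the ones created in this file; the constants and the
  // particular set of bypass predecessors are examples only):
  //   scalar.ph:
  //     %bc.resume.val = phi i64 [ %n.vec, %middle.block ],
  //                              [ 0, %vector.memcheck ], [ 0, %entry ]
  // i.e. resume at the vector trip count when the vector loop ran, and at the
  // original start value when arriving from any bypass block.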
3145 Instruction *OldInduction = Legal->getPrimaryInduction(); 3146 for (auto &InductionEntry : Legal->getInductionVars()) { 3147 PHINode *OrigPhi = InductionEntry.first; 3148 InductionDescriptor II = InductionEntry.second; 3149 3150 // Create phi nodes to merge from the backedge-taken check block. 3151 PHINode *BCResumeVal = 3152 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3153 LoopScalarPreHeader->getTerminator()); 3154 // Copy original phi DL over to the new one. 3155 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3156 Value *&EndValue = IVEndValues[OrigPhi]; 3157 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3158 if (OrigPhi == OldInduction) { 3159 // We know what the end value is. 3160 EndValue = VectorTripCount; 3161 } else { 3162 IRBuilder<> B(LoopVectorPreHeader->getTerminator()); 3163 3164 // Fast-math-flags propagate from the original induction instruction. 3165 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3166 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3167 3168 Type *StepType = II.getStep()->getType(); 3169 Instruction::CastOps CastOp = 3170 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3171 Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc"); 3172 Value *Step = 3173 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3174 EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); 3175 EndValue->setName("ind.end"); 3176 3177 // Compute the end value for the additional bypass (if applicable). 3178 if (AdditionalBypass.first) { 3179 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3180 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3181 StepType, true); 3182 Value *Step = 3183 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3184 VTC = 3185 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc"); 3186 EndValueFromAdditionalBypass = 3187 emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); 3188 EndValueFromAdditionalBypass->setName("ind.end"); 3189 } 3190 } 3191 // The new PHI merges the original incoming value, in case of a bypass, 3192 // or the value at the end of the vectorized loop. 3193 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3194 3195 // Fix the scalar body counter (PHI node). 3196 // The old induction's phi node in the scalar body needs the truncated 3197 // value. 3198 for (BasicBlock *BB : LoopBypassBlocks) 3199 BCResumeVal->addIncoming(II.getStartValue(), BB); 3200 3201 if (AdditionalBypass.first) 3202 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3203 EndValueFromAdditionalBypass); 3204 3205 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3206 } 3207 } 3208 3209 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) { 3210 // The trip counts should be cached by now. 3211 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 3212 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3213 3214 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3215 3216 // Add a check in the middle block to see if we have completed 3217 // all of the iterations in the first vector loop. Three cases: 3218 // 1) If we require a scalar epilogue, there is no conditional branch as 3219 // we unconditionally branch to the scalar preheader. Do nothing. 3220 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 
3221 // Thus if tail is to be folded, we know we don't need to run the 3222 // remainder and we can use the previous value for the condition (true). 3223 // 3) Otherwise, construct a runtime check. 3224 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3225 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3226 Count, VectorTripCount, "cmp.n", 3227 LoopMiddleBlock->getTerminator()); 3228 3229 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3230 // of the corresponding compare because they may have ended up with 3231 // different line numbers and we want to avoid awkward line stepping while 3232 // debugging. Eg. if the compare has got a line number inside the loop. 3233 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3234 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3235 } 3236 3237 #ifdef EXPENSIVE_CHECKS 3238 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3239 #endif 3240 3241 return LoopVectorPreHeader; 3242 } 3243 3244 std::pair<BasicBlock *, Value *> 3245 InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3246 /* 3247 In this function we generate a new loop. The new loop will contain 3248 the vectorized instructions while the old loop will continue to run the 3249 scalar remainder. 3250 3251 [ ] <-- loop iteration number check. 3252 / | 3253 / v 3254 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3255 | / | 3256 | / v 3257 || [ ] <-- vector pre header. 3258 |/ | 3259 | v 3260 | [ ] \ 3261 | [ ]_| <-- vector loop (created during VPlan execution). 3262 | | 3263 | v 3264 \ -[ ] <--- middle-block. 3265 \/ | 3266 /\ v 3267 | ->[ ] <--- new preheader. 3268 | | 3269 (opt) v <-- edge from middle to exit iff epilogue is not required. 3270 | [ ] \ 3271 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3272 \ | 3273 \ v 3274 >[ ] <-- exit block(s). 3275 ... 3276 */ 3277 3278 // Get the metadata of the original loop before it gets modified. 3279 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3280 3281 // Workaround! Compute the trip count of the original loop and cache it 3282 // before we start modifying the CFG. This code has a systemic problem 3283 // wherein it tries to run analysis over partially constructed IR; this is 3284 // wrong, and not simply for SCEV. The trip count of the original loop 3285 // simply happens to be prone to hitting this in practice. In theory, we 3286 // can hit the same issue for any SCEV, or ValueTracking query done during 3287 // mutation. See PR49900. 3288 getOrCreateTripCount(OrigLoop->getLoopPreheader()); 3289 3290 // Create an empty vector loop, and prepare basic blocks for the runtime 3291 // checks. 3292 createVectorLoopSkeleton(""); 3293 3294 // Now, compare the new count to zero. If it is zero skip the vector loop and 3295 // jump to the scalar loop. This check also covers the case where the 3296 // backedge-taken count is uint##_max: adding one to it will overflow leading 3297 // to an incorrect trip count of zero. In this (rare) case we will also jump 3298 // to the scalar loop. 3299 emitIterationCountCheck(LoopScalarPreHeader); 3300 3301 // Generate the code to check any assumptions that we've made for SCEV 3302 // expressions. 3303 emitSCEVChecks(LoopScalarPreHeader); 3304 3305 // Generate the code that checks in runtime if arrays overlap. We put the 3306 // checks into a separate block to make the more common case of few elements 3307 // faster. 
3308 emitMemRuntimeChecks(LoopScalarPreHeader); 3309 3310 // Emit phis for the new starting index of the scalar loop. 3311 createInductionResumeValues(); 3312 3313 return {completeLoopSkeleton(OrigLoopID), nullptr}; 3314 } 3315 3316 // Fix up external users of the induction variable. At this point, we are 3317 // in LCSSA form, with all external PHIs that use the IV having one input value, 3318 // coming from the remainder loop. We need those PHIs to also have a correct 3319 // value for the IV when arriving directly from the middle block. 3320 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3321 const InductionDescriptor &II, 3322 Value *VectorTripCount, Value *EndValue, 3323 BasicBlock *MiddleBlock, 3324 BasicBlock *VectorHeader, VPlan &Plan) { 3325 // There are two kinds of external IV usages - those that use the value 3326 // computed in the last iteration (the PHI) and those that use the penultimate 3327 // value (the value that feeds into the phi from the loop latch). 3328 // We allow both, but they, obviously, have different values. 3329 3330 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3331 3332 DenseMap<Value *, Value *> MissingVals; 3333 3334 // An external user of the last iteration's value should see the value that 3335 // the remainder loop uses to initialize its own IV. 3336 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3337 for (User *U : PostInc->users()) { 3338 Instruction *UI = cast<Instruction>(U); 3339 if (!OrigLoop->contains(UI)) { 3340 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3341 MissingVals[UI] = EndValue; 3342 } 3343 } 3344 3345 // An external user of the penultimate value need to see EndValue - Step. 3346 // The simplest way to get this is to recompute it from the constituent SCEVs, 3347 // that is Start + (Step * (CRD - 1)). 3348 for (User *U : OrigPhi->users()) { 3349 auto *UI = cast<Instruction>(U); 3350 if (!OrigLoop->contains(UI)) { 3351 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3352 3353 IRBuilder<> B(MiddleBlock->getTerminator()); 3354 3355 // Fast-math-flags propagate from the original induction instruction. 3356 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3357 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3358 3359 Value *CountMinusOne = B.CreateSub( 3360 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); 3361 Value *CMO = 3362 !II.getStep()->getType()->isIntegerTy() 3363 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3364 II.getStep()->getType()) 3365 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3366 CMO->setName("cast.cmo"); 3367 3368 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 3369 VectorHeader->getTerminator()); 3370 Value *Escape = 3371 emitTransformedIndex(B, CMO, II.getStartValue(), Step, II); 3372 Escape->setName("ind.escape"); 3373 MissingVals[UI] = Escape; 3374 } 3375 } 3376 3377 for (auto &I : MissingVals) { 3378 PHINode *PHI = cast<PHINode>(I.first); 3379 // One corner case we have to handle is two IVs "chasing" each-other, 3380 // that is %IV2 = phi [...], [ %IV1, %latch ] 3381 // In this case, if IV1 has an external use, we need to avoid adding both 3382 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3383 // don't already have an incoming value for the middle block. 
3384 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) { 3385 PHI->addIncoming(I.second, MiddleBlock); 3386 Plan.removeLiveOut(PHI); 3387 } 3388 } 3389 } 3390 3391 namespace { 3392 3393 struct CSEDenseMapInfo { 3394 static bool canHandle(const Instruction *I) { 3395 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3396 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3397 } 3398 3399 static inline Instruction *getEmptyKey() { 3400 return DenseMapInfo<Instruction *>::getEmptyKey(); 3401 } 3402 3403 static inline Instruction *getTombstoneKey() { 3404 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3405 } 3406 3407 static unsigned getHashValue(const Instruction *I) { 3408 assert(canHandle(I) && "Unknown instruction!"); 3409 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3410 I->value_op_end())); 3411 } 3412 3413 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3414 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3415 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3416 return LHS == RHS; 3417 return LHS->isIdenticalTo(RHS); 3418 } 3419 }; 3420 3421 } // end anonymous namespace 3422 3423 ///Perform cse of induction variable instructions. 3424 static void cse(BasicBlock *BB) { 3425 // Perform simple cse. 3426 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3427 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3428 if (!CSEDenseMapInfo::canHandle(&In)) 3429 continue; 3430 3431 // Check if we can replace this instruction with any of the 3432 // visited instructions. 3433 if (Instruction *V = CSEMap.lookup(&In)) { 3434 In.replaceAllUsesWith(V); 3435 In.eraseFromParent(); 3436 continue; 3437 } 3438 3439 CSEMap[&In] = &In; 3440 } 3441 } 3442 3443 InstructionCost 3444 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3445 bool &NeedToScalarize) const { 3446 Function *F = CI->getCalledFunction(); 3447 Type *ScalarRetTy = CI->getType(); 3448 SmallVector<Type *, 4> Tys, ScalarTys; 3449 for (auto &ArgOp : CI->args()) 3450 ScalarTys.push_back(ArgOp->getType()); 3451 3452 // Estimate cost of scalarized vector call. The source operands are assumed 3453 // to be vectors, so we need to extract individual elements from there, 3454 // execute VF scalar calls, and then gather the result into the vector return 3455 // value. 3456 InstructionCost ScalarCallCost = 3457 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3458 if (VF.isScalar()) 3459 return ScalarCallCost; 3460 3461 // Compute corresponding vector type for return value and arguments. 3462 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3463 for (Type *ScalarTy : ScalarTys) 3464 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3465 3466 // Compute costs of unpacking argument values for the scalar calls and 3467 // packing the return values to a vector. 3468 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3469 3470 InstructionCost Cost = 3471 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3472 3473 // If we can't emit a vector call for this function, then the currently found 3474 // cost is the cost we need to return. 3475 NeedToScalarize = true; 3476 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3477 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3478 3479 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3480 return Cost; 3481 3482 // If the corresponding vector cost is cheaper, return its cost. 
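  // Worked example with made-up costs: for VF=4, if one scalar call costs 10
  // and the extract/insert overhead is 8, the scalarized estimate computed
  // above is 4 * 10 + 8 = 48; a vector library variant costing, say, 20 is
  // cheaper, so NeedToScalarize is cleared below and 20 is returned.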
3483 InstructionCost VectorCallCost = 3484 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3485 if (VectorCallCost < Cost) { 3486 NeedToScalarize = false; 3487 Cost = VectorCallCost; 3488 } 3489 return Cost; 3490 } 3491 3492 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3493 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3494 return Elt; 3495 return VectorType::get(Elt, VF); 3496 } 3497 3498 InstructionCost 3499 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3500 ElementCount VF) const { 3501 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3502 assert(ID && "Expected intrinsic call!"); 3503 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3504 FastMathFlags FMF; 3505 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3506 FMF = FPMO->getFastMathFlags(); 3507 3508 SmallVector<const Value *> Arguments(CI->args()); 3509 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3510 SmallVector<Type *> ParamTys; 3511 std::transform(FTy->param_begin(), FTy->param_end(), 3512 std::back_inserter(ParamTys), 3513 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3514 3515 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3516 dyn_cast<IntrinsicInst>(CI)); 3517 return TTI.getIntrinsicInstrCost(CostAttrs, 3518 TargetTransformInfo::TCK_RecipThroughput); 3519 } 3520 3521 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3522 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3523 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3524 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3525 } 3526 3527 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3528 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3529 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3530 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3531 } 3532 3533 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3534 // For every instruction `I` in MinBWs, truncate the operands, create a 3535 // truncated version of `I` and reextend its result. InstCombine runs 3536 // later and will remove any ext/trunc pairs. 3537 SmallPtrSet<Value *, 4> Erased; 3538 for (const auto &KV : Cost->getMinimalBitwidths()) { 3539 // If the value wasn't vectorized, we must maintain the original scalar 3540 // type. The absence of the value from State indicates that it 3541 // wasn't vectorized. 3542 // FIXME: Should not rely on getVPValue at this point. 3543 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3544 if (!State.hasAnyVectorValue(Def)) 3545 continue; 3546 for (unsigned Part = 0; Part < UF; ++Part) { 3547 Value *I = State.get(Def, Part); 3548 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3549 continue; 3550 Type *OriginalTy = I->getType(); 3551 Type *ScalarTruncatedTy = 3552 IntegerType::get(OriginalTy->getContext(), KV.second); 3553 auto *TruncatedTy = VectorType::get( 3554 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3555 if (TruncatedTy == OriginalTy) 3556 continue; 3557 3558 IRBuilder<> B(cast<Instruction>(I)); 3559 auto ShrinkOperand = [&](Value *V) -> Value * { 3560 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3561 if (ZI->getSrcTy() == TruncatedTy) 3562 return ZI->getOperand(0); 3563 return B.CreateZExtOrTrunc(V, TruncatedTy); 3564 }; 3565 3566 // The actual instruction modification depends on the instruction type, 3567 // unfortunately. 
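      // Illustrative sketch for a binary operator whose minimal bit width is 8
      // (types and value names are examples only):
      //   %a = add <4 x i32> %x, %y
      // becomes
      //   %x.tr = trunc <4 x i32> %x to <4 x i8>
      //   %y.tr = trunc <4 x i32> %y to <4 x i8>
      //   %a.tr = add <4 x i8> %x.tr, %y.tr
      //   %a.ex = zext <4 x i8> %a.tr to <4 x i32>  ; replaces all uses of %a
      // with the zext/trunc pairs expected to be cleaned up by InstCombine.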
3568 Value *NewI = nullptr; 3569 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3570 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3571 ShrinkOperand(BO->getOperand(1))); 3572 3573 // Any wrapping introduced by shrinking this operation shouldn't be 3574 // considered undefined behavior. So, we can't unconditionally copy 3575 // arithmetic wrapping flags to NewI. 3576 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3577 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3578 NewI = 3579 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3580 ShrinkOperand(CI->getOperand(1))); 3581 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3582 NewI = B.CreateSelect(SI->getCondition(), 3583 ShrinkOperand(SI->getTrueValue()), 3584 ShrinkOperand(SI->getFalseValue())); 3585 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3586 switch (CI->getOpcode()) { 3587 default: 3588 llvm_unreachable("Unhandled cast!"); 3589 case Instruction::Trunc: 3590 NewI = ShrinkOperand(CI->getOperand(0)); 3591 break; 3592 case Instruction::SExt: 3593 NewI = B.CreateSExtOrTrunc( 3594 CI->getOperand(0), 3595 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3596 break; 3597 case Instruction::ZExt: 3598 NewI = B.CreateZExtOrTrunc( 3599 CI->getOperand(0), 3600 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3601 break; 3602 } 3603 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3604 auto Elements0 = 3605 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3606 auto *O0 = B.CreateZExtOrTrunc( 3607 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3608 auto Elements1 = 3609 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3610 auto *O1 = B.CreateZExtOrTrunc( 3611 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3612 3613 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3614 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3615 // Don't do anything with the operands, just extend the result. 3616 continue; 3617 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3618 auto Elements = 3619 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3620 auto *O0 = B.CreateZExtOrTrunc( 3621 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3622 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3623 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3624 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3625 auto Elements = 3626 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3627 auto *O0 = B.CreateZExtOrTrunc( 3628 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3629 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3630 } else { 3631 // If we don't know what to do, be conservative and don't do anything. 3632 continue; 3633 } 3634 3635 // Lastly, extend the result. 3636 NewI->takeName(cast<Instruction>(I)); 3637 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3638 I->replaceAllUsesWith(Res); 3639 cast<Instruction>(I)->eraseFromParent(); 3640 Erased.insert(I); 3641 State.reset(Def, Res, Part); 3642 } 3643 } 3644 3645 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3646 for (const auto &KV : Cost->getMinimalBitwidths()) { 3647 // If the value wasn't vectorized, we must maintain the original scalar 3648 // type. The absence of the value from State indicates that it 3649 // wasn't vectorized. 3650 // FIXME: Should not rely on getVPValue at this point. 
3651 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3652 if (!State.hasAnyVectorValue(Def)) 3653 continue; 3654 for (unsigned Part = 0; Part < UF; ++Part) { 3655 Value *I = State.get(Def, Part); 3656 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3657 if (Inst && Inst->use_empty()) { 3658 Value *NewI = Inst->getOperand(0); 3659 Inst->eraseFromParent(); 3660 State.reset(Def, NewI, Part); 3661 } 3662 } 3663 } 3664 } 3665 3666 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, 3667 VPlan &Plan) { 3668 // Insert truncates and extends for any truncated instructions as hints to 3669 // InstCombine. 3670 if (VF.isVector()) 3671 truncateToMinimalBitwidths(State); 3672 3673 // Fix widened non-induction PHIs by setting up the PHI operands. 3674 if (OrigPHIsToFix.size()) { 3675 assert(EnableVPlanNativePath && 3676 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3677 fixNonInductionPHIs(State); 3678 } 3679 3680 // At this point every instruction in the original loop is widened to a 3681 // vector form. Now we need to fix the recurrences in the loop. These PHI 3682 // nodes are currently empty because we did not want to introduce cycles. 3683 // This is the second stage of vectorizing recurrences. 3684 fixCrossIterationPHIs(State); 3685 3686 // Forget the original basic block. 3687 PSE.getSE()->forgetLoop(OrigLoop); 3688 3689 VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); 3690 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); 3691 if (Cost->requiresScalarEpilogue(VF)) { 3692 // No edge from the middle block to the unique exit block has been inserted 3693 // and there is nothing to fix from vector loop; phis should have incoming 3694 // from scalar loop only. 3695 Plan.clearLiveOuts(); 3696 } else { 3697 // If we inserted an edge from the middle block to the unique exit block, 3698 // update uses outside the loop (phis) to account for the newly inserted 3699 // edge. 3700 3701 // Fix-up external users of the induction variables. 3702 for (auto &Entry : Legal->getInductionVars()) 3703 fixupIVUsers(Entry.first, Entry.second, 3704 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), 3705 IVEndValues[Entry.first], LoopMiddleBlock, 3706 VectorLoop->getHeader(), Plan); 3707 } 3708 3709 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated 3710 // in the exit block, so update the builder. 3711 State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI()); 3712 for (auto &KV : Plan.getLiveOuts()) 3713 KV.second->fixPhi(Plan, State); 3714 3715 for (Instruction *PI : PredicatedInstructions) 3716 sinkScalarOperands(&*PI); 3717 3718 // Remove redundant induction instructions. 3719 cse(VectorLoop->getHeader()); 3720 3721 // Set/update profile weights for the vector and remainder loops as original 3722 // loop iterations are now distributed among them. Note that original loop 3723 // represented by LoopScalarBody becomes remainder loop after vectorization. 3724 // 3725 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3726 // end up getting slightly roughened result but that should be OK since 3727 // profile is not inherently precise anyway. Note also possible bypass of 3728 // vector code caused by legality checks is ignored, assigning all the weight 3729 // to the vector loop, optimistically. 
3730 // 3731 // For scalable vectorization we can't know at compile time how many iterations 3732 // of the loop are handled in one vector iteration, so instead assume a pessimistic 3733 // vscale of '1'. 3734 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop, 3735 LI->getLoopFor(LoopScalarBody), 3736 VF.getKnownMinValue() * UF); 3737 } 3738 3739 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 3740 // In order to support recurrences we need to be able to vectorize Phi nodes. 3741 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3742 // stage #2: We now need to fix the recurrences by adding incoming edges to 3743 // the currently empty PHI nodes. At this point every instruction in the 3744 // original loop is widened to a vector form so we can use them to construct 3745 // the incoming edges. 3746 VPBasicBlock *Header = 3747 State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); 3748 for (VPRecipeBase &R : Header->phis()) { 3749 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 3750 fixReduction(ReductionPhi, State); 3751 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 3752 fixFirstOrderRecurrence(FOR, State); 3753 } 3754 } 3755 3756 void InnerLoopVectorizer::fixFirstOrderRecurrence( 3757 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { 3758 // This is the second phase of vectorizing first-order recurrences. An 3759 // overview of the transformation is described below. Suppose we have the 3760 // following loop. 3761 // 3762 // for (int i = 0; i < n; ++i) 3763 // b[i] = a[i] - a[i - 1]; 3764 // 3765 // There is a first-order recurrence on "a". For this loop, the shorthand 3766 // scalar IR looks like: 3767 // 3768 // scalar.ph: 3769 // s_init = a[-1] 3770 // br scalar.body 3771 // 3772 // scalar.body: 3773 // i = phi [0, scalar.ph], [i+1, scalar.body] 3774 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3775 // s2 = a[i] 3776 // b[i] = s2 - s1 3777 // br cond, scalar.body, ... 3778 // 3779 // In this example, s1 is a recurrence because it's value depends on the 3780 // previous iteration. In the first phase of vectorization, we created a 3781 // vector phi v1 for s1. We now complete the vectorization and produce the 3782 // shorthand vector IR shown below (for VF = 4, UF = 1). 3783 // 3784 // vector.ph: 3785 // v_init = vector(..., ..., ..., a[-1]) 3786 // br vector.body 3787 // 3788 // vector.body 3789 // i = phi [0, vector.ph], [i+4, vector.body] 3790 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3791 // v2 = a[i, i+1, i+2, i+3]; 3792 // v3 = vector(v1(3), v2(0, 1, 2)) 3793 // b[i, i+1, i+2, i+3] = v2 - v3 3794 // br cond, vector.body, middle.block 3795 // 3796 // middle.block: 3797 // x = v2(3) 3798 // br scalar.ph 3799 // 3800 // scalar.ph: 3801 // s_init = phi [x, middle.block], [a[-1], otherwise] 3802 // br scalar.body 3803 // 3804 // After execution completes the vector loop, we extract the next value of 3805 // the recurrence (x) to use as the initial value in the scalar loop. 3806 3807 // Extract the last vector element in the middle block. This will be the 3808 // initial value for the recurrence when jumping to the scalar loop. 
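  // E.g. (sketch for VF=4, UF=2; value names are illustrative): the recurrence
  // value of the last unrolled part is %v2.1, and the value live into the
  // scalar loop becomes
  //   %vector.recur.extract = extractelement <4 x i32> %v2.1, i32 3
  // For scalable VF the lane index is computed at runtime as RuntimeVF - 1.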
3809   VPValue *PreviousDef = PhiR->getBackedgeValue();
3810   Value *Incoming = State.get(PreviousDef, UF - 1);
3811   auto *ExtractForScalar = Incoming;
3812   auto *IdxTy = Builder.getInt32Ty();
3813   if (VF.isVector()) {
3814     auto *One = ConstantInt::get(IdxTy, 1);
3815     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3816     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3817     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3818     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3819                                                     "vector.recur.extract");
3820   }
3821   // Extract the second-to-last element in the middle block if the
3822   // Phi is used outside the loop. We need to extract the phi itself
3823   // and not the last element (the phi update in the current iteration). This
3824   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3825   // when the scalar loop is not run at all.
3826   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3827   if (VF.isVector()) {
3828     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3829     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3830     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3831         Incoming, Idx, "vector.recur.extract.for.phi");
3832   } else if (UF > 1)
3833     // When the loop is unrolled without vectorizing, initialize
3834     // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
3835     // value of `Incoming`. This is analogous to the vectorized case above:
3836     // extracting the second-to-last element when VF > 1.
3837     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3838
3839   // Fix the initial value of the original recurrence in the scalar loop.
3840   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3841   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3842   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3843   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3844   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3845     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3846     Start->addIncoming(Incoming, BB);
3847   }
3848
3849   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3850   Phi->setName("scalar.recur");
3851
3852   // Finally, fix users of the recurrence outside the loop. The users will need
3853   // either the last value of the scalar recurrence or the last value of the
3854   // vector recurrence we extracted in the middle block. Since the loop is in
3855   // LCSSA form, we just need to find all the phi nodes for the original scalar
3856   // recurrence in the exit block, and then add an edge for the middle block.
3857   // Note that LCSSA does not imply single entry when the original scalar loop
3858   // had multiple exiting edges (as we always run the last iteration in the
3859   // scalar epilogue); in that case, there is no edge from middle to exit and
3860   // thus no phis which need updating.
3861   if (!Cost->requiresScalarEpilogue(VF))
3862     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3863       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3864         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3865         State.Plan->removeLiveOut(&LCSSAPhi);
3866       }
3867 }
3868
3869 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3870                                        VPTransformState &State) {
3871   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
3872   // Get its reduction variable descriptor.
3873 assert(Legal->isReductionVariable(OrigPhi) && 3874 "Unable to find the reduction variable"); 3875 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 3876 3877 RecurKind RK = RdxDesc.getRecurrenceKind(); 3878 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3879 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3880 setDebugLocFromInst(ReductionStartValue); 3881 3882 VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); 3883 // This is the vector-clone of the value that leaves the loop. 3884 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 3885 3886 // Wrap flags are in general invalid after vectorization, clear them. 3887 clearReductionWrapFlags(PhiR, State); 3888 3889 // Before each round, move the insertion point right between 3890 // the PHIs and the values we are going to write. 3891 // This allows us to write both PHINodes and the extractelement 3892 // instructions. 3893 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3894 3895 setDebugLocFromInst(LoopExitInst); 3896 3897 Type *PhiTy = OrigPhi->getType(); 3898 3899 VPBasicBlock *LatchVPBB = 3900 PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock(); 3901 BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB]; 3902 // If tail is folded by masking, the vector value to leave the loop should be 3903 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3904 // instead of the former. For an inloop reduction the reduction will already 3905 // be predicated, and does not need to be handled here. 3906 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 3907 for (unsigned Part = 0; Part < UF; ++Part) { 3908 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 3909 SelectInst *Sel = nullptr; 3910 for (User *U : VecLoopExitInst->users()) { 3911 if (isa<SelectInst>(U)) { 3912 assert(!Sel && "Reduction exit feeding two selects"); 3913 Sel = cast<SelectInst>(U); 3914 } else 3915 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3916 } 3917 assert(Sel && "Reduction exit feeds no select"); 3918 State.reset(LoopExitInstDef, Sel, Part); 3919 3920 if (isa<FPMathOperator>(Sel)) 3921 Sel->setFastMathFlags(RdxDesc.getFastMathFlags()); 3922 3923 // If the target can create a predicated operator for the reduction at no 3924 // extra cost in the loop (for example a predicated vadd), it can be 3925 // cheaper for the select to remain in the loop than be sunk out of it, 3926 // and so use the select value for the phi instead of the old 3927 // LoopExitValue. 3928 if (PreferPredicatedReductionSelect || 3929 TTI->preferPredicatedReductionSelect( 3930 RdxDesc.getOpcode(), PhiTy, 3931 TargetTransformInfo::ReductionFlags())) { 3932 auto *VecRdxPhi = 3933 cast<PHINode>(State.get(PhiR, Part)); 3934 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel); 3935 } 3936 } 3937 } 3938 3939 // If the vector reduction can be performed in a smaller type, we truncate 3940 // then extend the loop exit value to enable InstCombine to evaluate the 3941 // entire expression in the smaller type. 
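  // Illustrative sketch (types and names are examples): for an i32 add
  // reduction whose recurrence type was narrowed to i8, each unrolled part
  // %rdx.part is rewritten in the vector loop latch as
  //   %t = trunc <4 x i32> %rdx.part to <4 x i8>
  //   %e = zext <4 x i8> %t to <4 x i32>   ; zext, or sext for a signed rdx
  // with in-loop users switched to %e, and the parts are truncated back to
  // <4 x i8> in the middle block before the final reduction.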
3942 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 3943 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 3944 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3945 Builder.SetInsertPoint(VectorLoopLatch->getTerminator()); 3946 VectorParts RdxParts(UF); 3947 for (unsigned Part = 0; Part < UF; ++Part) { 3948 RdxParts[Part] = State.get(LoopExitInstDef, Part); 3949 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3950 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3951 : Builder.CreateZExt(Trunc, VecTy); 3952 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 3953 if (U != Trunc) { 3954 U->replaceUsesOfWith(RdxParts[Part], Extnd); 3955 RdxParts[Part] = Extnd; 3956 } 3957 } 3958 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3959 for (unsigned Part = 0; Part < UF; ++Part) { 3960 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3961 State.reset(LoopExitInstDef, RdxParts[Part], Part); 3962 } 3963 } 3964 3965 // Reduce all of the unrolled parts into a single vector. 3966 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 3967 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 3968 3969 // The middle block terminator has already been assigned a DebugLoc here (the 3970 // OrigLoop's single latch terminator). We want the whole middle block to 3971 // appear to execute on this line because: (a) it is all compiler generated, 3972 // (b) these instructions are always executed after evaluating the latch 3973 // conditional branch, and (c) other passes may add new predecessors which 3974 // terminate on this line. This is the easiest way to ensure we don't 3975 // accidentally cause an extra step back into the loop while debugging. 3976 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 3977 if (PhiR->isOrdered()) 3978 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 3979 else { 3980 // Floating-point operations should have some FMF to enable the reduction. 3981 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 3982 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 3983 for (unsigned Part = 1; Part < UF; ++Part) { 3984 Value *RdxPart = State.get(LoopExitInstDef, Part); 3985 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 3986 ReducedPartRdx = Builder.CreateBinOp( 3987 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 3988 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 3989 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 3990 ReducedPartRdx, RdxPart); 3991 else 3992 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 3993 } 3994 } 3995 3996 // Create the reduction after the loop. Note that inloop reductions create the 3997 // target reduction in the loop using a Reduction recipe. 3998 if (VF.isVector() && !PhiR->isInLoop()) { 3999 ReducedPartRdx = 4000 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 4001 // If the reduction can be performed in a smaller type, we need to extend 4002 // the reduction to the wider type before we branch to the original loop. 4003 if (PhiTy != RdxDesc.getRecurrenceType()) 4004 ReducedPartRdx = RdxDesc.isSigned() 4005 ? 
Builder.CreateSExt(ReducedPartRdx, PhiTy) 4006 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4007 } 4008 4009 PHINode *ResumePhi = 4010 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); 4011 4012 // Create a phi node that merges control-flow from the backedge-taken check 4013 // block and the middle block. 4014 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4015 LoopScalarPreHeader->getTerminator()); 4016 4017 // If we are fixing reductions in the epilogue loop then we should already 4018 // have created a bc.merge.rdx Phi after the main vector body. Ensure that 4019 // we carry over the incoming values correctly. 4020 for (auto *Incoming : predecessors(LoopScalarPreHeader)) { 4021 if (Incoming == LoopMiddleBlock) 4022 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming); 4023 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming)) 4024 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), 4025 Incoming); 4026 else 4027 BCBlockPhi->addIncoming(ReductionStartValue, Incoming); 4028 } 4029 4030 // Set the resume value for this reduction. 4031 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); 4032 4033 // If there were stores of the reduction value to a uniform memory address 4034 // inside the loop, create the final store here. 4035 if (StoreInst *SI = RdxDesc.IntermediateStore) { 4036 StoreInst *NewSI = 4037 Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand()); 4038 propagateMetadata(NewSI, SI); 4039 4040 // If the reduction value is used in other places, 4041 // then let the code below create PHIs for that. 4042 } 4043 4044 // Now, we need to fix the users of the reduction variable 4045 // inside and outside of the scalar remainder loop. 4046 4047 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4048 // in the exit blocks. See comment on analogous loop in 4049 // fixFirstOrderRecurrence for a more complete explanation of the logic. 4050 if (!Cost->requiresScalarEpilogue(VF)) 4051 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4052 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) { 4053 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4054 State.Plan->removeLiveOut(&LCSSAPhi); 4055 } 4056 4057 // Fix the scalar loop reduction variable with the incoming reduction sum 4058 // from the vector body and from the backedge value. 4059 int IncomingEdgeBlockIdx = 4060 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4061 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4062 // Pick the other block. 4063 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ?
0 : 1); 4064 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4065 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4066 } 4067 4068 void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, 4069 VPTransformState &State) { 4070 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 4071 RecurKind RK = RdxDesc.getRecurrenceKind(); 4072 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4073 return; 4074 4075 SmallVector<VPValue *, 8> Worklist; 4076 SmallPtrSet<VPValue *, 8> Visited; 4077 Worklist.push_back(PhiR); 4078 Visited.insert(PhiR); 4079 4080 while (!Worklist.empty()) { 4081 VPValue *Cur = Worklist.pop_back_val(); 4082 for (unsigned Part = 0; Part < UF; ++Part) { 4083 Value *V = State.get(Cur, Part); 4084 if (!isa<OverflowingBinaryOperator>(V)) 4085 break; 4086 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4087 } 4088 4089 for (VPUser *U : Cur->users()) { 4090 auto *UserRecipe = dyn_cast<VPRecipeBase>(U); 4091 if (!UserRecipe) 4092 continue; 4093 for (VPValue *V : UserRecipe->definedValues()) 4094 if (Visited.insert(V).second) 4095 Worklist.push_back(V); 4096 } 4097 } 4098 } 4099 4100 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4101 // The basic block and loop containing the predicated instruction. 4102 auto *PredBB = PredInst->getParent(); 4103 auto *VectorLoop = LI->getLoopFor(PredBB); 4104 4105 // Initialize a worklist with the operands of the predicated instruction. 4106 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4107 4108 // Holds instructions that we need to analyze again. An instruction may be 4109 // reanalyzed if we don't yet know if we can sink it or not. 4110 SmallVector<Instruction *, 8> InstsToReanalyze; 4111 4112 // Returns true if a given use occurs in the predicated block. Phi nodes use 4113 // their operands in their corresponding predecessor blocks. 4114 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4115 auto *I = cast<Instruction>(U.getUser()); 4116 BasicBlock *BB = I->getParent(); 4117 if (auto *Phi = dyn_cast<PHINode>(I)) 4118 BB = Phi->getIncomingBlock( 4119 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4120 return BB == PredBB; 4121 }; 4122 4123 // Iteratively sink the scalarized operands of the predicated instruction 4124 // into the block we created for it. When an instruction is sunk, it's 4125 // operands are then added to the worklist. The algorithm ends after one pass 4126 // through the worklist doesn't sink a single instruction. 4127 bool Changed; 4128 do { 4129 // Add the instructions that need to be reanalyzed to the worklist, and 4130 // reset the changed indicator. 4131 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4132 InstsToReanalyze.clear(); 4133 Changed = false; 4134 4135 while (!Worklist.empty()) { 4136 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4137 4138 // We can't sink an instruction if it is a phi node, is not in the loop, 4139 // or may have side effects. 4140 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4141 I->mayHaveSideEffects()) 4142 continue; 4143 4144 // If the instruction is already in PredBB, check if we can sink its 4145 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4146 // sinking the scalar instruction I, hence it appears in PredBB; but it 4147 // may have failed to sink I's operands (recursively), which we try 4148 // (again) here. 
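// Illustrative example (hypothetical IR): if VPlan already sank a scalarized
// predicated load into PredBB but its address GEP still lives outside, the
// load is found here already in PredBB, so only the GEP (its operand) is
// re-queued and may itself be sunk on a later pass of the worklist.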
4149 if (I->getParent() == PredBB) { 4150 Worklist.insert(I->op_begin(), I->op_end()); 4151 continue; 4152 } 4153 4154 // It's legal to sink the instruction if all its uses occur in the 4155 // predicated block. Otherwise, there's nothing to do yet, and we may 4156 // need to reanalyze the instruction. 4157 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4158 InstsToReanalyze.push_back(I); 4159 continue; 4160 } 4161 4162 // Move the instruction to the beginning of the predicated block, and add 4163 // its operands to the worklist. 4164 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4165 Worklist.insert(I->op_begin(), I->op_end()); 4166 4167 // The sinking may have enabled other instructions to be sunk, so we will 4168 // need to iterate. 4169 Changed = true; 4170 } 4171 } while (Changed); 4172 } 4173 4174 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4175 for (PHINode *OrigPhi : OrigPHIsToFix) { 4176 VPWidenPHIRecipe *VPPhi = 4177 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4178 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4179 // Make sure the builder has a valid insert point. 4180 Builder.SetInsertPoint(NewPhi); 4181 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4182 VPValue *Inc = VPPhi->getIncomingValue(i); 4183 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4184 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4185 } 4186 } 4187 } 4188 4189 bool InnerLoopVectorizer::useOrderedReductions( 4190 const RecurrenceDescriptor &RdxDesc) { 4191 return Cost->useOrderedReductions(RdxDesc); 4192 } 4193 4194 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4195 VPWidenPHIRecipe *PhiR, 4196 VPTransformState &State) { 4197 assert(EnableVPlanNativePath && 4198 "Non-native vplans are not expected to have VPWidenPHIRecipes."); 4199 // Currently we enter here in the VPlan-native path for non-induction 4200 // PHIs where all control flow is uniform. We simply widen these PHIs. 4201 // Create a vector phi with no operands - the vector phi operands will be 4202 // set at the end of vector code generation. 4203 Type *VecTy = (State.VF.isScalar()) 4204 ? PN->getType() 4205 : VectorType::get(PN->getType(), State.VF); 4206 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4207 State.set(PhiR, VecPhi, 0); 4208 OrigPHIsToFix.push_back(cast<PHINode>(PN)); 4209 } 4210 4211 /// A helper function for checking whether an integer division-related 4212 /// instruction may divide by zero (in which case it must be predicated if 4213 /// executed conditionally in the scalar code). 4214 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4215 /// Non-zero divisors that are non-compile-time constants will not be 4216 /// converted into multiplication, so we will still end up scalarizing 4217 /// the division, but can do so w/o predication.
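// Illustrative example (hypothetical source): in
//   if (b[i] != 0) sum += a[i] / b[i];
// the sdiv only executes under the guard in the scalar loop, so executing it
// unconditionally after widening could trap; it must therefore be predicated
// unless the divisor is a known non-zero constant.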
4218 static bool mayDivideByZero(Instruction &I) { 4219 assert((I.getOpcode() == Instruction::UDiv || 4220 I.getOpcode() == Instruction::SDiv || 4221 I.getOpcode() == Instruction::URem || 4222 I.getOpcode() == Instruction::SRem) && 4223 "Unexpected instruction"); 4224 Value *Divisor = I.getOperand(1); 4225 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4226 return !CInt || CInt->isZero(); 4227 } 4228 4229 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4230 VPUser &ArgOperands, 4231 VPTransformState &State) { 4232 assert(!isa<DbgInfoIntrinsic>(I) && 4233 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4234 setDebugLocFromInst(&I); 4235 4236 Module *M = I.getParent()->getParent()->getParent(); 4237 auto *CI = cast<CallInst>(&I); 4238 4239 SmallVector<Type *, 4> Tys; 4240 for (Value *ArgOperand : CI->args()) 4241 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4242 4243 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4244 4245 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4246 // version of the instruction. 4247 // Is it beneficial to perform intrinsic call compared to lib call? 4248 bool NeedToScalarize = false; 4249 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4250 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4251 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4252 assert((UseVectorIntrinsic || !NeedToScalarize) && 4253 "Instruction should be scalarized elsewhere."); 4254 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4255 "Either the intrinsic cost or vector call cost must be valid"); 4256 4257 for (unsigned Part = 0; Part < UF; ++Part) { 4258 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4259 SmallVector<Value *, 4> Args; 4260 for (auto &I : enumerate(ArgOperands.operands())) { 4261 // Some intrinsics have a scalar argument - don't replace it with a 4262 // vector. 4263 Value *Arg; 4264 if (!UseVectorIntrinsic || 4265 !isVectorIntrinsicWithScalarOpAtArg(ID, I.index())) 4266 Arg = State.get(I.value(), Part); 4267 else 4268 Arg = State.get(I.value(), VPIteration(0, 0)); 4269 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index())) 4270 TysForDecl.push_back(Arg->getType()); 4271 Args.push_back(Arg); 4272 } 4273 4274 Function *VectorF; 4275 if (UseVectorIntrinsic) { 4276 // Use vector version of the intrinsic. 4277 if (VF.isVector()) 4278 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4279 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4280 assert(VectorF && "Can't retrieve vector intrinsic."); 4281 } else { 4282 // Use vector version of the function call. 4283 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4284 #ifndef NDEBUG 4285 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4286 "Can't create vector function."); 4287 #endif 4288 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4289 } 4290 SmallVector<OperandBundleDef, 1> OpBundles; 4291 CI->getOperandBundlesAsDefs(OpBundles); 4292 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4293 4294 if (isa<FPMathOperator>(V)) 4295 V->copyFastMathFlags(CI); 4296 4297 State.set(Def, V, Part); 4298 addMetadata(V, &I); 4299 } 4300 } 4301 4302 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4303 // We should not collect Scalars more than once per VF. 
Right now, this 4304 // function is called from collectUniformsAndScalars(), which already does 4305 // this check. Collecting Scalars for VF=1 does not make any sense. 4306 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4307 "This function should not be visited twice for the same VF"); 4308 4309 // This avoids any chances of creating a REPLICATE recipe during planning 4310 // since that would result in generation of scalarized code during execution, 4311 // which is not supported for scalable vectors. 4312 if (VF.isScalable()) { 4313 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4314 return; 4315 } 4316 4317 SmallSetVector<Instruction *, 8> Worklist; 4318 4319 // These sets are used to seed the analysis with pointers used by memory 4320 // accesses that will remain scalar. 4321 SmallSetVector<Instruction *, 8> ScalarPtrs; 4322 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4323 auto *Latch = TheLoop->getLoopLatch(); 4324 4325 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4326 // The pointer operands of loads and stores will be scalar as long as the 4327 // memory access is not a gather or scatter operation. The value operand of a 4328 // store will remain scalar if the store is scalarized. 4329 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4330 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4331 assert(WideningDecision != CM_Unknown && 4332 "Widening decision should be ready at this moment"); 4333 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4334 if (Ptr == Store->getValueOperand()) 4335 return WideningDecision == CM_Scalarize; 4336 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4337 "Ptr is neither a value or pointer operand"); 4338 return WideningDecision != CM_GatherScatter; 4339 }; 4340 4341 // A helper that returns true if the given value is a bitcast or 4342 // getelementptr instruction contained in the loop. 4343 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4344 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4345 isa<GetElementPtrInst>(V)) && 4346 !TheLoop->isLoopInvariant(V); 4347 }; 4348 4349 // A helper that evaluates a memory access's use of a pointer. If the use will 4350 // be a scalar use and the pointer is only used by memory accesses, we place 4351 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4352 // PossibleNonScalarPtrs. 4353 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4354 // We only care about bitcast and getelementptr instructions contained in 4355 // the loop. 4356 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4357 return; 4358 4359 // If the pointer has already been identified as scalar (e.g., if it was 4360 // also identified as uniform), there's nothing to do. 4361 auto *I = cast<Instruction>(Ptr); 4362 if (Worklist.count(I)) 4363 return; 4364 4365 // If the use of the pointer will be a scalar use, and all users of the 4366 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4367 // place the pointer in PossibleNonScalarPtrs. 
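// Illustrative example (hypothetical IR): a loop-varying GEP whose only users
// are consecutive loads/stores is placed in ScalarPtrs; the same GEP would go
// to PossibleNonScalarPtrs if it also fed a non-memory user such as a
// ptrtoint, or if the access were a gather/scatter.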
4368 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4369 return isa<LoadInst>(U) || isa<StoreInst>(U); 4370 })) 4371 ScalarPtrs.insert(I); 4372 else 4373 PossibleNonScalarPtrs.insert(I); 4374 }; 4375 4376 // We seed the scalars analysis with two classes of instructions: (1) 4377 // instructions marked uniform-after-vectorization and (2) bitcast, 4378 // getelementptr and (pointer) phi instructions used by memory accesses 4379 // requiring a scalar use. 4380 // 4381 // (1) Add to the worklist all instructions that have been identified as 4382 // uniform-after-vectorization. 4383 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4384 4385 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4386 // memory accesses requiring a scalar use. The pointer operands of loads and 4387 // stores will be scalar as long as the memory access is not a gather or 4388 // scatter operation. The value operand of a store will remain scalar if the 4389 // store is scalarized. 4390 for (auto *BB : TheLoop->blocks()) 4391 for (auto &I : *BB) { 4392 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4393 evaluatePtrUse(Load, Load->getPointerOperand()); 4394 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4395 evaluatePtrUse(Store, Store->getPointerOperand()); 4396 evaluatePtrUse(Store, Store->getValueOperand()); 4397 } 4398 } 4399 for (auto *I : ScalarPtrs) 4400 if (!PossibleNonScalarPtrs.count(I)) { 4401 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4402 Worklist.insert(I); 4403 } 4404 4405 // Insert the forced scalars. 4406 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4407 // induction variable when the PHI user is scalarized. 4408 auto ForcedScalar = ForcedScalars.find(VF); 4409 if (ForcedScalar != ForcedScalars.end()) 4410 for (auto *I : ForcedScalar->second) 4411 Worklist.insert(I); 4412 4413 // Expand the worklist by looking through any bitcasts and getelementptr 4414 // instructions we've already identified as scalar. This is similar to the 4415 // expansion step in collectLoopUniforms(); however, here we're only 4416 // expanding to include additional bitcasts and getelementptr instructions. 4417 unsigned Idx = 0; 4418 while (Idx != Worklist.size()) { 4419 Instruction *Dst = Worklist[Idx++]; 4420 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4421 continue; 4422 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4423 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4424 auto *J = cast<Instruction>(U); 4425 return !TheLoop->contains(J) || Worklist.count(J) || 4426 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4427 isScalarUse(J, Src)); 4428 })) { 4429 Worklist.insert(Src); 4430 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4431 } 4432 } 4433 4434 // An induction variable will remain scalar if all users of the induction 4435 // variable and induction variable update remain scalar. 4436 for (auto &Induction : Legal->getInductionVars()) { 4437 auto *Ind = Induction.first; 4438 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4439 4440 // If tail-folding is applied, the primary induction variable will be used 4441 // to feed a vector compare. 4442 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4443 continue; 4444 4445 // Returns true if \p Indvar is a pointer induction that is used directly by 4446 // load/store instruction \p I.
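// Illustrative example (hypothetical IR): a pointer induction %p.iv used as
//   store i32 %x, i32* %p.iv
// where %p.iv is the access's pointer operand and the access is not a
// gather/scatter counts as a direct (scalar) use of the pointer induction.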
4447 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4448 Instruction *I) { 4449 return Induction.second.getKind() == 4450 InductionDescriptor::IK_PtrInduction && 4451 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4452 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4453 }; 4454 4455 // Determine if all users of the induction variable are scalar after 4456 // vectorization. 4457 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4458 auto *I = cast<Instruction>(U); 4459 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4460 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4461 }); 4462 if (!ScalarInd) 4463 continue; 4464 4465 // Determine if all users of the induction variable update instruction are 4466 // scalar after vectorization. 4467 auto ScalarIndUpdate = 4468 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4469 auto *I = cast<Instruction>(U); 4470 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4471 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4472 }); 4473 if (!ScalarIndUpdate) 4474 continue; 4475 4476 // The induction variable and its update instruction will remain scalar. 4477 Worklist.insert(Ind); 4478 Worklist.insert(IndUpdate); 4479 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4480 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4481 << "\n"); 4482 } 4483 4484 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4485 } 4486 4487 bool LoopVectorizationCostModel::isScalarWithPredication( 4488 Instruction *I, ElementCount VF) const { 4489 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4490 return false; 4491 switch(I->getOpcode()) { 4492 default: 4493 break; 4494 case Instruction::Load: 4495 case Instruction::Store: { 4496 if (!Legal->isMaskRequired(I)) 4497 return false; 4498 auto *Ptr = getLoadStorePointerOperand(I); 4499 auto *Ty = getLoadStoreType(I); 4500 Type *VTy = Ty; 4501 if (VF.isVector()) 4502 VTy = VectorType::get(Ty, VF); 4503 const Align Alignment = getLoadStoreAlignment(I); 4504 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4505 TTI.isLegalMaskedGather(VTy, Alignment)) 4506 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4507 TTI.isLegalMaskedScatter(VTy, Alignment)); 4508 } 4509 case Instruction::UDiv: 4510 case Instruction::SDiv: 4511 case Instruction::SRem: 4512 case Instruction::URem: 4513 return mayDivideByZero(*I); 4514 } 4515 return false; 4516 } 4517 4518 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4519 Instruction *I, ElementCount VF) { 4520 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4521 assert(getWideningDecision(I, VF) == CM_Unknown && 4522 "Decision should not be set yet."); 4523 auto *Group = getInterleavedAccessGroup(I); 4524 assert(Group && "Must have a group."); 4525 4526 // If the instruction's allocated size doesn't equal it's type size, it 4527 // requires padding and will be scalarized. 4528 auto &DL = I->getModule()->getDataLayout(); 4529 auto *ScalarTy = getLoadStoreType(I); 4530 if (hasIrregularType(ScalarTy, DL)) 4531 return false; 4532 4533 // If the group involves a non-integral pointer, we may not be able to 4534 // losslessly cast all values to a common type. 
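// Illustrative example (hypothetical datalayout): with "ni:1" marking address
// space 1 as non-integral, a group mixing an i64 member with a member of type
// i8 addrspace(1)* is rejected below, as is a group whose non-integral
// pointer members live in different address spaces.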
4535 unsigned InterleaveFactor = Group->getFactor(); 4536 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); 4537 for (unsigned i = 0; i < InterleaveFactor; i++) { 4538 Instruction *Member = Group->getMember(i); 4539 if (!Member) 4540 continue; 4541 auto *MemberTy = getLoadStoreType(Member); 4542 bool MemberNI = DL.isNonIntegralPointerType(MemberTy); 4543 // Don't coerce non-integral pointers to integers or vice versa. 4544 if (MemberNI != ScalarNI) { 4545 // TODO: Consider adding special nullptr value case here 4546 return false; 4547 } else if (MemberNI && ScalarNI && 4548 ScalarTy->getPointerAddressSpace() != 4549 MemberTy->getPointerAddressSpace()) { 4550 return false; 4551 } 4552 } 4553 4554 // Check if masking is required. 4555 // A Group may need masking for one of two reasons: it resides in a block that 4556 // needs predication, or it was decided to use masking to deal with gaps 4557 // (either a gap at the end of a load-access that may result in a speculative 4558 // load, or any gaps in a store-access). 4559 bool PredicatedAccessRequiresMasking = 4560 blockNeedsPredicationForAnyReason(I->getParent()) && 4561 Legal->isMaskRequired(I); 4562 bool LoadAccessWithGapsRequiresEpilogMasking = 4563 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4564 !isScalarEpilogueAllowed(); 4565 bool StoreAccessWithGapsRequiresMasking = 4566 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4567 if (!PredicatedAccessRequiresMasking && 4568 !LoadAccessWithGapsRequiresEpilogMasking && 4569 !StoreAccessWithGapsRequiresMasking) 4570 return true; 4571 4572 // If masked interleaving is required, we expect that the user/target had 4573 // enabled it, because otherwise it either wouldn't have been created or 4574 // it should have been invalidated by the CostModel. 4575 assert(useMaskedInterleavedAccesses(TTI) && 4576 "Masked interleave-groups for predicated accesses are not enabled."); 4577 4578 if (Group->isReverse()) 4579 return false; 4580 4581 auto *Ty = getLoadStoreType(I); 4582 const Align Alignment = getLoadStoreAlignment(I); 4583 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4584 : TTI.isLegalMaskedStore(Ty, Alignment); 4585 } 4586 4587 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4588 Instruction *I, ElementCount VF) { 4589 // Get and ensure we have a valid memory instruction. 4590 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4591 4592 auto *Ptr = getLoadStorePointerOperand(I); 4593 auto *ScalarTy = getLoadStoreType(I); 4594 4595 // In order to be widened, the pointer should be consecutive, first of all. 4596 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4597 return false; 4598 4599 // If the instruction is a store located in a predicated block, it will be 4600 // scalarized. 4601 if (isScalarWithPredication(I, VF)) 4602 return false; 4603 4604 // If the instruction's allocated size doesn't equal it's type size, it 4605 // requires padding and will be scalarized. 4606 auto &DL = I->getModule()->getDataLayout(); 4607 if (hasIrregularType(ScalarTy, DL)) 4608 return false; 4609 4610 return true; 4611 } 4612 4613 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4614 // We should not collect Uniforms more than once per VF. Right now, 4615 // this function is called from collectUniformsAndScalars(), which 4616 // already does this check. Collecting Uniforms for VF=1 does not make any 4617 // sense. 
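// Illustrative example (hypothetical IR): the address computation of a
// consecutive, widened load only needs lane 0 after vectorization and is
// collected as uniform below, whereas the same GEP feeding a gather (or a
// scalarized, predicated access) is not.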
4618 4619 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4620 "This function should not be visited twice for the same VF"); 4621 4622 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4623 // not analyze again. Uniforms.count(VF) will return 1. 4624 Uniforms[VF].clear(); 4625 4626 // We now know that the loop is vectorizable! 4627 // Collect instructions inside the loop that will remain uniform after 4628 // vectorization. 4629 4630 // Global values, params and instructions outside of current loop are out of 4631 // scope. 4632 auto isOutOfScope = [&](Value *V) -> bool { 4633 Instruction *I = dyn_cast<Instruction>(V); 4634 return (!I || !TheLoop->contains(I)); 4635 }; 4636 4637 // Worklist containing uniform instructions demanding lane 0. 4638 SetVector<Instruction *> Worklist; 4639 BasicBlock *Latch = TheLoop->getLoopLatch(); 4640 4641 // Add uniform instructions demanding lane 0 to the worklist. Instructions 4642 // that are scalar with predication must not be considered uniform after 4643 // vectorization, because that would create an erroneous replicating region 4644 // where only a single instance out of VF should be formed. 4645 // TODO: optimize such seldom cases if found important, see PR40816. 4646 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4647 if (isOutOfScope(I)) { 4648 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 4649 << *I << "\n"); 4650 return; 4651 } 4652 if (isScalarWithPredication(I, VF)) { 4653 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4654 << *I << "\n"); 4655 return; 4656 } 4657 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4658 Worklist.insert(I); 4659 }; 4660 4661 // Start with the conditional branch. If the branch condition is an 4662 // instruction contained in the loop that is only used by the branch, it is 4663 // uniform. 4664 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4665 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4666 addToWorklistIfAllowed(Cmp); 4667 4668 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 4669 InstWidening WideningDecision = getWideningDecision(I, VF); 4670 assert(WideningDecision != CM_Unknown && 4671 "Widening decision should be ready at this moment"); 4672 4673 // A uniform memory op is itself uniform. We exclude uniform stores 4674 // here as they demand the last lane, not the first one. 4675 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 4676 assert(WideningDecision == CM_Scalarize); 4677 return true; 4678 } 4679 4680 return (WideningDecision == CM_Widen || 4681 WideningDecision == CM_Widen_Reverse || 4682 WideningDecision == CM_Interleave); 4683 }; 4684 4685 4686 // Returns true if Ptr is the pointer operand of a memory access instruction 4687 // I, and I is known to not require scalarization. 4688 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4689 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4690 }; 4691 4692 // Holds a list of values which are known to have at least one uniform use. 4693 // Note that there may be other uses which aren't uniform. A "uniform use" 4694 // here is something which only demands lane 0 of the unrolled iterations; 4695 // it does not imply that all lanes produce the same value (e.g. 
this is not 4696 // the usual meaning of uniform) 4697 SetVector<Value *> HasUniformUse; 4698 4699 // Scan the loop for instructions which are either a) known to have only 4700 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 4701 for (auto *BB : TheLoop->blocks()) 4702 for (auto &I : *BB) { 4703 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 4704 switch (II->getIntrinsicID()) { 4705 case Intrinsic::sideeffect: 4706 case Intrinsic::experimental_noalias_scope_decl: 4707 case Intrinsic::assume: 4708 case Intrinsic::lifetime_start: 4709 case Intrinsic::lifetime_end: 4710 if (TheLoop->hasLoopInvariantOperands(&I)) 4711 addToWorklistIfAllowed(&I); 4712 break; 4713 default: 4714 break; 4715 } 4716 } 4717 4718 // ExtractValue instructions must be uniform, because the operands are 4719 // known to be loop-invariant. 4720 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 4721 assert(isOutOfScope(EVI->getAggregateOperand()) && 4722 "Expected aggregate value to be loop invariant"); 4723 addToWorklistIfAllowed(EVI); 4724 continue; 4725 } 4726 4727 // If there's no pointer operand, there's nothing to do. 4728 auto *Ptr = getLoadStorePointerOperand(&I); 4729 if (!Ptr) 4730 continue; 4731 4732 // A uniform memory op is itself uniform. We exclude uniform stores 4733 // here as they demand the last lane, not the first one. 4734 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 4735 addToWorklistIfAllowed(&I); 4736 4737 if (isUniformDecision(&I, VF)) { 4738 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 4739 HasUniformUse.insert(Ptr); 4740 } 4741 } 4742 4743 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 4744 // demanding) users. Since loops are assumed to be in LCSSA form, this 4745 // disallows uses outside the loop as well. 4746 for (auto *V : HasUniformUse) { 4747 if (isOutOfScope(V)) 4748 continue; 4749 auto *I = cast<Instruction>(V); 4750 auto UsersAreMemAccesses = 4751 llvm::all_of(I->users(), [&](User *U) -> bool { 4752 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 4753 }); 4754 if (UsersAreMemAccesses) 4755 addToWorklistIfAllowed(I); 4756 } 4757 4758 // Expand Worklist in topological order: whenever a new instruction 4759 // is added , its users should be already inside Worklist. It ensures 4760 // a uniform instruction will only be used by uniform instructions. 4761 unsigned idx = 0; 4762 while (idx != Worklist.size()) { 4763 Instruction *I = Worklist[idx++]; 4764 4765 for (auto OV : I->operand_values()) { 4766 // isOutOfScope operands cannot be uniform instructions. 4767 if (isOutOfScope(OV)) 4768 continue; 4769 // First order recurrence Phi's should typically be considered 4770 // non-uniform. 4771 auto *OP = dyn_cast<PHINode>(OV); 4772 if (OP && Legal->isFirstOrderRecurrence(OP)) 4773 continue; 4774 // If all the users of the operand are uniform, then add the 4775 // operand into the uniform worklist. 4776 auto *OI = cast<Instruction>(OV); 4777 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4778 auto *J = cast<Instruction>(U); 4779 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 4780 })) 4781 addToWorklistIfAllowed(OI); 4782 } 4783 } 4784 4785 // For an instruction to be added into Worklist above, all its users inside 4786 // the loop should also be in Worklist. However, this condition cannot be 4787 // true for phi nodes that form a cyclic dependence. We must process phi 4788 // nodes separately. 
An induction variable will remain uniform if all users 4789 // of the induction variable and induction variable update remain uniform. 4790 // The code below handles both pointer and non-pointer induction variables. 4791 for (auto &Induction : Legal->getInductionVars()) { 4792 auto *Ind = Induction.first; 4793 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4794 4795 // Determine if all users of the induction variable are uniform after 4796 // vectorization. 4797 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4798 auto *I = cast<Instruction>(U); 4799 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4800 isVectorizedMemAccessUse(I, Ind); 4801 }); 4802 if (!UniformInd) 4803 continue; 4804 4805 // Determine if all users of the induction variable update instruction are 4806 // uniform after vectorization. 4807 auto UniformIndUpdate = 4808 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4809 auto *I = cast<Instruction>(U); 4810 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4811 isVectorizedMemAccessUse(I, IndUpdate); 4812 }); 4813 if (!UniformIndUpdate) 4814 continue; 4815 4816 // The induction variable and its update instruction will remain uniform. 4817 addToWorklistIfAllowed(Ind); 4818 addToWorklistIfAllowed(IndUpdate); 4819 } 4820 4821 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4822 } 4823 4824 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4825 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4826 4827 if (Legal->getRuntimePointerChecking()->Need) { 4828 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4829 "runtime pointer checks needed. Enable vectorization of this " 4830 "loop with '#pragma clang loop vectorize(enable)' when " 4831 "compiling with -Os/-Oz", 4832 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4833 return true; 4834 } 4835 4836 if (!PSE.getPredicate().isAlwaysTrue()) { 4837 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4838 "runtime SCEV checks needed. Enable vectorization of this " 4839 "loop with '#pragma clang loop vectorize(enable)' when " 4840 "compiling with -Os/-Oz", 4841 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4842 return true; 4843 } 4844 4845 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4846 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4847 reportVectorizationFailure("Runtime stride check for small trip count", 4848 "runtime stride == 1 checks needed. Enable vectorization of " 4849 "this loop without such check by compiling with -Os/-Oz", 4850 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4851 return true; 4852 } 4853 4854 return false; 4855 } 4856 4857 ElementCount 4858 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 4859 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 4860 return ElementCount::getScalable(0); 4861 4862 if (Hints->isScalableVectorizationDisabled()) { 4863 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 4864 "ScalableVectorizationDisabled", ORE, TheLoop); 4865 return ElementCount::getScalable(0); 4866 } 4867 4868 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 4869 4870 auto MaxScalableVF = ElementCount::getScalable( 4871 std::numeric_limits<ElementCount::ScalarTy>::max()); 4872 4873 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 
4874 // FIXME: While for scalable vectors this is currently sufficient, this should 4875 // be replaced by a more detailed mechanism that filters out specific VFs, 4876 // instead of invalidating vectorization for a whole set of VFs based on the 4877 // MaxVF. 4878 4879 // Disable scalable vectorization if the loop contains unsupported reductions. 4880 if (!canVectorizeReductions(MaxScalableVF)) { 4881 reportVectorizationInfo( 4882 "Scalable vectorization not supported for the reduction " 4883 "operations found in this loop.", 4884 "ScalableVFUnfeasible", ORE, TheLoop); 4885 return ElementCount::getScalable(0); 4886 } 4887 4888 // Disable scalable vectorization if the loop contains any instructions 4889 // with element types not supported for scalable vectors. 4890 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 4891 return !Ty->isVoidTy() && 4892 !this->TTI.isElementTypeLegalForScalableVector(Ty); 4893 })) { 4894 reportVectorizationInfo("Scalable vectorization is not supported " 4895 "for all element types found in this loop.", 4896 "ScalableVFUnfeasible", ORE, TheLoop); 4897 return ElementCount::getScalable(0); 4898 } 4899 4900 if (Legal->isSafeForAnyVectorWidth()) 4901 return MaxScalableVF; 4902 4903 // Limit MaxScalableVF by the maximum safe dependence distance. 4904 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 4905 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 4906 MaxVScale = 4907 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 4908 MaxScalableVF = ElementCount::getScalable( 4909 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 4910 if (!MaxScalableVF) 4911 reportVectorizationInfo( 4912 "Max legal vector width too small, scalable vectorization " 4913 "unfeasible.", 4914 "ScalableVFUnfeasible", ORE, TheLoop); 4915 4916 return MaxScalableVF; 4917 } 4918 4919 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 4920 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 4921 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4922 unsigned SmallestType, WidestType; 4923 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4924 4925 // Get the maximum safe dependence distance in bits computed by LAA. 4926 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4927 // the memory accesses that is most restrictive (involved in the smallest 4928 // dependence distance). 4929 unsigned MaxSafeElements = 4930 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 4931 4932 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 4933 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 4934 4935 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 4936 << ".\n"); 4937 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 4938 << ".\n"); 4939 4940 // First analyze the UserVF, fall back if the UserVF should be ignored. 4941 if (UserVF) { 4942 auto MaxSafeUserVF = 4943 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 4944 4945 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 4946 // If `VF=vscale x N` is safe, then so is `VF=N` 4947 if (UserVF.isScalable()) 4948 return FixedScalableVFPair( 4949 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 4950 else 4951 return UserVF; 4952 } 4953 4954 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 4955 4956 // Only clamp if the UserVF is not scalable. 
If the UserVF is scalable, it 4957 // is better to ignore the hint and let the compiler choose a suitable VF. 4958 if (!UserVF.isScalable()) { 4959 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4960 << " is unsafe, clamping to max safe VF=" 4961 << MaxSafeFixedVF << ".\n"); 4962 ORE->emit([&]() { 4963 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4964 TheLoop->getStartLoc(), 4965 TheLoop->getHeader()) 4966 << "User-specified vectorization factor " 4967 << ore::NV("UserVectorizationFactor", UserVF) 4968 << " is unsafe, clamping to maximum safe vectorization factor " 4969 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 4970 }); 4971 return MaxSafeFixedVF; 4972 } 4973 4974 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 4975 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4976 << " is ignored because scalable vectors are not " 4977 "available.\n"); 4978 ORE->emit([&]() { 4979 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4980 TheLoop->getStartLoc(), 4981 TheLoop->getHeader()) 4982 << "User-specified vectorization factor " 4983 << ore::NV("UserVectorizationFactor", UserVF) 4984 << " is ignored because the target does not support scalable " 4985 "vectors. The compiler will pick a more suitable value."; 4986 }); 4987 } else { 4988 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4989 << " is unsafe. Ignoring scalable UserVF.\n"); 4990 ORE->emit([&]() { 4991 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4992 TheLoop->getStartLoc(), 4993 TheLoop->getHeader()) 4994 << "User-specified vectorization factor " 4995 << ore::NV("UserVectorizationFactor", UserVF) 4996 << " is unsafe. Ignoring the hint to let the compiler pick a " 4997 "more suitable value."; 4998 }); 4999 } 5000 } 5001 5002 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5003 << " / " << WidestType << " bits.\n"); 5004 5005 FixedScalableVFPair Result(ElementCount::getFixed(1), 5006 ElementCount::getScalable(0)); 5007 if (auto MaxVF = 5008 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5009 MaxSafeFixedVF, FoldTailByMasking)) 5010 Result.FixedVF = MaxVF; 5011 5012 if (auto MaxVF = 5013 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5014 MaxSafeScalableVF, FoldTailByMasking)) 5015 if (MaxVF.isScalable()) { 5016 Result.ScalableVF = MaxVF; 5017 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5018 << "\n"); 5019 } 5020 5021 return Result; 5022 } 5023 5024 FixedScalableVFPair 5025 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5026 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5027 // TODO: It may by useful to do since it's still likely to be dynamically 5028 // uniform if the target can skip. 5029 reportVectorizationFailure( 5030 "Not inserting runtime ptr check for divergent target", 5031 "runtime pointer checks needed. 
Not enabled for divergent target", 5032 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5033 return FixedScalableVFPair::getNone(); 5034 } 5035 5036 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5037 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5038 if (TC == 1) { 5039 reportVectorizationFailure("Single iteration (non) loop", 5040 "loop trip count is one, irrelevant for vectorization", 5041 "SingleIterationLoop", ORE, TheLoop); 5042 return FixedScalableVFPair::getNone(); 5043 } 5044 5045 switch (ScalarEpilogueStatus) { 5046 case CM_ScalarEpilogueAllowed: 5047 return computeFeasibleMaxVF(TC, UserVF, false); 5048 case CM_ScalarEpilogueNotAllowedUsePredicate: 5049 LLVM_FALLTHROUGH; 5050 case CM_ScalarEpilogueNotNeededUsePredicate: 5051 LLVM_DEBUG( 5052 dbgs() << "LV: vector predicate hint/switch found.\n" 5053 << "LV: Not allowing scalar epilogue, creating predicated " 5054 << "vector loop.\n"); 5055 break; 5056 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5057 // fallthrough as a special case of OptForSize 5058 case CM_ScalarEpilogueNotAllowedOptSize: 5059 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5060 LLVM_DEBUG( 5061 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5062 else 5063 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5064 << "count.\n"); 5065 5066 // Bail if runtime checks are required, which are not good when optimising 5067 // for size. 5068 if (runtimeChecksRequired()) 5069 return FixedScalableVFPair::getNone(); 5070 5071 break; 5072 } 5073 5074 // The only loops we can vectorize without a scalar epilogue are loops with 5075 // a bottom-test and a single exiting block. We'd have to handle the fact 5076 // that not every instruction executes on the last iteration. This will 5077 // require a lane mask which varies through the vector loop body. (TODO) 5078 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5079 // If there was a tail-folding hint/switch, but we can't fold the tail by 5080 // masking, fall back to a vectorization with a scalar epilogue. 5081 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5082 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5083 "scalar epilogue instead.\n"); 5084 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5085 return computeFeasibleMaxVF(TC, UserVF, false); 5086 } 5087 return FixedScalableVFPair::getNone(); 5088 } 5089 5090 // Now try tail folding. 5091 5092 // Invalidate interleave groups that require an epilogue if we can't mask 5093 // the interleave-group. 5094 if (!useMaskedInterleavedAccesses(TTI)) { 5095 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5096 "No decisions should have been taken at this point"); 5097 // Note: There is no need to invalidate any cost modeling decisions here, as 5098 // none were taken so far. 5099 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5100 } 5101 5102 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true); 5103 // Avoid tail folding if the trip count is known to be a multiple of any VF 5104 // we choose. 5105 // FIXME: The condition below pessimises the case for fixed-width vectors, 5106 // when scalable VFs are also candidates for vectorization.
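// Illustrative example (hypothetical numbers): with a known trip count of 64,
// MaxFixedVF = 8 and UserIC = 2, the exit count is a multiple of
// MaxVFtimesIC = 16, so no tail remains and tail folding is not needed.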
5107 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5108 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5109 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5110 "MaxFixedVF must be a power of 2"); 5111 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5112 : MaxFixedVF.getFixedValue(); 5113 ScalarEvolution *SE = PSE.getSE(); 5114 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5115 const SCEV *ExitCount = SE->getAddExpr( 5116 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5117 const SCEV *Rem = SE->getURemExpr( 5118 SE->applyLoopGuards(ExitCount, TheLoop), 5119 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5120 if (Rem->isZero()) { 5121 // Accept MaxFixedVF if we do not have a tail. 5122 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5123 return MaxFactors; 5124 } 5125 } 5126 5127 // If we don't know the precise trip count, or if the trip count that we 5128 // found modulo the vectorization factor is not zero, try to fold the tail 5129 // by masking. 5130 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5131 if (Legal->prepareToFoldTailByMasking()) { 5132 FoldTailByMasking = true; 5133 return MaxFactors; 5134 } 5135 5136 // If there was a tail-folding hint/switch, but we can't fold the tail by 5137 // masking, fall back to a vectorization with a scalar epilogue. 5138 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5139 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5140 "scalar epilogue instead.\n"); 5141 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5142 return MaxFactors; 5143 } 5144 5145 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5146 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5147 return FixedScalableVFPair::getNone(); 5148 } 5149 5150 if (TC == 0) { 5151 reportVectorizationFailure( 5152 "Unable to calculate the loop count due to complex control flow", 5153 "unable to calculate the loop count due to complex control flow", 5154 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5155 return FixedScalableVFPair::getNone(); 5156 } 5157 5158 reportVectorizationFailure( 5159 "Cannot optimize for size and vectorize at the same time.", 5160 "cannot optimize for size and vectorize at the same time. " 5161 "Enable vectorization of this loop with '#pragma clang loop " 5162 "vectorize(enable)' when compiling with -Os/-Oz", 5163 "NoTailLoopWithOptForSize", ORE, TheLoop); 5164 return FixedScalableVFPair::getNone(); 5165 } 5166 5167 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5168 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5169 const ElementCount &MaxSafeVF, bool FoldTailByMasking) { 5170 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5171 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5172 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5173 : TargetTransformInfo::RGK_FixedWidthVector); 5174 5175 // Convenience function to return the minimum of two ElementCounts. 5176 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5177 assert((LHS.isScalable() == RHS.isScalable()) && 5178 "Scalable flags must match"); 5179 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5180 }; 5181 5182 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5183 // Note that both WidestRegister and WidestType may not be powers of 2.
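// Illustrative example (hypothetical target): 128-bit vector registers and a
// widest loop type of i32 give PowerOf2Floor(128 / 32) = 4 lanes, which is
// then clamped by MaxSafeVF below.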
5184 auto MaxVectorElementCount = ElementCount::get( 5185 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5186 ComputeScalableMaxVF); 5187 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5188 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5189 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5190 5191 if (!MaxVectorElementCount) { 5192 LLVM_DEBUG(dbgs() << "LV: The target has no " 5193 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5194 << " vector registers.\n"); 5195 return ElementCount::getFixed(1); 5196 } 5197 5198 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5199 if (ConstTripCount && 5200 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5201 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5202 // If loop trip count (TC) is known at compile time there is no point in 5203 // choosing VF greater than TC (as done in the loop below). Select maximum 5204 // power of two which doesn't exceed TC. 5205 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5206 // when the TC is less than or equal to the known number of lanes. 5207 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5208 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5209 "exceeding the constant trip count: " 5210 << ClampedConstTripCount << "\n"); 5211 return ElementCount::getFixed(ClampedConstTripCount); 5212 } 5213 5214 TargetTransformInfo::RegisterKind RegKind = 5215 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5216 : TargetTransformInfo::RGK_FixedWidthVector; 5217 ElementCount MaxVF = MaxVectorElementCount; 5218 if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && 5219 TTI.shouldMaximizeVectorBandwidth(RegKind))) { 5220 auto MaxVectorElementCountMaxBW = ElementCount::get( 5221 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5222 ComputeScalableMaxVF); 5223 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5224 5225 // Collect all viable vectorization factors larger than the default MaxVF 5226 // (i.e. MaxVectorElementCount). 5227 SmallVector<ElementCount, 8> VFs; 5228 for (ElementCount VS = MaxVectorElementCount * 2; 5229 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5230 VFs.push_back(VS); 5231 5232 // For each VF calculate its register usage. 5233 auto RUs = calculateRegisterUsage(VFs); 5234 5235 // Select the largest VF which doesn't require more registers than existing 5236 // ones. 5237 for (int i = RUs.size() - 1; i >= 0; --i) { 5238 bool Selected = true; 5239 for (auto &pair : RUs[i].MaxLocalUsers) { 5240 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5241 if (pair.second > TargetNumRegisters) 5242 Selected = false; 5243 } 5244 if (Selected) { 5245 MaxVF = VFs[i]; 5246 break; 5247 } 5248 } 5249 if (ElementCount MinVF = 5250 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5251 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5252 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5253 << ") with target's minimum: " << MinVF << '\n'); 5254 MaxVF = MinVF; 5255 } 5256 } 5257 5258 // Invalidate any widening decisions we might have made, in case the loop 5259 // requires prediction (decided later), but we have already made some 5260 // load/store widening decisions. 
5261 invalidateCostModelingDecisions(); 5262 } 5263 return MaxVF; 5264 } 5265 5266 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { 5267 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5268 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5269 auto Min = Attr.getVScaleRangeMin(); 5270 auto Max = Attr.getVScaleRangeMax(); 5271 if (Max && Min == Max) 5272 return Max; 5273 } 5274 5275 return TTI.getVScaleForTuning(); 5276 } 5277 5278 bool LoopVectorizationCostModel::isMoreProfitable( 5279 const VectorizationFactor &A, const VectorizationFactor &B) const { 5280 InstructionCost CostA = A.Cost; 5281 InstructionCost CostB = B.Cost; 5282 5283 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5284 5285 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5286 MaxTripCount) { 5287 // If we are folding the tail and the trip count is a known (possibly small) 5288 // constant, the trip count will be rounded up to an integer number of 5289 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5290 // which we compare directly. When not folding the tail, the total cost will 5291 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5292 // approximated with the per-lane cost below instead of using the tripcount 5293 // as here. 5294 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5295 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5296 return RTCostA < RTCostB; 5297 } 5298 5299 // Improve estimate for the vector width if it is scalable. 5300 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5301 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5302 if (Optional<unsigned> VScale = getVScaleForTuning()) { 5303 if (A.Width.isScalable()) 5304 EstimatedWidthA *= VScale.getValue(); 5305 if (B.Width.isScalable()) 5306 EstimatedWidthB *= VScale.getValue(); 5307 } 5308 5309 // Assume vscale may be larger than 1 (or the value being tuned for), 5310 // so that scalable vectorization is slightly favorable over fixed-width 5311 // vectorization. 5312 if (A.Width.isScalable() && !B.Width.isScalable()) 5313 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5314 5315 // To avoid the need for FP division: 5316 // (CostA / A.Width) < (CostB / B.Width) 5317 // <=> (CostA * B.Width) < (CostB * A.Width) 5318 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5319 } 5320 5321 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5322 const ElementCountSet &VFCandidates) { 5323 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5324 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5325 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5326 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5327 "Expected Scalar VF to be a candidate"); 5328 5329 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 5330 VectorizationFactor ChosenFactor = ScalarCost; 5331 5332 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5333 if (ForceVectorization && VFCandidates.size() > 1) { 5334 // Ignore scalar width, because the user explicitly wants vectorization. 5335 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5336 // evaluation. 
5337 ChosenFactor.Cost = InstructionCost::getMax(); 5338 } 5339 5340 SmallVector<InstructionVFPair> InvalidCosts; 5341 for (const auto &i : VFCandidates) { 5342 // The cost for scalar VF=1 is already calculated, so ignore it. 5343 if (i.isScalar()) 5344 continue; 5345 5346 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5347 VectorizationFactor Candidate(i, C.first); 5348 5349 #ifndef NDEBUG 5350 unsigned AssumedMinimumVscale = 1; 5351 if (Optional<unsigned> VScale = getVScaleForTuning()) 5352 AssumedMinimumVscale = VScale.getValue(); 5353 unsigned Width = 5354 Candidate.Width.isScalable() 5355 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5356 : Candidate.Width.getFixedValue(); 5357 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5358 << " costs: " << (Candidate.Cost / Width)); 5359 if (i.isScalable()) 5360 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5361 << AssumedMinimumVscale << ")"); 5362 LLVM_DEBUG(dbgs() << ".\n"); 5363 #endif 5364 5365 if (!C.second && !ForceVectorization) { 5366 LLVM_DEBUG( 5367 dbgs() << "LV: Not considering vector loop of width " << i 5368 << " because it will not generate any vector instructions.\n"); 5369 continue; 5370 } 5371 5372 // If profitable add it to ProfitableVF list. 5373 if (isMoreProfitable(Candidate, ScalarCost)) 5374 ProfitableVFs.push_back(Candidate); 5375 5376 if (isMoreProfitable(Candidate, ChosenFactor)) 5377 ChosenFactor = Candidate; 5378 } 5379 5380 // Emit a report of VFs with invalid costs in the loop. 5381 if (!InvalidCosts.empty()) { 5382 // Group the remarks per instruction, keeping the instruction order from 5383 // InvalidCosts. 5384 std::map<Instruction *, unsigned> Numbering; 5385 unsigned I = 0; 5386 for (auto &Pair : InvalidCosts) 5387 if (!Numbering.count(Pair.first)) 5388 Numbering[Pair.first] = I++; 5389 5390 // Sort the list, first on instruction(number) then on VF. 5391 llvm::sort(InvalidCosts, 5392 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5393 if (Numbering[A.first] != Numbering[B.first]) 5394 return Numbering[A.first] < Numbering[B.first]; 5395 ElementCountComparator ECC; 5396 return ECC(A.second, B.second); 5397 }); 5398 5399 // For a list of ordered instruction-vf pairs: 5400 // [(load, vf1), (load, vf2), (store, vf1)] 5401 // Group the instructions together to emit separate remarks for: 5402 // load (vf1, vf2) 5403 // store (vf1) 5404 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5405 auto Subset = ArrayRef<InstructionVFPair>(); 5406 do { 5407 if (Subset.empty()) 5408 Subset = Tail.take_front(1); 5409 5410 Instruction *I = Subset.front().first; 5411 5412 // If the next instruction is different, or if there are no other pairs, 5413 // emit a remark for the collated subset. e.g. 5414 // [(load, vf1), (load, vf2))] 5415 // to emit: 5416 // remark: invalid costs for 'load' at VF=(vf, vf2) 5417 if (Subset == Tail || Tail[Subset.size()].first != I) { 5418 std::string OutString; 5419 raw_string_ostream OS(OutString); 5420 assert(!Subset.empty() && "Unexpected empty range"); 5421 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5422 for (auto &Pair : Subset) 5423 OS << (Pair.second == Subset.front().second ? 
"" : ", ") 5424 << Pair.second; 5425 OS << "):"; 5426 if (auto *CI = dyn_cast<CallInst>(I)) 5427 OS << " call to " << CI->getCalledFunction()->getName(); 5428 else 5429 OS << " " << I->getOpcodeName(); 5430 OS.flush(); 5431 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5432 Tail = Tail.drop_front(Subset.size()); 5433 Subset = {}; 5434 } else 5435 // Grow the subset by one element 5436 Subset = Tail.take_front(Subset.size() + 1); 5437 } while (!Tail.empty()); 5438 } 5439 5440 if (!EnableCondStoresVectorization && NumPredStores) { 5441 reportVectorizationFailure("There are conditional stores.", 5442 "store that is conditionally executed prevents vectorization", 5443 "ConditionalStore", ORE, TheLoop); 5444 ChosenFactor = ScalarCost; 5445 } 5446 5447 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5448 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 5449 << "LV: Vectorization seems to be not beneficial, " 5450 << "but was forced by a user.\n"); 5451 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5452 return ChosenFactor; 5453 } 5454 5455 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5456 const Loop &L, ElementCount VF) const { 5457 // Cross iteration phis such as reductions need special handling and are 5458 // currently unsupported. 5459 if (any_of(L.getHeader()->phis(), 5460 [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); })) 5461 return false; 5462 5463 // Phis with uses outside of the loop require special handling and are 5464 // currently unsupported. 5465 for (auto &Entry : Legal->getInductionVars()) { 5466 // Look for uses of the value of the induction at the last iteration. 5467 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5468 for (User *U : PostInc->users()) 5469 if (!L.contains(cast<Instruction>(U))) 5470 return false; 5471 // Look for uses of penultimate value of the induction. 5472 for (User *U : Entry.first->users()) 5473 if (!L.contains(cast<Instruction>(U))) 5474 return false; 5475 } 5476 5477 // Induction variables that are widened require special handling that is 5478 // currently not supported. 5479 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5480 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5481 this->isProfitableToScalarize(Entry.first, VF)); 5482 })) 5483 return false; 5484 5485 // Epilogue vectorization code has not been auditted to ensure it handles 5486 // non-latch exits properly. It may be fine, but it needs auditted and 5487 // tested. 5488 if (L.getExitingBlock() != L.getLoopLatch()) 5489 return false; 5490 5491 return true; 5492 } 5493 5494 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5495 const ElementCount VF) const { 5496 // FIXME: We need a much better cost-model to take different parameters such 5497 // as register pressure, code size increase and cost of extra branches into 5498 // account. For now we apply a very crude heuristic and only consider loops 5499 // with vectorization factors larger than a certain value. 5500 // We also consider epilogue vectorization unprofitable for targets that don't 5501 // consider interleaving beneficial (eg. MVE). 5502 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5503 return false; 5504 // FIXME: We should consider changing the threshold for scalable 5505 // vectors to take VScaleForTuning into account. 
5506 if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF) 5507 return true; 5508 return false; 5509 } 5510 5511 VectorizationFactor 5512 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5513 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5514 VectorizationFactor Result = VectorizationFactor::Disabled(); 5515 if (!EnableEpilogueVectorization) { 5516 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5517 return Result; 5518 } 5519 5520 if (!isScalarEpilogueAllowed()) { 5521 LLVM_DEBUG( 5522 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5523 "allowed.\n";); 5524 return Result; 5525 } 5526 5527 // Not really a cost consideration, but check for unsupported cases here to 5528 // simplify the logic. 5529 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5530 LLVM_DEBUG( 5531 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5532 "not a supported candidate.\n";); 5533 return Result; 5534 } 5535 5536 if (EpilogueVectorizationForceVF > 1) { 5537 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5538 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5539 if (LVP.hasPlanWithVF(ForcedEC)) 5540 return {ForcedEC, 0}; 5541 else { 5542 LLVM_DEBUG( 5543 dbgs() 5544 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5545 return Result; 5546 } 5547 } 5548 5549 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5550 TheLoop->getHeader()->getParent()->hasMinSize()) { 5551 LLVM_DEBUG( 5552 dbgs() 5553 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5554 return Result; 5555 } 5556 5557 if (!isEpilogueVectorizationProfitable(MainLoopVF)) { 5558 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5559 "this loop\n"); 5560 return Result; 5561 } 5562 5563 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 5564 // the main loop handles 8 lanes per iteration. We could still benefit from 5565 // vectorizing the epilogue loop with VF=4. 5566 ElementCount EstimatedRuntimeVF = MainLoopVF; 5567 if (MainLoopVF.isScalable()) { 5568 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5569 if (Optional<unsigned> VScale = getVScaleForTuning()) 5570 EstimatedRuntimeVF *= VScale.getValue(); 5571 } 5572 5573 for (auto &NextVF : ProfitableVFs) 5574 if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5575 ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || 5576 ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && 5577 (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && 5578 LVP.hasPlanWithVF(NextVF.Width)) 5579 Result = NextVF; 5580 5581 if (Result != VectorizationFactor::Disabled()) 5582 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5583 << Result.Width << "\n";); 5584 return Result; 5585 } 5586 5587 std::pair<unsigned, unsigned> 5588 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5589 unsigned MinWidth = -1U; 5590 unsigned MaxWidth = 8; 5591 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5592 // For in-loop reductions, no element types are added to ElementTypesInLoop 5593 // if there are no loads/stores in the loop. In this case, check through the 5594 // reduction variables to determine the maximum width. 
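// Illustrative example (hypothetical types): for a sum reduction that
// accumulates sign-extended i16 inputs into an i64, the recurrence descriptor
// reports a minimum cast width of 16 bits, so the loop below computes
// MaxWidth = min(16, 64) = 16.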
5595 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5596 // Reset MaxWidth so that we can find the smallest type used by recurrences 5597 // in the loop. 5598 MaxWidth = -1U; 5599 for (auto &PhiDescriptorPair : Legal->getReductionVars()) { 5600 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5601 // When finding the min width used by the recurrence we need to account 5602 // for casts on the input operands of the recurrence. 5603 MaxWidth = std::min<unsigned>( 5604 MaxWidth, std::min<unsigned>( 5605 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5606 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5607 } 5608 } else { 5609 for (Type *T : ElementTypesInLoop) { 5610 MinWidth = std::min<unsigned>( 5611 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5612 MaxWidth = std::max<unsigned>( 5613 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5614 } 5615 } 5616 return {MinWidth, MaxWidth}; 5617 } 5618 5619 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5620 ElementTypesInLoop.clear(); 5621 // For each block. 5622 for (BasicBlock *BB : TheLoop->blocks()) { 5623 // For each instruction in the loop. 5624 for (Instruction &I : BB->instructionsWithoutDebug()) { 5625 Type *T = I.getType(); 5626 5627 // Skip ignored values. 5628 if (ValuesToIgnore.count(&I)) 5629 continue; 5630 5631 // Only examine Loads, Stores and PHINodes. 5632 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5633 continue; 5634 5635 // Examine PHI nodes that are reduction variables. Update the type to 5636 // account for the recurrence type. 5637 if (auto *PN = dyn_cast<PHINode>(&I)) { 5638 if (!Legal->isReductionVariable(PN)) 5639 continue; 5640 const RecurrenceDescriptor &RdxDesc = 5641 Legal->getReductionVars().find(PN)->second; 5642 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5643 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5644 RdxDesc.getRecurrenceType(), 5645 TargetTransformInfo::ReductionFlags())) 5646 continue; 5647 T = RdxDesc.getRecurrenceType(); 5648 } 5649 5650 // Examine the stored values. 5651 if (auto *ST = dyn_cast<StoreInst>(&I)) 5652 T = ST->getValueOperand()->getType(); 5653 5654 assert(T->isSized() && 5655 "Expected the load/store/recurrence type to be sized"); 5656 5657 ElementTypesInLoop.insert(T); 5658 } 5659 } 5660 } 5661 5662 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5663 unsigned LoopCost) { 5664 // -- The interleave heuristics -- 5665 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5666 // There are many micro-architectural considerations that we can't predict 5667 // at this level. For example, frontend pressure (on decode or fetch) due to 5668 // code size, or the number and capabilities of the execution ports. 5669 // 5670 // We use the following heuristics to select the interleave count: 5671 // 1. If the code has reductions, then we interleave to break the cross 5672 // iteration dependency. 5673 // 2. If the loop is really small, then we interleave to reduce the loop 5674 // overhead. 5675 // 3. We don't interleave if we think that we will spill registers to memory 5676 // due to the increased register pressure. 5677 5678 if (!isScalarEpilogueAllowed()) 5679 return 1; 5680 5681 // We used the distance for the interleave count. 
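// Illustrative note: when getMaxSafeDepDistBytes() returns a finite value
// (e.g. 32 bytes for a loop-carried dependence), the vectorization width is
// already bounded by that distance, so the check below conservatively keeps
// the interleave count at 1.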
5682 if (Legal->getMaxSafeDepDistBytes() != -1U) 5683 return 1; 5684 5685 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5686 const bool HasReductions = !Legal->getReductionVars().empty(); 5687 // Do not interleave loops with a relatively small known or estimated trip 5688 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5689 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5690 // because with the above conditions interleaving can expose ILP and break 5691 // cross iteration dependences for reductions. 5692 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5693 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5694 return 1; 5695 5696 // If we did not calculate the cost for VF (because the user selected the VF) 5697 // then we calculate the cost of VF here. 5698 if (LoopCost == 0) { 5699 InstructionCost C = expectedCost(VF).first; 5700 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 5701 LoopCost = *C.getValue(); 5702 5703 // Loop body is free and there is no need for interleaving. 5704 if (LoopCost == 0) 5705 return 1; 5706 } 5707 5708 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5709 // We divide by these constants so assume that we have at least one 5710 // instruction that uses at least one register. 5711 for (auto& pair : R.MaxLocalUsers) { 5712 pair.second = std::max(pair.second, 1U); 5713 } 5714 5715 // We calculate the interleave count using the following formula. 5716 // Subtract the number of loop invariants from the number of available 5717 // registers. These registers are used by all of the interleaved instances. 5718 // Next, divide the remaining registers by the number of registers that is 5719 // required by the loop, in order to estimate how many parallel instances 5720 // fit without causing spills. All of this is rounded down if necessary to be 5721 // a power of two. We want power of two interleave count to simplify any 5722 // addressing operations or alignment considerations. 5723 // We also want power of two interleave counts to ensure that the induction 5724 // variable of the vector loop wraps to zero, when tail is folded by masking; 5725 // this currently happens when OptForSize, in which case IC is set to 1 above. 5726 unsigned IC = UINT_MAX; 5727 5728 for (auto& pair : R.MaxLocalUsers) { 5729 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5730 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5731 << " registers of " 5732 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5733 if (VF.isScalar()) { 5734 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5735 TargetNumRegisters = ForceTargetNumScalarRegs; 5736 } else { 5737 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5738 TargetNumRegisters = ForceTargetNumVectorRegs; 5739 } 5740 unsigned MaxLocalUsers = pair.second; 5741 unsigned LoopInvariantRegs = 0; 5742 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5743 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5744 5745 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5746 // Don't count the induction variable as interleaved. 5747 if (EnableIndVarRegisterHeur) { 5748 TmpIC = 5749 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5750 std::max(1U, (MaxLocalUsers - 1))); 5751 } 5752 5753 IC = std::min(IC, TmpIC); 5754 } 5755 5756 // Clamp the interleave ranges to reasonable counts. 
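// Illustrative example (hypothetical numbers): for a register class with 32
// registers, 2 loop-invariant values and a peak of 9 in-loop live values, the
// induction-aware formula above yields
// PowerOf2Floor((32 - 2 - 1) / (9 - 1)) = PowerOf2Floor(3) = 2, and the
// minimum over all register classes becomes IC before the clamping below.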
5757 unsigned MaxInterleaveCount = 5758 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 5759 5760 // Check if the user has overridden the max. 5761 if (VF.isScalar()) { 5762 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5763 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5764 } else { 5765 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5766 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5767 } 5768 5769 // If trip count is known or estimated compile time constant, limit the 5770 // interleave count to be less than the trip count divided by VF, provided it 5771 // is at least 1. 5772 // 5773 // For scalable vectors we can't know if interleaving is beneficial. It may 5774 // not be beneficial for small loops if none of the lanes in the second vector 5775 // iterations is enabled. However, for larger loops, there is likely to be a 5776 // similar benefit as for fixed-width vectors. For now, we choose to leave 5777 // the InterleaveCount as if vscale is '1', although if some information about 5778 // the vector is known (e.g. min vector size), we can make a better decision. 5779 if (BestKnownTC) { 5780 MaxInterleaveCount = 5781 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5782 // Make sure MaxInterleaveCount is greater than 0. 5783 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 5784 } 5785 5786 assert(MaxInterleaveCount > 0 && 5787 "Maximum interleave count must be greater than 0"); 5788 5789 // Clamp the calculated IC to be between the 1 and the max interleave count 5790 // that the target and trip count allows. 5791 if (IC > MaxInterleaveCount) 5792 IC = MaxInterleaveCount; 5793 else 5794 // Make sure IC is greater than 0. 5795 IC = std::max(1u, IC); 5796 5797 assert(IC > 0 && "Interleave count must be greater than 0."); 5798 5799 // Interleave if we vectorized this loop and there is a reduction that could 5800 // benefit from interleaving. 5801 if (VF.isVector() && HasReductions) { 5802 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5803 return IC; 5804 } 5805 5806 // For any scalar loop that either requires runtime checks or predication we 5807 // are better off leaving this to the unroller. Note that if we've already 5808 // vectorized the loop we will have done the runtime check and so interleaving 5809 // won't require further checks. 5810 bool ScalarInterleavingRequiresPredication = 5811 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5812 return Legal->blockNeedsPredication(BB); 5813 })); 5814 bool ScalarInterleavingRequiresRuntimePointerCheck = 5815 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5816 5817 // We want to interleave small loops in order to reduce the loop overhead and 5818 // potentially expose ILP opportunities. 5819 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5820 << "LV: IC is " << IC << '\n' 5821 << "LV: VF is " << VF << '\n'); 5822 const bool AggressivelyInterleaveReductions = 5823 TTI.enableAggressiveInterleaving(HasReductions); 5824 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5825 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5826 // We assume that the cost overhead is 1 and we use the cost model 5827 // to estimate the cost of the loop and interleave until the cost of the 5828 // loop overhead is about 5% of the cost of the loop. 
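// Illustrative example (assuming the default SmallLoopCost threshold of 20):
// a loop body costing 5 gives SmallIC = min(IC, PowerOf2Floor(20 / 5)) =
// min(IC, 4) in the computation below.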
5829 unsigned SmallIC = 5830 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5831 5832 // Interleave until store/load ports (estimated by max interleave count) are 5833 // saturated. 5834 unsigned NumStores = Legal->getNumStores(); 5835 unsigned NumLoads = Legal->getNumLoads(); 5836 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5837 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5838 5839 // There is little point in interleaving for reductions containing selects 5840 // and compares when VF=1 since it may just create more overhead than it's 5841 // worth for loops with small trip counts. This is because we still have to 5842 // do the final reduction after the loop. 5843 bool HasSelectCmpReductions = 5844 HasReductions && 5845 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5846 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5847 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 5848 RdxDesc.getRecurrenceKind()); 5849 }); 5850 if (HasSelectCmpReductions) { 5851 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 5852 return 1; 5853 } 5854 5855 // If we have a scalar reduction (vector reductions are already dealt with 5856 // by this point), we can increase the critical path length if the loop 5857 // we're interleaving is inside another loop. For tree-wise reductions 5858 // set the limit to 2, and for ordered reductions it's best to disable 5859 // interleaving entirely. 5860 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5861 bool HasOrderedReductions = 5862 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5863 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5864 return RdxDesc.isOrdered(); 5865 }); 5866 if (HasOrderedReductions) { 5867 LLVM_DEBUG( 5868 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 5869 return 1; 5870 } 5871 5872 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5873 SmallIC = std::min(SmallIC, F); 5874 StoresIC = std::min(StoresIC, F); 5875 LoadsIC = std::min(LoadsIC, F); 5876 } 5877 5878 if (EnableLoadStoreRuntimeInterleave && 5879 std::max(StoresIC, LoadsIC) > SmallIC) { 5880 LLVM_DEBUG( 5881 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5882 return std::max(StoresIC, LoadsIC); 5883 } 5884 5885 // If there are scalar reductions and TTI has enabled aggressive 5886 // interleaving for reductions, we will interleave to expose ILP. 5887 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 5888 AggressivelyInterleaveReductions) { 5889 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5890 // Interleave no less than SmallIC but not as aggressive as the normal IC 5891 // to satisfy the rare situation when resources are too limited. 5892 return std::max(IC / 2, SmallIC); 5893 } else { 5894 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5895 return SmallIC; 5896 } 5897 } 5898 5899 // Interleave if this is a large loop (small loops are already dealt with by 5900 // this point) that could benefit from interleaving. 
5901 if (AggressivelyInterleaveReductions) { 5902 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5903 return IC; 5904 } 5905 5906 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5907 return 1; 5908 } 5909 5910 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5911 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 5912 // This function calculates the register usage by measuring the highest number 5913 // of values that are alive at a single location. Obviously, this is a very 5914 // rough estimation. We scan the loop in a topological order in order and 5915 // assign a number to each instruction. We use RPO to ensure that defs are 5916 // met before their users. We assume that each instruction that has in-loop 5917 // users starts an interval. We record every time that an in-loop value is 5918 // used, so we have a list of the first and last occurrences of each 5919 // instruction. Next, we transpose this data structure into a multi map that 5920 // holds the list of intervals that *end* at a specific location. This multi 5921 // map allows us to perform a linear search. We scan the instructions linearly 5922 // and record each time that a new interval starts, by placing it in a set. 5923 // If we find this value in the multi-map then we remove it from the set. 5924 // The max register usage is the maximum size of the set. 5925 // We also search for instructions that are defined outside the loop, but are 5926 // used inside the loop. We need this number separately from the max-interval 5927 // usage number because when we unroll, loop-invariant values do not take 5928 // more register. 5929 LoopBlocksDFS DFS(TheLoop); 5930 DFS.perform(LI); 5931 5932 RegisterUsage RU; 5933 5934 // Each 'key' in the map opens a new interval. The values 5935 // of the map are the index of the 'last seen' usage of the 5936 // instruction that is the key. 5937 using IntervalMap = DenseMap<Instruction *, unsigned>; 5938 5939 // Maps instruction to its index. 5940 SmallVector<Instruction *, 64> IdxToInstr; 5941 // Marks the end of each interval. 5942 IntervalMap EndPoint; 5943 // Saves the list of instruction indices that are used in the loop. 5944 SmallPtrSet<Instruction *, 8> Ends; 5945 // Saves the list of values that are used in the loop but are 5946 // defined outside the loop, such as arguments and constants. 5947 SmallPtrSet<Value *, 8> LoopInvariants; 5948 5949 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5950 for (Instruction &I : BB->instructionsWithoutDebug()) { 5951 IdxToInstr.push_back(&I); 5952 5953 // Save the end location of each USE. 5954 for (Value *U : I.operands()) { 5955 auto *Instr = dyn_cast<Instruction>(U); 5956 5957 // Ignore non-instruction values such as arguments, constants, etc. 5958 if (!Instr) 5959 continue; 5960 5961 // If this instruction is outside the loop then record it and continue. 5962 if (!TheLoop->contains(Instr)) { 5963 LoopInvariants.insert(Instr); 5964 continue; 5965 } 5966 5967 // Overwrite previous end points. 5968 EndPoint[Instr] = IdxToInstr.size(); 5969 Ends.insert(Instr); 5970 } 5971 } 5972 } 5973 5974 // Saves the list of intervals that end with the index in 'key'. 5975 using InstrList = SmallVector<Instruction *, 2>; 5976 DenseMap<unsigned, InstrList> TransposeEnds; 5977 5978 // Transpose the EndPoints to a list of values that end at each index. 
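// Illustrative example (hypothetical values): if EndPoint holds
// {%a -> 3, %b -> 3, %c -> 5}, the transposed map built below is
// {3 -> [%a, %b], 5 -> [%c]}, meaning the intervals of %a and %b are closed
// when the linear scan reaches instruction index 3.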
5979 for (auto &Interval : EndPoint) 5980 TransposeEnds[Interval.second].push_back(Interval.first); 5981 5982 SmallPtrSet<Instruction *, 8> OpenIntervals; 5983 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5984 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5985 5986 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5987 5988 auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned { 5989 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 5990 return 0; 5991 return TTI.getRegUsageForType(VectorType::get(Ty, VF)); 5992 }; 5993 5994 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5995 Instruction *I = IdxToInstr[i]; 5996 5997 // Remove all of the instructions that end at this location. 5998 InstrList &List = TransposeEnds[i]; 5999 for (Instruction *ToRemove : List) 6000 OpenIntervals.erase(ToRemove); 6001 6002 // Ignore instructions that are never used within the loop. 6003 if (!Ends.count(I)) 6004 continue; 6005 6006 // Skip ignored values. 6007 if (ValuesToIgnore.count(I)) 6008 continue; 6009 6010 // For each VF find the maximum usage of registers. 6011 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6012 // Count the number of live intervals. 6013 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6014 6015 if (VFs[j].isScalar()) { 6016 for (auto Inst : OpenIntervals) { 6017 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6018 if (RegUsage.find(ClassID) == RegUsage.end()) 6019 RegUsage[ClassID] = 1; 6020 else 6021 RegUsage[ClassID] += 1; 6022 } 6023 } else { 6024 collectUniformsAndScalars(VFs[j]); 6025 for (auto Inst : OpenIntervals) { 6026 // Skip ignored values for VF > 1. 6027 if (VecValuesToIgnore.count(Inst)) 6028 continue; 6029 if (isScalarAfterVectorization(Inst, VFs[j])) { 6030 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6031 if (RegUsage.find(ClassID) == RegUsage.end()) 6032 RegUsage[ClassID] = 1; 6033 else 6034 RegUsage[ClassID] += 1; 6035 } else { 6036 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6037 if (RegUsage.find(ClassID) == RegUsage.end()) 6038 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6039 else 6040 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6041 } 6042 } 6043 } 6044 6045 for (auto& pair : RegUsage) { 6046 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6047 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6048 else 6049 MaxUsages[j][pair.first] = pair.second; 6050 } 6051 } 6052 6053 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6054 << OpenIntervals.size() << '\n'); 6055 6056 // Add the current instruction to the list of open intervals. 6057 OpenIntervals.insert(I); 6058 } 6059 6060 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6061 SmallMapVector<unsigned, unsigned, 4> Invariant; 6062 6063 for (auto Inst : LoopInvariants) { 6064 unsigned Usage = 6065 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6066 unsigned ClassID = 6067 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6068 if (Invariant.find(ClassID) == Invariant.end()) 6069 Invariant[ClassID] = Usage; 6070 else 6071 Invariant[ClassID] += Usage; 6072 } 6073 6074 LLVM_DEBUG({ 6075 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6076 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6077 << " item\n"; 6078 for (const auto &pair : MaxUsages[i]) { 6079 dbgs() << "LV(REG): RegisterClass: " 6080 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6081 << " registers\n"; 6082 } 6083 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6084 << " item\n"; 6085 for (const auto &pair : Invariant) { 6086 dbgs() << "LV(REG): RegisterClass: " 6087 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6088 << " registers\n"; 6089 } 6090 }); 6091 6092 RU.LoopInvariantRegs = Invariant; 6093 RU.MaxLocalUsers = MaxUsages[i]; 6094 RUs[i] = RU; 6095 } 6096 6097 return RUs; 6098 } 6099 6100 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 6101 ElementCount VF) { 6102 // TODO: Cost model for emulated masked load/store is completely 6103 // broken. This hack guides the cost model to use an artificially 6104 // high enough value to practically disable vectorization with such 6105 // operations, except where previously deployed legality hack allowed 6106 // using very low cost values. This is to avoid regressions coming simply 6107 // from moving "masked load/store" check from legality to cost model. 6108 // Masked Load/Gather emulation was previously never allowed. 6109 // Limited number of Masked Store/Scatter emulation was allowed. 6110 assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction"); 6111 return isa<LoadInst>(I) || 6112 (isa<StoreInst>(I) && 6113 NumPredStores > NumberOfStoresToPredicate); 6114 } 6115 6116 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6117 // If we aren't vectorizing the loop, or if we've already collected the 6118 // instructions to scalarize, there's nothing to do. Collection may already 6119 // have occurred if we have a user-selected VF and are now computing the 6120 // expected cost for interleaving. 6121 if (VF.isScalar() || VF.isZero() || 6122 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6123 return; 6124 6125 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6126 // not profitable to scalarize any instructions, the presence of VF in the 6127 // map will indicate that we've analyzed it already. 6128 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6129 6130 // Find all the instructions that are scalar with predication in the loop and 6131 // determine if it would be better to not if-convert the blocks they are in. 6132 // If so, we also record the instructions to scalarize. 6133 for (BasicBlock *BB : TheLoop->blocks()) { 6134 if (!blockNeedsPredicationForAnyReason(BB)) 6135 continue; 6136 for (Instruction &I : *BB) 6137 if (isScalarWithPredication(&I, VF)) { 6138 ScalarCostsTy ScalarCosts; 6139 // Do not apply discount if scalable, because that would lead to 6140 // invalid scalarization costs. 6141 // Do not apply discount logic if hacked cost is needed 6142 // for emulated masked memrefs. 
6143 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && 6144 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6145 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6146 // Remember that BB will remain after vectorization. 6147 PredicatedBBsAfterVectorization.insert(BB); 6148 } 6149 } 6150 } 6151 6152 int LoopVectorizationCostModel::computePredInstDiscount( 6153 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6154 assert(!isUniformAfterVectorization(PredInst, VF) && 6155 "Instruction marked uniform-after-vectorization will be predicated"); 6156 6157 // Initialize the discount to zero, meaning that the scalar version and the 6158 // vector version cost the same. 6159 InstructionCost Discount = 0; 6160 6161 // Holds instructions to analyze. The instructions we visit are mapped in 6162 // ScalarCosts. Those instructions are the ones that would be scalarized if 6163 // we find that the scalar version costs less. 6164 SmallVector<Instruction *, 8> Worklist; 6165 6166 // Returns true if the given instruction can be scalarized. 6167 auto canBeScalarized = [&](Instruction *I) -> bool { 6168 // We only attempt to scalarize instructions forming a single-use chain 6169 // from the original predicated block that would otherwise be vectorized. 6170 // Although not strictly necessary, we give up on instructions we know will 6171 // already be scalar to avoid traversing chains that are unlikely to be 6172 // beneficial. 6173 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6174 isScalarAfterVectorization(I, VF)) 6175 return false; 6176 6177 // If the instruction is scalar with predication, it will be analyzed 6178 // separately. We ignore it within the context of PredInst. 6179 if (isScalarWithPredication(I, VF)) 6180 return false; 6181 6182 // If any of the instruction's operands are uniform after vectorization, 6183 // the instruction cannot be scalarized. This prevents, for example, a 6184 // masked load from being scalarized. 6185 // 6186 // We assume we will only emit a value for lane zero of an instruction 6187 // marked uniform after vectorization, rather than VF identical values. 6188 // Thus, if we scalarize an instruction that uses a uniform, we would 6189 // create uses of values corresponding to the lanes we aren't emitting code 6190 // for. This behavior can be changed by allowing getScalarValue to clone 6191 // the lane zero values for uniforms rather than asserting. 6192 for (Use &U : I->operands()) 6193 if (auto *J = dyn_cast<Instruction>(U.get())) 6194 if (isUniformAfterVectorization(J, VF)) 6195 return false; 6196 6197 // Otherwise, we can scalarize the instruction. 6198 return true; 6199 }; 6200 6201 // Compute the expected cost discount from scalarizing the entire expression 6202 // feeding the predicated instruction. We currently only consider expressions 6203 // that are single-use instruction chains. 6204 Worklist.push_back(PredInst); 6205 while (!Worklist.empty()) { 6206 Instruction *I = Worklist.pop_back_val(); 6207 6208 // If we've already analyzed the instruction, there's nothing to do. 6209 if (ScalarCosts.find(I) != ScalarCosts.end()) 6210 continue; 6211 6212 // Compute the cost of the vector instruction. Note that this cost already 6213 // includes the scalarization overhead of the predicated instruction. 6214 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6215 6216 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6217 // the instruction as if it wasn't if-converted and instead remained in the 6218 // predicated block. We will scale this cost by block probability after 6219 // computing the scalarization overhead. 6220 InstructionCost ScalarCost = 6221 VF.getFixedValue() * 6222 getInstructionCost(I, ElementCount::getFixed(1)).first; 6223 6224 // Compute the scalarization overhead of needed insertelement instructions 6225 // and phi nodes. 6226 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6227 ScalarCost += TTI.getScalarizationOverhead( 6228 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6229 APInt::getAllOnes(VF.getFixedValue()), true, false); 6230 ScalarCost += 6231 VF.getFixedValue() * 6232 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6233 } 6234 6235 // Compute the scalarization overhead of needed extractelement 6236 // instructions. For each of the instruction's operands, if the operand can 6237 // be scalarized, add it to the worklist; otherwise, account for the 6238 // overhead. 6239 for (Use &U : I->operands()) 6240 if (auto *J = dyn_cast<Instruction>(U.get())) { 6241 assert(VectorType::isValidElementType(J->getType()) && 6242 "Instruction has non-scalar type"); 6243 if (canBeScalarized(J)) 6244 Worklist.push_back(J); 6245 else if (needsExtract(J, VF)) { 6246 ScalarCost += TTI.getScalarizationOverhead( 6247 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6248 APInt::getAllOnes(VF.getFixedValue()), false, true); 6249 } 6250 } 6251 6252 // Scale the total scalar cost by block probability. 6253 ScalarCost /= getReciprocalPredBlockProb(); 6254 6255 // Compute the discount. A non-negative discount means the vector version 6256 // of the instruction costs more, and scalarizing would be beneficial. 6257 Discount += VectorCost - ScalarCost; 6258 ScalarCosts[I] = ScalarCost; 6259 } 6260 6261 return *Discount.getValue(); 6262 } 6263 6264 LoopVectorizationCostModel::VectorizationCostTy 6265 LoopVectorizationCostModel::expectedCost( 6266 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6267 VectorizationCostTy Cost; 6268 6269 // For each block. 6270 for (BasicBlock *BB : TheLoop->blocks()) { 6271 VectorizationCostTy BlockCost; 6272 6273 // For each instruction in the old loop. 6274 for (Instruction &I : BB->instructionsWithoutDebug()) { 6275 // Skip ignored values. 6276 if (ValuesToIgnore.count(&I) || 6277 (VF.isVector() && VecValuesToIgnore.count(&I))) 6278 continue; 6279 6280 VectorizationCostTy C = getInstructionCost(&I, VF); 6281 6282 // Check if we should override the cost. 6283 if (C.first.isValid() && 6284 ForceTargetInstructionCost.getNumOccurrences() > 0) 6285 C.first = InstructionCost(ForceTargetInstructionCost); 6286 6287 // Keep a list of instructions with invalid costs. 6288 if (Invalid && !C.first.isValid()) 6289 Invalid->emplace_back(&I, VF); 6290 6291 BlockCost.first += C.first; 6292 BlockCost.second |= C.second; 6293 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6294 << " for VF " << VF << " For instruction: " << I 6295 << '\n'); 6296 } 6297 6298 // If we are vectorizing a predicated block, it will have been 6299 // if-converted. This means that the block's instructions (aside from 6300 // stores and instructions that may divide by zero) will now be 6301 // unconditionally executed. For the scalar case, we may not always execute 6302 // the predicated block, if it is an if-else block. Thus, scale the block's 6303 // cost by the probability of executing it. 
blockNeedsPredication from 6304 // Legal is used so as to not include all blocks in tail folded loops. 6305 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6306 BlockCost.first /= getReciprocalPredBlockProb(); 6307 6308 Cost.first += BlockCost.first; 6309 Cost.second |= BlockCost.second; 6310 } 6311 6312 return Cost; 6313 } 6314 6315 /// Gets Address Access SCEV after verifying that the access pattern 6316 /// is loop invariant except the induction variable dependence. 6317 /// 6318 /// This SCEV can be sent to the Target in order to estimate the address 6319 /// calculation cost. 6320 static const SCEV *getAddressAccessSCEV( 6321 Value *Ptr, 6322 LoopVectorizationLegality *Legal, 6323 PredicatedScalarEvolution &PSE, 6324 const Loop *TheLoop) { 6325 6326 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6327 if (!Gep) 6328 return nullptr; 6329 6330 // We are looking for a gep with all loop invariant indices except for one 6331 // which should be an induction variable. 6332 auto SE = PSE.getSE(); 6333 unsigned NumOperands = Gep->getNumOperands(); 6334 for (unsigned i = 1; i < NumOperands; ++i) { 6335 Value *Opd = Gep->getOperand(i); 6336 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6337 !Legal->isInductionVariable(Opd)) 6338 return nullptr; 6339 } 6340 6341 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6342 return PSE.getSCEV(Ptr); 6343 } 6344 6345 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6346 return Legal->hasStride(I->getOperand(0)) || 6347 Legal->hasStride(I->getOperand(1)); 6348 } 6349 6350 InstructionCost 6351 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6352 ElementCount VF) { 6353 assert(VF.isVector() && 6354 "Scalarization cost of instruction implies vectorization."); 6355 if (VF.isScalable()) 6356 return InstructionCost::getInvalid(); 6357 6358 Type *ValTy = getLoadStoreType(I); 6359 auto SE = PSE.getSE(); 6360 6361 unsigned AS = getLoadStoreAddressSpace(I); 6362 Value *Ptr = getLoadStorePointerOperand(I); 6363 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6364 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6365 // that it is being called from this specific place. 6366 6367 // Figure out whether the access is strided and get the stride value 6368 // if it's known in compile time 6369 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6370 6371 // Get the cost of the scalar memory instruction and address computation. 6372 InstructionCost Cost = 6373 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6374 6375 // Don't pass *I here, since it is scalar but will actually be part of a 6376 // vectorized loop where the user of it is a vectorized instruction. 6377 const Align Alignment = getLoadStoreAlignment(I); 6378 Cost += VF.getKnownMinValue() * 6379 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6380 AS, TTI::TCK_RecipThroughput); 6381 6382 // Get the overhead of the extractelement and insertelement instructions 6383 // we might create due to scalarization. 6384 Cost += getScalarizationOverhead(I, VF); 6385 6386 // If we have a predicated load/store, it will need extra i1 extracts and 6387 // conditional branches, but may not be executed for each vector lane. Scale 6388 // the cost by the probability of executing the predicated block. 
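// Illustrative example (hypothetical costs, assuming a reciprocal block
// probability of 2, i.e. a 50% chance of executing the block): a summed
// per-lane scalarization cost of 40 is scaled down to 20 below, before the
// i1 extract and branch costs are added on top.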
6389 if (isPredicatedInst(I, VF)) { 6390 Cost /= getReciprocalPredBlockProb(); 6391 6392 // Add the cost of an i1 extract and a branch 6393 auto *Vec_i1Ty = 6394 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6395 Cost += TTI.getScalarizationOverhead( 6396 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6397 /*Insert=*/false, /*Extract=*/true); 6398 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6399 6400 if (useEmulatedMaskMemRefHack(I, VF)) 6401 // Artificially setting to a high enough value to practically disable 6402 // vectorization with such operations. 6403 Cost = 3000000; 6404 } 6405 6406 return Cost; 6407 } 6408 6409 InstructionCost 6410 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6411 ElementCount VF) { 6412 Type *ValTy = getLoadStoreType(I); 6413 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6414 Value *Ptr = getLoadStorePointerOperand(I); 6415 unsigned AS = getLoadStoreAddressSpace(I); 6416 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6417 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6418 6419 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6420 "Stride should be 1 or -1 for consecutive memory access"); 6421 const Align Alignment = getLoadStoreAlignment(I); 6422 InstructionCost Cost = 0; 6423 if (Legal->isMaskRequired(I)) 6424 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6425 CostKind); 6426 else 6427 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6428 CostKind, I); 6429 6430 bool Reverse = ConsecutiveStride < 0; 6431 if (Reverse) 6432 Cost += 6433 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6434 return Cost; 6435 } 6436 6437 InstructionCost 6438 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6439 ElementCount VF) { 6440 assert(Legal->isUniformMemOp(*I)); 6441 6442 Type *ValTy = getLoadStoreType(I); 6443 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6444 const Align Alignment = getLoadStoreAlignment(I); 6445 unsigned AS = getLoadStoreAddressSpace(I); 6446 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6447 if (isa<LoadInst>(I)) { 6448 return TTI.getAddressComputationCost(ValTy) + 6449 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6450 CostKind) + 6451 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6452 } 6453 StoreInst *SI = cast<StoreInst>(I); 6454 6455 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6456 return TTI.getAddressComputationCost(ValTy) + 6457 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6458 CostKind) + 6459 (isLoopInvariantStoreValue 6460 ? 
0 6461 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6462 VF.getKnownMinValue() - 1)); 6463 } 6464 6465 InstructionCost 6466 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6467 ElementCount VF) { 6468 Type *ValTy = getLoadStoreType(I); 6469 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6470 const Align Alignment = getLoadStoreAlignment(I); 6471 const Value *Ptr = getLoadStorePointerOperand(I); 6472 6473 return TTI.getAddressComputationCost(VectorTy) + 6474 TTI.getGatherScatterOpCost( 6475 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6476 TargetTransformInfo::TCK_RecipThroughput, I); 6477 } 6478 6479 InstructionCost 6480 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6481 ElementCount VF) { 6482 // TODO: Once we have support for interleaving with scalable vectors 6483 // we can calculate the cost properly here. 6484 if (VF.isScalable()) 6485 return InstructionCost::getInvalid(); 6486 6487 Type *ValTy = getLoadStoreType(I); 6488 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6489 unsigned AS = getLoadStoreAddressSpace(I); 6490 6491 auto Group = getInterleavedAccessGroup(I); 6492 assert(Group && "Fail to get an interleaved access group."); 6493 6494 unsigned InterleaveFactor = Group->getFactor(); 6495 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6496 6497 // Holds the indices of existing members in the interleaved group. 6498 SmallVector<unsigned, 4> Indices; 6499 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6500 if (Group->getMember(IF)) 6501 Indices.push_back(IF); 6502 6503 // Calculate the cost of the whole interleaved group. 6504 bool UseMaskForGaps = 6505 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6506 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6507 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6508 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6509 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6510 6511 if (Group->isReverse()) { 6512 // TODO: Add support for reversed masked interleaved access. 6513 assert(!Legal->isMaskRequired(I) && 6514 "Reverse masked interleaved access not supported."); 6515 Cost += 6516 Group->getNumMembers() * 6517 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6518 } 6519 return Cost; 6520 } 6521 6522 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( 6523 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6524 using namespace llvm::PatternMatch; 6525 // Early exit for no in-loop reductions. 6526 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6527 return None; 6528 auto *VectorTy = cast<VectorType>(Ty); 6529 6530 // We are looking for a pattern of, and finding the minimal acceptable cost: 6531 // reduce(mul(ext(A), ext(B))) or 6532 // reduce(mul(A, B)) or 6533 // reduce(ext(A)) or 6534 // reduce(A). 6535 // The basic idea is that we walk down the tree to do that, finding the root 6536 // reduction instruction in InLoopReductionImmediateChains. From there we find 6537 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6538 // of the components. If the reduction cost is lower, then we return it for the 6539 // reduction instruction and 0 for the other instructions in the pattern. If 6540 // it is not, we return an invalid cost specifying the original cost method 6541 // should be used.
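// Illustrative example (hypothetical IR): for an in-loop reduction such as
//   %m   = mul i32 %a, %b
//   %red = add i32 %phi, %m
// starting the walk below at %m steps to its single user %red, the chain
// root recorded in InLoopReductionImmediateChains; the cost of an
// add-of-mul reduction pattern is then weighed against the separate mul and
// reduction costs.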
6542 Instruction *RetI = I; 6543 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 6544 if (!RetI->hasOneUser()) 6545 return None; 6546 RetI = RetI->user_back(); 6547 } 6548 if (match(RetI, m_Mul(m_Value(), m_Value())) && 6549 RetI->user_back()->getOpcode() == Instruction::Add) { 6550 if (!RetI->hasOneUser()) 6551 return None; 6552 RetI = RetI->user_back(); 6553 } 6554 6555 // Test if the found instruction is a reduction, and if not return an invalid 6556 // cost specifying the parent to use the original cost modelling. 6557 if (!InLoopReductionImmediateChains.count(RetI)) 6558 return None; 6559 6560 // Find the reduction this chain is a part of and calculate the basic cost of 6561 // the reduction on its own. 6562 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6563 Instruction *ReductionPhi = LastChain; 6564 while (!isa<PHINode>(ReductionPhi)) 6565 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6566 6567 const RecurrenceDescriptor &RdxDesc = 6568 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6569 6570 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6571 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6572 6573 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6574 // normal fmul instruction to the cost of the fadd reduction. 6575 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6576 BaseCost += 6577 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6578 6579 // If we're using ordered reductions then we can just return the base cost 6580 // here, since getArithmeticReductionCost calculates the full ordered 6581 // reduction cost when FP reassociation is not allowed. 6582 if (useOrderedReductions(RdxDesc)) 6583 return BaseCost; 6584 6585 // Get the operand that was not the reduction chain and match it to one of the 6586 // patterns, returning the better cost if it is found. 6587 Instruction *RedOp = RetI->getOperand(1) == LastChain 6588 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6589 : dyn_cast<Instruction>(RetI->getOperand(1)); 6590 6591 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6592 6593 Instruction *Op0, *Op1; 6594 if (RedOp && 6595 match(RedOp, 6596 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6597 match(Op0, m_ZExtOrSExt(m_Value())) && 6598 Op0->getOpcode() == Op1->getOpcode() && 6599 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6600 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6601 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6602 6603 // Matched reduce(ext(mul(ext(A), ext(B))) 6604 // Note that the extend opcodes need to all match, or if A==B they will have 6605 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6606 // which is equally fine. 
6607 bool IsUnsigned = isa<ZExtInst>(Op0); 6608 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6609 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6610 6611 InstructionCost ExtCost = 6612 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6613 TTI::CastContextHint::None, CostKind, Op0); 6614 InstructionCost MulCost = 6615 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6616 InstructionCost Ext2Cost = 6617 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6618 TTI::CastContextHint::None, CostKind, RedOp); 6619 6620 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6621 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6622 CostKind); 6623 6624 if (RedCost.isValid() && 6625 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6626 return I == RetI ? RedCost : 0; 6627 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6628 !TheLoop->isLoopInvariant(RedOp)) { 6629 // Matched reduce(ext(A)) 6630 bool IsUnsigned = isa<ZExtInst>(RedOp); 6631 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6632 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6633 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6634 CostKind); 6635 6636 InstructionCost ExtCost = 6637 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6638 TTI::CastContextHint::None, CostKind, RedOp); 6639 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6640 return I == RetI ? RedCost : 0; 6641 } else if (RedOp && 6642 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6643 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6644 Op0->getOpcode() == Op1->getOpcode() && 6645 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6646 bool IsUnsigned = isa<ZExtInst>(Op0); 6647 Type *Op0Ty = Op0->getOperand(0)->getType(); 6648 Type *Op1Ty = Op1->getOperand(0)->getType(); 6649 Type *LargestOpTy = 6650 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6651 : Op0Ty; 6652 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6653 6654 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 6655 // different sizes. We take the largest type as the ext to reduce, and add 6656 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6657 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6658 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6659 TTI::CastContextHint::None, CostKind, Op0); 6660 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6661 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6662 TTI::CastContextHint::None, CostKind, Op1); 6663 InstructionCost MulCost = 6664 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6665 6666 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6667 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6668 CostKind); 6669 InstructionCost ExtraExtCost = 0; 6670 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6671 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6672 ExtraExtCost = TTI.getCastInstrCost( 6673 ExtraExtOp->getOpcode(), ExtType, 6674 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6675 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6676 } 6677 6678 if (RedCost.isValid() && 6679 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6680 return I == RetI ? 
RedCost : 0; 6681 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6682 // Matched reduce(mul()) 6683 InstructionCost MulCost = 6684 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6685 6686 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6687 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 6688 CostKind); 6689 6690 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6691 return I == RetI ? RedCost : 0; 6692 } 6693 } 6694 6695 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 6696 } 6697 6698 InstructionCost 6699 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6700 ElementCount VF) { 6701 // Calculate scalar cost only. Vectorization cost should be ready at this 6702 // moment. 6703 if (VF.isScalar()) { 6704 Type *ValTy = getLoadStoreType(I); 6705 const Align Alignment = getLoadStoreAlignment(I); 6706 unsigned AS = getLoadStoreAddressSpace(I); 6707 6708 return TTI.getAddressComputationCost(ValTy) + 6709 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6710 TTI::TCK_RecipThroughput, I); 6711 } 6712 return getWideningCost(I, VF); 6713 } 6714 6715 LoopVectorizationCostModel::VectorizationCostTy 6716 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6717 ElementCount VF) { 6718 // If we know that this instruction will remain uniform, check the cost of 6719 // the scalar version. 6720 if (isUniformAfterVectorization(I, VF)) 6721 VF = ElementCount::getFixed(1); 6722 6723 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6724 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6725 6726 // Forced scalars do not have any scalarization overhead. 6727 auto ForcedScalar = ForcedScalars.find(VF); 6728 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6729 auto InstSet = ForcedScalar->second; 6730 if (InstSet.count(I)) 6731 return VectorizationCostTy( 6732 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6733 VF.getKnownMinValue()), 6734 false); 6735 } 6736 6737 Type *VectorTy; 6738 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6739 6740 bool TypeNotScalarized = false; 6741 if (VF.isVector() && VectorTy->isVectorTy()) { 6742 unsigned NumParts = TTI.getNumberOfParts(VectorTy); 6743 if (NumParts) 6744 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 6745 else 6746 C = InstructionCost::getInvalid(); 6747 } 6748 return VectorizationCostTy(C, TypeNotScalarized); 6749 } 6750 6751 InstructionCost 6752 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6753 ElementCount VF) const { 6754 6755 // There is no mechanism yet to create a scalable scalarization loop, 6756 // so this is currently Invalid. 6757 if (VF.isScalable()) 6758 return InstructionCost::getInvalid(); 6759 6760 if (VF.isScalar()) 6761 return 0; 6762 6763 InstructionCost Cost = 0; 6764 Type *RetTy = ToVectorTy(I->getType(), VF); 6765 if (!RetTy->isVoidTy() && 6766 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6767 Cost += TTI.getScalarizationOverhead( 6768 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 6769 false); 6770 6771 // Some targets keep addresses scalar. 6772 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6773 return Cost; 6774 6775 // Some targets support efficient element stores. 6776 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6777 return Cost; 6778 6779 // Collect operands to consider. 6780 CallInst *CI = dyn_cast<CallInst>(I); 6781 Instruction::op_range Ops = CI ? 
CI->args() : I->operands(); 6782 6783 // Skip operands that do not require extraction/scalarization and do not incur 6784 // any overhead. 6785 SmallVector<Type *> Tys; 6786 for (auto *V : filterExtractingOperands(Ops, VF)) 6787 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 6788 return Cost + TTI.getOperandsScalarizationOverhead( 6789 filterExtractingOperands(Ops, VF), Tys); 6790 } 6791 6792 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6793 if (VF.isScalar()) 6794 return; 6795 NumPredStores = 0; 6796 for (BasicBlock *BB : TheLoop->blocks()) { 6797 // For each instruction in the old loop. 6798 for (Instruction &I : *BB) { 6799 Value *Ptr = getLoadStorePointerOperand(&I); 6800 if (!Ptr) 6801 continue; 6802 6803 // TODO: We should generate better code and update the cost model for 6804 // predicated uniform stores. Today they are treated as any other 6805 // predicated store (see added test cases in 6806 // invariant-store-vectorization.ll). 6807 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 6808 NumPredStores++; 6809 6810 if (Legal->isUniformMemOp(I)) { 6811 // TODO: Avoid replicating loads and stores instead of 6812 // relying on instcombine to remove them. 6813 // Load: Scalar load + broadcast 6814 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6815 InstructionCost Cost; 6816 if (isa<StoreInst>(&I) && VF.isScalable() && 6817 isLegalGatherOrScatter(&I, VF)) { 6818 Cost = getGatherScatterCost(&I, VF); 6819 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 6820 } else { 6821 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 6822 "Cannot yet scalarize uniform stores"); 6823 Cost = getUniformMemOpCost(&I, VF); 6824 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6825 } 6826 continue; 6827 } 6828 6829 // We assume that widening is the best solution when possible. 6830 if (memoryInstructionCanBeWidened(&I, VF)) { 6831 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6832 int ConsecutiveStride = Legal->isConsecutivePtr( 6833 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 6834 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6835 "Expected consecutive stride."); 6836 InstWidening Decision = 6837 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6838 setWideningDecision(&I, VF, Decision, Cost); 6839 continue; 6840 } 6841 6842 // Choose between Interleaving, Gather/Scatter or Scalarization. 6843 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 6844 unsigned NumAccesses = 1; 6845 if (isAccessInterleaved(&I)) { 6846 auto Group = getInterleavedAccessGroup(&I); 6847 assert(Group && "Fail to get an interleaved access group."); 6848 6849 // Make one decision for the whole group. 6850 if (getWideningDecision(&I, VF) != CM_Unknown) 6851 continue; 6852 6853 NumAccesses = Group->getNumMembers(); 6854 if (interleavedAccessCanBeWidened(&I, VF)) 6855 InterleaveCost = getInterleaveGroupCost(&I, VF); 6856 } 6857 6858 InstructionCost GatherScatterCost = 6859 isLegalGatherOrScatter(&I, VF) 6860 ? getGatherScatterCost(&I, VF) * NumAccesses 6861 : InstructionCost::getInvalid(); 6862 6863 InstructionCost ScalarizationCost = 6864 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6865 6866 // Choose better solution for the current VF, 6867 // write down this decision and use it during vectorization. 
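// Illustrative example (hypothetical costs): InterleaveCost = 8, an invalid
// GatherScatterCost and ScalarizationCost = 20 select CM_Interleave below
// with cost 8; if both the interleave and gather/scatter costs are invalid,
// scalarization is chosen even when its cost is the artificially high
// emulated-mask value.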
6868 InstructionCost Cost; 6869 InstWidening Decision; 6870 if (InterleaveCost <= GatherScatterCost && 6871 InterleaveCost < ScalarizationCost) { 6872 Decision = CM_Interleave; 6873 Cost = InterleaveCost; 6874 } else if (GatherScatterCost < ScalarizationCost) { 6875 Decision = CM_GatherScatter; 6876 Cost = GatherScatterCost; 6877 } else { 6878 Decision = CM_Scalarize; 6879 Cost = ScalarizationCost; 6880 } 6881 // If the instructions belongs to an interleave group, the whole group 6882 // receives the same decision. The whole group receives the cost, but 6883 // the cost will actually be assigned to one instruction. 6884 if (auto Group = getInterleavedAccessGroup(&I)) 6885 setWideningDecision(Group, VF, Decision, Cost); 6886 else 6887 setWideningDecision(&I, VF, Decision, Cost); 6888 } 6889 } 6890 6891 // Make sure that any load of address and any other address computation 6892 // remains scalar unless there is gather/scatter support. This avoids 6893 // inevitable extracts into address registers, and also has the benefit of 6894 // activating LSR more, since that pass can't optimize vectorized 6895 // addresses. 6896 if (TTI.prefersVectorizedAddressing()) 6897 return; 6898 6899 // Start with all scalar pointer uses. 6900 SmallPtrSet<Instruction *, 8> AddrDefs; 6901 for (BasicBlock *BB : TheLoop->blocks()) 6902 for (Instruction &I : *BB) { 6903 Instruction *PtrDef = 6904 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6905 if (PtrDef && TheLoop->contains(PtrDef) && 6906 getWideningDecision(&I, VF) != CM_GatherScatter) 6907 AddrDefs.insert(PtrDef); 6908 } 6909 6910 // Add all instructions used to generate the addresses. 6911 SmallVector<Instruction *, 4> Worklist; 6912 append_range(Worklist, AddrDefs); 6913 while (!Worklist.empty()) { 6914 Instruction *I = Worklist.pop_back_val(); 6915 for (auto &Op : I->operands()) 6916 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6917 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6918 AddrDefs.insert(InstOp).second) 6919 Worklist.push_back(InstOp); 6920 } 6921 6922 for (auto *I : AddrDefs) { 6923 if (isa<LoadInst>(I)) { 6924 // Setting the desired widening decision should ideally be handled in 6925 // by cost functions, but since this involves the task of finding out 6926 // if the loaded register is involved in an address computation, it is 6927 // instead changed here when we know this is the case. 6928 InstWidening Decision = getWideningDecision(I, VF); 6929 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6930 // Scalarize a widened load of address. 6931 setWideningDecision( 6932 I, VF, CM_Scalarize, 6933 (VF.getKnownMinValue() * 6934 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6935 else if (auto Group = getInterleavedAccessGroup(I)) { 6936 // Scalarize an interleave group of address loads. 6937 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6938 if (Instruction *Member = Group->getMember(I)) 6939 setWideningDecision( 6940 Member, VF, CM_Scalarize, 6941 (VF.getKnownMinValue() * 6942 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6943 } 6944 } 6945 } else 6946 // Make sure I gets scalarized and a cost estimate without 6947 // scalarization overhead. 
      ForcedScalars[VF].insert(I);
  }
}

InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
                                               Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  auto SE = PSE.getSE();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
                                                ElementCount VF) -> bool {
    if (VF.isScalar())
      return true;

    auto Scalarized = InstsToScalarize.find(VF);
    assert(Scalarized != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return !Scalarized->second.count(I) &&
           llvm::all_of(I->users(), [&](User *U) {
             auto *UI = cast<Instruction>(U);
             return !Scalarized->second.count(UI);
           });
  };
  (void) hasSingleCopyAfterVectorization;

  if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           hasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = ToVectorTy(RetTy, VF);

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF.isVector() && BI->isConditional() &&
        (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
         PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // It is not possible to scalarize a scalable vector with predicated
      // instructions.
      if (VF.isScalable())
        return InstructionCost::getInvalid();
      // Return cost for branches around scalarized and predicated blocks.
      auto *Vec_i1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (
          TTI.getScalarizationOverhead(
              Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
          (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br, CostKind);
    else
      // This branch will be eliminated by if-conversion.
7027 return 0; 7028 // Note: We currently assume zero cost for an unconditional branch inside 7029 // a predicated block since it will become a fall-through, although we 7030 // may decide in the future to call TTI for all branches. 7031 } 7032 case Instruction::PHI: { 7033 auto *Phi = cast<PHINode>(I); 7034 7035 // First-order recurrences are replaced by vector shuffles inside the loop. 7036 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7037 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7038 return TTI.getShuffleCost( 7039 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7040 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7041 7042 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7043 // converted into select instructions. We require N - 1 selects per phi 7044 // node, where N is the number of incoming values. 7045 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7046 return (Phi->getNumIncomingValues() - 1) * 7047 TTI.getCmpSelInstrCost( 7048 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7049 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7050 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7051 7052 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7053 } 7054 case Instruction::UDiv: 7055 case Instruction::SDiv: 7056 case Instruction::URem: 7057 case Instruction::SRem: 7058 // If we have a predicated instruction, it may not be executed for each 7059 // vector lane. Get the scalarization cost and scale this amount by the 7060 // probability of executing the predicated block. If the instruction is not 7061 // predicated, we fall through to the next case. 7062 if (VF.isVector() && isScalarWithPredication(I, VF)) { 7063 InstructionCost Cost = 0; 7064 7065 // These instructions have a non-void type, so account for the phi nodes 7066 // that we will create. This cost is likely to be zero. The phi node 7067 // cost, if any, should be scaled by the block probability because it 7068 // models a copy at the end of each predicated block. 7069 Cost += VF.getKnownMinValue() * 7070 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7071 7072 // The cost of the non-predicated instruction. 7073 Cost += VF.getKnownMinValue() * 7074 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7075 7076 // The cost of insertelement and extractelement instructions needed for 7077 // scalarization. 7078 Cost += getScalarizationOverhead(I, VF); 7079 7080 // Scale the cost by the probability of executing the predicated blocks. 7081 // This assumes the predicated block for each vector lane is equally 7082 // likely. 7083 return Cost / getReciprocalPredBlockProb(); 7084 } 7085 LLVM_FALLTHROUGH; 7086 case Instruction::Add: 7087 case Instruction::FAdd: 7088 case Instruction::Sub: 7089 case Instruction::FSub: 7090 case Instruction::Mul: 7091 case Instruction::FMul: 7092 case Instruction::FDiv: 7093 case Instruction::FRem: 7094 case Instruction::Shl: 7095 case Instruction::LShr: 7096 case Instruction::AShr: 7097 case Instruction::And: 7098 case Instruction::Or: 7099 case Instruction::Xor: { 7100 // Since we will replace the stride by 1 the multiplication should go away. 
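    // Illustrative example: an address computed as "i * %stride" effectively
    // becomes "i * 1" once the loop has been versioned for a unit stride, so
    // the multiply folds away and is treated as free here.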
    if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
      return 0;

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
      return *RedCost;

    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
    Value *Op2 = I->getOperand(1);
    TargetTransformInfo::OperandValueProperties Op2VP;
    TargetTransformInfo::OperandValueKind Op2VK =
        TTI.getOperandInfo(Op2, Op2VP);
    if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
      Op2VK = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    return TTI.getArithmeticInstrCost(
        I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
        Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
  }
  case Instruction::FNeg: {
    return TTI.getArithmeticInstrCost(
        I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
        TargetTransformInfo::OP_None, I->getOperand(0), I);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));

    const Value *Op0, *Op1;
    using namespace llvm::PatternMatch;
    if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
                        match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
      // select x, y, false --> x & y
      // select x, true, y --> x | y
      TTI::OperandValueProperties Op1VP = TTI::OP_None;
      TTI::OperandValueProperties Op2VP = TTI::OP_None;
      TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
      TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
      assert(Op0->getType()->getScalarSizeInBits() == 1 &&
             Op1->getType()->getScalarSizeInBits() == 1);

      SmallVector<const Value *, 2> Operands{Op0, Op1};
      return TTI.getArithmeticInstrCost(
          match(I, m_LogicalOr()) ?
Instruction::Or : Instruction::And, VectorTy, 7149 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7150 } 7151 7152 Type *CondTy = SI->getCondition()->getType(); 7153 if (!ScalarCond) 7154 CondTy = VectorType::get(CondTy, VF); 7155 7156 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7157 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7158 Pred = Cmp->getPredicate(); 7159 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7160 CostKind, I); 7161 } 7162 case Instruction::ICmp: 7163 case Instruction::FCmp: { 7164 Type *ValTy = I->getOperand(0)->getType(); 7165 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7166 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7167 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7168 VectorTy = ToVectorTy(ValTy, VF); 7169 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7170 cast<CmpInst>(I)->getPredicate(), CostKind, 7171 I); 7172 } 7173 case Instruction::Store: 7174 case Instruction::Load: { 7175 ElementCount Width = VF; 7176 if (Width.isVector()) { 7177 InstWidening Decision = getWideningDecision(I, Width); 7178 assert(Decision != CM_Unknown && 7179 "CM decision should be taken at this point"); 7180 if (Decision == CM_Scalarize) 7181 Width = ElementCount::getFixed(1); 7182 } 7183 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7184 return getMemoryInstructionCost(I, VF); 7185 } 7186 case Instruction::BitCast: 7187 if (I->getType()->isPointerTy()) 7188 return 0; 7189 LLVM_FALLTHROUGH; 7190 case Instruction::ZExt: 7191 case Instruction::SExt: 7192 case Instruction::FPToUI: 7193 case Instruction::FPToSI: 7194 case Instruction::FPExt: 7195 case Instruction::PtrToInt: 7196 case Instruction::IntToPtr: 7197 case Instruction::SIToFP: 7198 case Instruction::UIToFP: 7199 case Instruction::Trunc: 7200 case Instruction::FPTrunc: { 7201 // Computes the CastContextHint from a Load/Store instruction. 7202 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7203 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7204 "Expected a load or a store!"); 7205 7206 if (VF.isScalar() || !TheLoop->contains(I)) 7207 return TTI::CastContextHint::Normal; 7208 7209 switch (getWideningDecision(I, VF)) { 7210 case LoopVectorizationCostModel::CM_GatherScatter: 7211 return TTI::CastContextHint::GatherScatter; 7212 case LoopVectorizationCostModel::CM_Interleave: 7213 return TTI::CastContextHint::Interleave; 7214 case LoopVectorizationCostModel::CM_Scalarize: 7215 case LoopVectorizationCostModel::CM_Widen: 7216 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7217 : TTI::CastContextHint::Normal; 7218 case LoopVectorizationCostModel::CM_Widen_Reverse: 7219 return TTI::CastContextHint::Reversed; 7220 case LoopVectorizationCostModel::CM_Unknown: 7221 llvm_unreachable("Instr did not go through cost modelling?"); 7222 } 7223 7224 llvm_unreachable("Unhandled case!"); 7225 }; 7226 7227 unsigned Opcode = I->getOpcode(); 7228 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7229 // For Trunc, the context is the only user, which must be a StoreInst. 7230 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7231 if (I->hasOneUse()) 7232 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7233 CCH = ComputeCCH(Store); 7234 } 7235 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7236 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7237 Opcode == Instruction::FPExt) { 7238 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7239 CCH = ComputeCCH(Load); 7240 } 7241 7242 // We optimize the truncation of induction variables having constant 7243 // integer steps. The cost of these truncations is the same as the scalar 7244 // operation. 7245 if (isOptimizableIVTruncate(I, VF)) { 7246 auto *Trunc = cast<TruncInst>(I); 7247 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7248 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7249 } 7250 7251 // Detect reduction patterns 7252 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7253 return *RedCost; 7254 7255 Type *SrcScalarTy = I->getOperand(0)->getType(); 7256 Type *SrcVecTy = 7257 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7258 if (canTruncateToMinimalBitwidth(I, VF)) { 7259 // This cast is going to be shrunk. This may remove the cast or it might 7260 // turn it into slightly different cast. For example, if MinBW == 16, 7261 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7262 // 7263 // Calculate the modified src and dest types. 7264 Type *MinVecTy = VectorTy; 7265 if (Opcode == Instruction::Trunc) { 7266 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7267 VectorTy = 7268 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7269 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7270 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7271 VectorTy = 7272 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7273 } 7274 } 7275 7276 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7277 } 7278 case Instruction::Call: { 7279 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7280 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7281 return *RedCost; 7282 bool NeedToScalarize; 7283 CallInst *CI = cast<CallInst>(I); 7284 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7285 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7286 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7287 return std::min(CallCost, IntrinsicCost); 7288 } 7289 return CallCost; 7290 } 7291 case Instruction::ExtractValue: 7292 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7293 case Instruction::Alloca: 7294 // We cannot easily widen alloca to a scalable alloca, as 7295 // the result would need to be a vector of pointers. 7296 if (VF.isScalable()) 7297 return InstructionCost::getInvalid(); 7298 LLVM_FALLTHROUGH; 7299 default: 7300 // This opcode is unknown. Assume that it is the same as 'mul'. 7301 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7302 } // end of switch. 
7303 } 7304 7305 char LoopVectorize::ID = 0; 7306 7307 static const char lv_name[] = "Loop Vectorization"; 7308 7309 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7310 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7311 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7312 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7313 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7314 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7315 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7316 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7317 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7318 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7319 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7320 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7321 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7322 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7323 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7324 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7325 7326 namespace llvm { 7327 7328 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7329 7330 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7331 bool VectorizeOnlyWhenForced) { 7332 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7333 } 7334 7335 } // end namespace llvm 7336 7337 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7338 // Check if the pointer operand of a load or store instruction is 7339 // consecutive. 7340 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7341 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7342 return false; 7343 } 7344 7345 void LoopVectorizationCostModel::collectValuesToIgnore() { 7346 // Ignore ephemeral values. 7347 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7348 7349 // Find all stores to invariant variables. Since they are going to sink 7350 // outside the loop we do not need calculate cost for them. 7351 for (BasicBlock *BB : TheLoop->blocks()) 7352 for (Instruction &I : *BB) { 7353 StoreInst *SI; 7354 if ((SI = dyn_cast<StoreInst>(&I)) && 7355 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 7356 ValuesToIgnore.insert(&I); 7357 } 7358 7359 // Ignore type-promoting instructions we identified during reduction 7360 // detection. 7361 for (auto &Reduction : Legal->getReductionVars()) { 7362 const RecurrenceDescriptor &RedDes = Reduction.second; 7363 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7364 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7365 } 7366 // Ignore type-casting instructions we identified during induction 7367 // detection. 7368 for (auto &Induction : Legal->getInductionVars()) { 7369 const InductionDescriptor &IndDes = Induction.second; 7370 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7371 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7372 } 7373 } 7374 7375 void LoopVectorizationCostModel::collectInLoopReductions() { 7376 for (auto &Reduction : Legal->getReductionVars()) { 7377 PHINode *Phi = Reduction.first; 7378 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7379 7380 // We don't collect reductions that are type promoted (yet). 7381 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7382 continue; 7383 7384 // If the target would prefer this reduction to happen "in-loop", then we 7385 // want to record it as such. 
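    // An in-loop reduction accumulates into a scalar value on every iteration
    // (e.g. using a vector.reduce.* call inside the loop), whereas the default
    // out-of-loop form keeps a vector accumulator that is reduced once after
    // the loop.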
7386 unsigned Opcode = RdxDesc.getOpcode(); 7387 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7388 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7389 TargetTransformInfo::ReductionFlags())) 7390 continue; 7391 7392 // Check that we can correctly put the reductions into the loop, by 7393 // finding the chain of operations that leads from the phi to the loop 7394 // exit value. 7395 SmallVector<Instruction *, 4> ReductionOperations = 7396 RdxDesc.getReductionOpChain(Phi, TheLoop); 7397 bool InLoop = !ReductionOperations.empty(); 7398 if (InLoop) { 7399 InLoopReductionChains[Phi] = ReductionOperations; 7400 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7401 Instruction *LastChain = Phi; 7402 for (auto *I : ReductionOperations) { 7403 InLoopReductionImmediateChains[I] = LastChain; 7404 LastChain = I; 7405 } 7406 } 7407 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7408 << " reduction for phi: " << *Phi << "\n"); 7409 } 7410 } 7411 7412 // TODO: we could return a pair of values that specify the max VF and 7413 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7414 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7415 // doesn't have a cost model that can choose which plan to execute if 7416 // more than one is generated. 7417 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7418 LoopVectorizationCostModel &CM) { 7419 unsigned WidestType; 7420 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7421 return WidestVectorRegBits / WidestType; 7422 } 7423 7424 VectorizationFactor 7425 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7426 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7427 ElementCount VF = UserVF; 7428 // Outer loop handling: They may require CFG and instruction level 7429 // transformations before even evaluating whether vectorization is profitable. 7430 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7431 // the vectorization pipeline. 7432 if (!OrigLoop->isInnermost()) { 7433 // If the user doesn't provide a vectorization factor, determine a 7434 // reasonable one. 7435 if (UserVF.isZero()) { 7436 VF = ElementCount::getFixed(determineVPlanVF( 7437 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7438 .getFixedSize(), 7439 CM)); 7440 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7441 7442 // Make sure we have a VF > 1 for stress testing. 7443 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7444 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7445 << "overriding computed VF.\n"); 7446 VF = ElementCount::getFixed(4); 7447 } 7448 } 7449 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7450 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7451 "VF needs to be a power of two"); 7452 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7453 << "VF " << VF << " to build VPlans.\n"); 7454 buildVPlans(VF, VF); 7455 7456 // For VPlan build stress testing, we bail out after VPlan construction. 7457 if (VPlanBuildStressTest) 7458 return VectorizationFactor::Disabled(); 7459 7460 return {VF, 0 /*Cost*/}; 7461 } 7462 7463 LLVM_DEBUG( 7464 dbgs() << "LV: Not vectorizing. 
Inner loops aren't supported in the " 7465 "VPlan-native path.\n"); 7466 return VectorizationFactor::Disabled(); 7467 } 7468 7469 bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const { 7470 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7471 return (NumRuntimePointerChecks > 7472 VectorizerParams::RuntimeMemoryCheckThreshold && 7473 !Hints.allowReordering()) || 7474 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7475 } 7476 7477 Optional<VectorizationFactor> 7478 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7479 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7480 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7481 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7482 return None; 7483 7484 // Invalidate interleave groups if all blocks of loop will be predicated. 7485 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7486 !useMaskedInterleavedAccesses(*TTI)) { 7487 LLVM_DEBUG( 7488 dbgs() 7489 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7490 "which requires masked-interleaved support.\n"); 7491 if (CM.InterleaveInfo.invalidateGroups()) 7492 // Invalidating interleave groups also requires invalidating all decisions 7493 // based on them, which includes widening decisions and uniform and scalar 7494 // values. 7495 CM.invalidateCostModelingDecisions(); 7496 } 7497 7498 ElementCount MaxUserVF = 7499 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7500 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7501 if (!UserVF.isZero() && UserVFIsLegal) { 7502 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7503 "VF needs to be a power of two"); 7504 // Collect the instructions (and their associated costs) that will be more 7505 // profitable to scalarize. 7506 if (CM.selectUserVectorizationFactor(UserVF)) { 7507 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7508 CM.collectInLoopReductions(); 7509 buildVPlansWithVPRecipes(UserVF, UserVF); 7510 LLVM_DEBUG(printPlans(dbgs())); 7511 return {{UserVF, 0}}; 7512 } else 7513 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7514 "InvalidCost", ORE, OrigLoop); 7515 } 7516 7517 // Populate the set of Vectorization Factor Candidates. 7518 ElementCountSet VFCandidates; 7519 for (auto VF = ElementCount::getFixed(1); 7520 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7521 VFCandidates.insert(VF); 7522 for (auto VF = ElementCount::getScalable(1); 7523 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7524 VFCandidates.insert(VF); 7525 7526 for (const auto &VF : VFCandidates) { 7527 // Collect Uniform and Scalar instructions after vectorization with VF. 7528 CM.collectUniformsAndScalars(VF); 7529 7530 // Collect the instructions (and their associated costs) that will be more 7531 // profitable to scalarize. 7532 if (VF.isVector()) 7533 CM.collectInstsToScalarize(VF); 7534 } 7535 7536 CM.collectInLoopReductions(); 7537 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7538 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7539 7540 LLVM_DEBUG(printPlans(dbgs())); 7541 if (!MaxFactors.hasVector()) 7542 return VectorizationFactor::Disabled(); 7543 7544 // Select the optimal vectorization factor. 
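  // Roughly: the expected cost of each candidate VF (including the scalar VF)
  // is computed and the most profitable one is chosen; see
  // selectVectorizationFactor for the exact comparison.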
7545 return CM.selectVectorizationFactor(VFCandidates); 7546 } 7547 7548 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7549 assert(count_if(VPlans, 7550 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7551 1 && 7552 "Best VF has not a single VPlan."); 7553 7554 for (const VPlanPtr &Plan : VPlans) { 7555 if (Plan->hasVF(VF)) 7556 return *Plan.get(); 7557 } 7558 llvm_unreachable("No plan found!"); 7559 } 7560 7561 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7562 SmallVector<Metadata *, 4> MDs; 7563 // Reserve first location for self reference to the LoopID metadata node. 7564 MDs.push_back(nullptr); 7565 bool IsUnrollMetadata = false; 7566 MDNode *LoopID = L->getLoopID(); 7567 if (LoopID) { 7568 // First find existing loop unrolling disable metadata. 7569 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7570 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7571 if (MD) { 7572 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7573 IsUnrollMetadata = 7574 S && S->getString().startswith("llvm.loop.unroll.disable"); 7575 } 7576 MDs.push_back(LoopID->getOperand(i)); 7577 } 7578 } 7579 7580 if (!IsUnrollMetadata) { 7581 // Add runtime unroll disable metadata. 7582 LLVMContext &Context = L->getHeader()->getContext(); 7583 SmallVector<Metadata *, 1> DisableOperands; 7584 DisableOperands.push_back( 7585 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7586 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7587 MDs.push_back(DisableNode); 7588 MDNode *NewLoopID = MDNode::get(Context, MDs); 7589 // Set operand 0 to refer to the loop id itself. 7590 NewLoopID->replaceOperandWith(0, NewLoopID); 7591 L->setLoopID(NewLoopID); 7592 } 7593 } 7594 7595 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7596 VPlan &BestVPlan, 7597 InnerLoopVectorizer &ILV, 7598 DominatorTree *DT) { 7599 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7600 << '\n'); 7601 7602 // Perform the actual loop transformation. 7603 7604 // 1. Set up the skeleton for vectorization, including vector pre-header and 7605 // middle block. The vector loop is created during VPlan execution. 7606 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7607 Value *CanonicalIVStartValue; 7608 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7609 ILV.createVectorizedLoopSkeleton(); 7610 ILV.collectPoisonGeneratingRecipes(State); 7611 7612 ILV.printDebugTracesAtStart(); 7613 7614 //===------------------------------------------------===// 7615 // 7616 // Notice: any optimization or new instruction that go 7617 // into the code below should also be implemented in 7618 // the cost-model. 7619 // 7620 //===------------------------------------------------===// 7621 7622 // 2. Copy and widen instructions from the old loop into the new loop. 7623 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), 7624 ILV.getOrCreateVectorTripCount(nullptr), 7625 CanonicalIVStartValue, State); 7626 BestVPlan.execute(&State); 7627 7628 // Keep all loop hints from the original loop on the vector loop (we'll 7629 // replace the vectorizer-specific hints below). 
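  // If the original loop specifies follow-up metadata for the vectorized loop
  // (e.g. via "llvm.loop.vectorize.followup_vectorized"), that metadata takes
  // precedence over simply copying the original hints below.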
  MDNode *OrigLoopID = OrigLoop->getLoopID();

  Optional<MDNode *> VectorizedLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupVectorized});

  VPBasicBlock *HeaderVPBB =
      BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
  Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
  if (VectorizedLoopID.hasValue())
    L->setLoopID(VectorizedLoopID.getValue());
  else {
    // Keep all loop hints from the original loop on the vector loop (we'll
    // replace the vectorizer-specific hints below).
    if (MDNode *LID = OrigLoop->getLoopID())
      L->setLoopID(LID);

    LoopVectorizeHints Hints(L, true, *ORE);
    Hints.setAlreadyVectorized();
  }
  // Disable runtime unrolling when vectorizing the epilogue loop.
  if (CanonicalIVStartValue)
    AddRuntimeUnrollDisableMetaData(L);

  // 3. Fix the vectorized code: take care of header phi's, live-outs,
  // predication, updating analyses.
  ILV.fixVectorizedLoop(State, BestVPlan);

  ILV.printDebugTracesAtEnd();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
  for (const auto &Plan : VPlans)
    if (PrintVPlansInDotFormat)
      Plan->printDOT(O);
    else
      Plan->print(O);
}
#endif

void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {

  // We create new control-flow for the vectorized loop, so an original exit
  // condition will be dead after vectorization if it is only used by the
  // terminator.
  SmallVector<BasicBlock*> ExitingBlocks;
  OrigLoop->getExitingBlocks(ExitingBlocks);
  for (auto *BB : ExitingBlocks) {
    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
    if (!Cmp || !Cmp->hasOneUse())
      continue;

    // TODO: we should introduce a getUniqueExitingBlocks on Loop
    if (!DeadInstructions.insert(Cmp).second)
      continue;

    // An operand of the icmp is often a dead trunc, used by IndUpdate.
    // TODO: can recurse through operands in general
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
    }
  }

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  auto *Latch = OrigLoop->getLoopLatch();
  for (auto &Induction : Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
7706 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7707 continue; 7708 7709 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7710 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7711 })) 7712 DeadInstructions.insert(IndUpdate); 7713 } 7714 } 7715 7716 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7717 7718 //===--------------------------------------------------------------------===// 7719 // EpilogueVectorizerMainLoop 7720 //===--------------------------------------------------------------------===// 7721 7722 /// This function is partially responsible for generating the control flow 7723 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7724 std::pair<BasicBlock *, Value *> 7725 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7726 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7727 7728 // Workaround! Compute the trip count of the original loop and cache it 7729 // before we start modifying the CFG. This code has a systemic problem 7730 // wherein it tries to run analysis over partially constructed IR; this is 7731 // wrong, and not simply for SCEV. The trip count of the original loop 7732 // simply happens to be prone to hitting this in practice. In theory, we 7733 // can hit the same issue for any SCEV, or ValueTracking query done during 7734 // mutation. See PR49900. 7735 getOrCreateTripCount(OrigLoop->getLoopPreheader()); 7736 createVectorLoopSkeleton(""); 7737 7738 // Generate the code to check the minimum iteration count of the vector 7739 // epilogue (see below). 7740 EPI.EpilogueIterationCountCheck = 7741 emitIterationCountCheck(LoopScalarPreHeader, true); 7742 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7743 7744 // Generate the code to check any assumptions that we've made for SCEV 7745 // expressions. 7746 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7747 7748 // Generate the code that checks at runtime if arrays overlap. We put the 7749 // checks into a separate block to make the more common case of few elements 7750 // faster. 7751 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7752 7753 // Generate the iteration count check for the main loop, *after* the check 7754 // for the epilogue loop, so that the path-length is shorter for the case 7755 // that goes directly through the vector epilogue. The longer-path length for 7756 // the main loop is compensated for, by the gain from vectorizing the larger 7757 // trip count. Note: the branch will get updated later on when we vectorize 7758 // the epilogue. 7759 EPI.MainLoopIterationCountCheck = 7760 emitIterationCountCheck(LoopScalarPreHeader, false); 7761 7762 // Generate the induction variable. 7763 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7764 7765 // Skip induction resume value creation here because they will be created in 7766 // the second pass. If we created them here, they wouldn't be used anyway, 7767 // because the vplan in the second pass still contains the inductions from the 7768 // original loop. 
7769 7770 return {completeLoopSkeleton(OrigLoopID), nullptr}; 7771 } 7772 7773 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7774 LLVM_DEBUG({ 7775 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7776 << "Main Loop VF:" << EPI.MainLoopVF 7777 << ", Main Loop UF:" << EPI.MainLoopUF 7778 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7779 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7780 }); 7781 } 7782 7783 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7784 DEBUG_WITH_TYPE(VerboseDebug, { 7785 dbgs() << "intermediate fn:\n" 7786 << *OrigLoop->getHeader()->getParent() << "\n"; 7787 }); 7788 } 7789 7790 BasicBlock * 7791 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7792 bool ForEpilogue) { 7793 assert(Bypass && "Expected valid bypass basic block."); 7794 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7795 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7796 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 7797 // Reuse existing vector loop preheader for TC checks. 7798 // Note that new preheader block is generated for vector loop. 7799 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7800 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7801 7802 // Generate code to check if the loop's trip count is less than VF * UF of the 7803 // main vector loop. 7804 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 7805 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7806 7807 Value *CheckMinIters = Builder.CreateICmp( 7808 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7809 "min.iters.check"); 7810 7811 if (!ForEpilogue) 7812 TCCheckBlock->setName("vector.main.loop.iter.check"); 7813 7814 // Create new preheader for vector loop. 7815 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7816 DT, LI, nullptr, "vector.ph"); 7817 7818 if (ForEpilogue) { 7819 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7820 DT->getNode(Bypass)->getIDom()) && 7821 "TC check is expected to dominate Bypass"); 7822 7823 // Update dominator for Bypass & LoopExit. 7824 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7825 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7826 // For loops with multiple exits, there's no edge from the middle block 7827 // to exit blocks (as the epilogue must run) and thus no need to update 7828 // the immediate dominator of the exit blocks. 7829 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7830 7831 LoopBypassBlocks.push_back(TCCheckBlock); 7832 7833 // Save the trip count so we don't have to regenerate it in the 7834 // vec.epilog.iter.check. This is safe to do because the trip count 7835 // generated here dominates the vector epilog iter check. 7836 EPI.TripCount = Count; 7837 } 7838 7839 ReplaceInstWithInst( 7840 TCCheckBlock->getTerminator(), 7841 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7842 7843 return TCCheckBlock; 7844 } 7845 7846 //===--------------------------------------------------------------------===// 7847 // EpilogueVectorizerEpilogueLoop 7848 //===--------------------------------------------------------------------===// 7849 7850 /// This function is partially responsible for generating the control flow 7851 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
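/// Roughly: the main vector loop runs first, a narrower "epilogue" vector
/// loop then handles most of the remaining iterations, and a scalar remainder
/// loop processes whatever is left. The iteration-count checks created here
/// and in the first pass select between these three loops at runtime
/// (simplified summary; see the link above for the exact CFG).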
7852 std::pair<BasicBlock *, Value *> 7853 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7854 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7855 createVectorLoopSkeleton("vec.epilog."); 7856 7857 // Now, compare the remaining count and if there aren't enough iterations to 7858 // execute the vectorized epilogue skip to the scalar part. 7859 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7860 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7861 LoopVectorPreHeader = 7862 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7863 LI, nullptr, "vec.epilog.ph"); 7864 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, 7865 VecEpilogueIterationCountCheck); 7866 7867 // Adjust the control flow taking the state info from the main loop 7868 // vectorization into account. 7869 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7870 "expected this to be saved from the previous pass."); 7871 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7872 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7873 7874 DT->changeImmediateDominator(LoopVectorPreHeader, 7875 EPI.MainLoopIterationCountCheck); 7876 7877 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7878 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7879 7880 if (EPI.SCEVSafetyCheck) 7881 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7882 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7883 if (EPI.MemSafetyCheck) 7884 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7885 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7886 7887 DT->changeImmediateDominator( 7888 VecEpilogueIterationCountCheck, 7889 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7890 7891 DT->changeImmediateDominator(LoopScalarPreHeader, 7892 EPI.EpilogueIterationCountCheck); 7893 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7894 // If there is an epilogue which must run, there's no edge from the 7895 // middle block to exit blocks and thus no need to update the immediate 7896 // dominator of the exit blocks. 7897 DT->changeImmediateDominator(LoopExitBlock, 7898 EPI.EpilogueIterationCountCheck); 7899 7900 // Keep track of bypass blocks, as they feed start values to the induction 7901 // phis in the scalar loop preheader. 7902 if (EPI.SCEVSafetyCheck) 7903 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7904 if (EPI.MemSafetyCheck) 7905 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7906 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7907 7908 // The vec.epilog.iter.check block may contain Phi nodes from reductions which 7909 // merge control-flow from the latch block and the middle block. Update the 7910 // incoming values here and move the Phi into the preheader. 
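  // Roughly speaking, once moved into vec.epilog.ph these phis act as the
  // resume values that seed the corresponding reduction phis of the epilogue
  // vector loop.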
  SmallVector<PHINode *, 4> PhisInBlock;
  for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
    PhisInBlock.push_back(&Phi);

  for (PHINode *Phi : PhisInBlock) {
    Phi->replaceIncomingBlockWith(
        VecEpilogueIterationCountCheck->getSinglePredecessor(),
        VecEpilogueIterationCountCheck);
    Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
    if (EPI.SCEVSafetyCheck)
      Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
    if (EPI.MemSafetyCheck)
      Phi->removeIncomingValue(EPI.MemSafetyCheck);
    Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
  }

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to the iteration
  // count check, the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues({VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  return {completeLoopSkeleton(OrigLoopID), EPResumeVal};
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of
  // the vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
7966 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7967 7968 Value *CheckMinIters = 7969 Builder.CreateICmp(P, Count, 7970 createStepForVF(Builder, Count->getType(), 7971 EPI.EpilogueVF, EPI.EpilogueUF), 7972 "min.epilog.iters.check"); 7973 7974 ReplaceInstWithInst( 7975 Insert->getTerminator(), 7976 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7977 7978 LoopBypassBlocks.push_back(Insert); 7979 return Insert; 7980 } 7981 7982 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7983 LLVM_DEBUG({ 7984 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 7985 << "Epilogue Loop VF:" << EPI.EpilogueVF 7986 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7987 }); 7988 } 7989 7990 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 7991 DEBUG_WITH_TYPE(VerboseDebug, { 7992 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 7993 }); 7994 } 7995 7996 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7997 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7998 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 7999 bool PredicateAtRangeStart = Predicate(Range.Start); 8000 8001 for (ElementCount TmpVF = Range.Start * 2; 8002 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8003 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8004 Range.End = TmpVF; 8005 break; 8006 } 8007 8008 return PredicateAtRangeStart; 8009 } 8010 8011 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8012 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8013 /// of VF's starting at a given VF and extending it as much as possible. Each 8014 /// vectorization decision can potentially shorten this sub-range during 8015 /// buildVPlan(). 8016 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8017 ElementCount MaxVF) { 8018 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8019 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8020 VFRange SubRange = {VF, MaxVFPlusOne}; 8021 VPlans.push_back(buildVPlan(SubRange)); 8022 VF = SubRange.End; 8023 } 8024 } 8025 8026 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8027 VPlanPtr &Plan) { 8028 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8029 8030 // Look for cached value. 8031 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8032 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8033 if (ECEntryIt != EdgeMaskCache.end()) 8034 return ECEntryIt->second; 8035 8036 VPValue *SrcMask = createBlockInMask(Src, Plan); 8037 8038 // The terminator has to be a branch inst! 8039 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8040 assert(BI && "Unexpected terminator found"); 8041 8042 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8043 return EdgeMaskCache[Edge] = SrcMask; 8044 8045 // If source is an exiting block, we know the exit edge is dynamically dead 8046 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8047 // adding uses of an otherwise potentially dead instruction. 
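  // For example, the mask of the edge from the loop latch to an exit block
  // needs no extra condition: that edge is never taken inside the vector loop
  // body, so the source block's mask already covers it.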
8048 if (OrigLoop->isLoopExiting(Src)) 8049 return EdgeMaskCache[Edge] = SrcMask; 8050 8051 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8052 assert(EdgeMask && "No Edge Mask found for condition"); 8053 8054 if (BI->getSuccessor(0) != Dst) 8055 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8056 8057 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8058 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8059 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8060 // The select version does not introduce new UB if SrcMask is false and 8061 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8062 VPValue *False = Plan->getOrAddVPValue( 8063 ConstantInt::getFalse(BI->getCondition()->getType())); 8064 EdgeMask = 8065 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8066 } 8067 8068 return EdgeMaskCache[Edge] = EdgeMask; 8069 } 8070 8071 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8072 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8073 8074 // Look for cached value. 8075 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8076 if (BCEntryIt != BlockMaskCache.end()) 8077 return BCEntryIt->second; 8078 8079 // All-one mask is modelled as no-mask following the convention for masked 8080 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8081 VPValue *BlockMask = nullptr; 8082 8083 if (OrigLoop->getHeader() == BB) { 8084 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8085 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8086 8087 // Introduce the early-exit compare IV <= BTC to form header block mask. 8088 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8089 // constructing the desired canonical IV in the header block as its first 8090 // non-phi instructions. 8091 assert(CM.foldTailByMasking() && "must fold the tail"); 8092 VPBasicBlock *HeaderVPBB = 8093 Plan->getVectorLoopRegion()->getEntryBasicBlock(); 8094 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8095 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); 8096 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); 8097 8098 VPBuilder::InsertPointGuard Guard(Builder); 8099 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8100 if (CM.TTI.emitGetActiveLaneMask()) { 8101 VPValue *TC = Plan->getOrCreateTripCount(); 8102 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); 8103 } else { 8104 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8105 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8106 } 8107 return BlockMaskCache[BB] = BlockMask; 8108 } 8109 8110 // This is the block mask. We OR all incoming edges. 8111 for (auto *Predecessor : predecessors(BB)) { 8112 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8113 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8114 return BlockMaskCache[BB] = EdgeMask; 8115 8116 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8117 BlockMask = EdgeMask; 8118 continue; 8119 } 8120 8121 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8122 } 8123 8124 return BlockMaskCache[BB] = BlockMask; 8125 } 8126 8127 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8128 ArrayRef<VPValue *> Operands, 8129 VFRange &Range, 8130 VPlanPtr &Plan) { 8131 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8132 "Must be called with either a load or store"); 8133 8134 auto willWiden = [&](ElementCount VF) -> bool { 8135 LoopVectorizationCostModel::InstWidening Decision = 8136 CM.getWideningDecision(I, VF); 8137 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8138 "CM decision should be taken at this point."); 8139 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8140 return true; 8141 if (CM.isScalarAfterVectorization(I, VF) || 8142 CM.isProfitableToScalarize(I, VF)) 8143 return false; 8144 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8145 }; 8146 8147 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8148 return nullptr; 8149 8150 VPValue *Mask = nullptr; 8151 if (Legal->isMaskRequired(I)) 8152 Mask = createBlockInMask(I->getParent(), Plan); 8153 8154 // Determine if the pointer operand of the access is either consecutive or 8155 // reverse consecutive. 8156 LoopVectorizationCostModel::InstWidening Decision = 8157 CM.getWideningDecision(I, Range.Start); 8158 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8159 bool Consecutive = 8160 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8161 8162 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8163 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8164 Consecutive, Reverse); 8165 8166 StoreInst *Store = cast<StoreInst>(I); 8167 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8168 Mask, Consecutive, Reverse); 8169 } 8170 8171 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also 8172 /// insert a recipe to expand the step for the induction recipe. 8173 static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes( 8174 PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, 8175 const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM, 8176 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) { 8177 // Returns true if an instruction \p I should be scalarized instead of 8178 // vectorized for the chosen vectorization factor. 8179 auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { 8180 return CM.isScalarAfterVectorization(I, VF) || 8181 CM.isProfitableToScalarize(I, VF); 8182 }; 8183 8184 bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange( 8185 [&](ElementCount VF) { 8186 // Returns true if we should generate a scalar version of \p IV. 
8187 if (ShouldScalarizeInstruction(PhiOrTrunc, VF)) 8188 return true; 8189 auto isScalarInst = [&](User *U) -> bool { 8190 auto *I = cast<Instruction>(U); 8191 return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF); 8192 }; 8193 return any_of(PhiOrTrunc->users(), isScalarInst); 8194 }, 8195 Range); 8196 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( 8197 [&](ElementCount VF) { 8198 return ShouldScalarizeInstruction(PhiOrTrunc, VF); 8199 }, 8200 Range); 8201 assert(IndDesc.getStartValue() == 8202 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8203 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && 8204 "step must be loop invariant"); 8205 8206 VPValue *Step = 8207 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); 8208 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8209 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI, 8210 NeedsScalarIV, !NeedsScalarIVOnly); 8211 } 8212 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8213 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, 8214 NeedsScalarIV, !NeedsScalarIVOnly); 8215 } 8216 8217 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( 8218 PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) { 8219 8220 // Check if this is an integer or fp induction. If so, build the recipe that 8221 // produces its scalar and vector values. 8222 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8223 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan, 8224 *PSE.getSE(), *OrigLoop, Range); 8225 8226 // Check if this is pointer induction. If so, build the recipe for it. 8227 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) 8228 return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II, 8229 *PSE.getSE()); 8230 return nullptr; 8231 } 8232 8233 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8234 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) { 8235 // Optimize the special case where the source is a constant integer 8236 // induction variable. Notice that we can only optimize the 'trunc' case 8237 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8238 // (c) other casts depend on pointer size. 8239 8240 // Determine whether \p K is a truncation based on an induction variable that 8241 // can be optimized. 8242 auto isOptimizableIVTruncate = 8243 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8244 return [=](ElementCount VF) -> bool { 8245 return CM.isOptimizableIVTruncate(K, VF); 8246 }; 8247 }; 8248 8249 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8250 isOptimizableIVTruncate(I), Range)) { 8251 8252 auto *Phi = cast<PHINode>(I->getOperand(0)); 8253 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8254 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8255 return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan, 8256 *PSE.getSE(), *OrigLoop, Range); 8257 } 8258 return nullptr; 8259 } 8260 8261 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8262 ArrayRef<VPValue *> Operands, 8263 VPlanPtr &Plan) { 8264 // If all incoming values are equal, the incoming VPValue can be used directly 8265 // instead of creating a new VPBlendRecipe. 
8266 VPValue *FirstIncoming = Operands[0];
8267 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8268 return FirstIncoming == Inc;
8269 })) {
8270 return Operands[0];
8271 }
8272
8273 unsigned NumIncoming = Phi->getNumIncomingValues();
8274 // For in-loop reductions, we do not need to create an additional select.
8275 VPValue *InLoopVal = nullptr;
8276 for (unsigned In = 0; In < NumIncoming; In++) {
8277 PHINode *PhiOp =
8278 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8279 if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8280 assert(!InLoopVal && "Found more than one in-loop reduction!");
8281 InLoopVal = Operands[In];
8282 }
8283 }
8284
8285 assert((!InLoopVal || NumIncoming == 2) &&
8286 "Found an in-loop reduction for PHI with unexpected number of "
8287 "incoming values");
8288 if (InLoopVal)
8289 return Operands[Operands[0] == InLoopVal ? 1 : 0];
8290
8291 // We know that all PHIs in non-header blocks are converted into selects, so
8292 // we don't have to worry about the insertion order and we can just use the
8293 // builder. At this point we generate the predication tree. There may be
8294 // duplications since this is a simple recursive scan, but future
8295 // optimizations will clean it up.
8296 SmallVector<VPValue *, 2> OperandsWithMask;
8297
8298 for (unsigned In = 0; In < NumIncoming; In++) {
8299 VPValue *EdgeMask =
8300 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8301 assert((EdgeMask || NumIncoming == 1) &&
8302 "Multiple predecessors with one having a full mask");
8303 OperandsWithMask.push_back(Operands[In]);
8304 if (EdgeMask)
8305 OperandsWithMask.push_back(EdgeMask);
8306 }
8307 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8308 }
8309
8310 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8311 ArrayRef<VPValue *> Operands,
8312 VFRange &Range) const {
8313
8314 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8315 [this, CI](ElementCount VF) {
8316 return CM.isScalarWithPredication(CI, VF);
8317 },
8318 Range);
8319
8320 if (IsPredicated)
8321 return nullptr;
8322
8323 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8324 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8325 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8326 ID == Intrinsic::pseudoprobe ||
8327 ID == Intrinsic::experimental_noalias_scope_decl))
8328 return nullptr;
8329
8330 auto willWiden = [&](ElementCount VF) -> bool {
8331 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8332 // The following case may be scalarized depending on the VF.
8333 // The flag shows whether we use an intrinsic or a plain call for the
8334 // vectorized version of the instruction.
8335 // Is it beneficial to perform the intrinsic call rather than the lib call?
8336 bool NeedToScalarize = false;
8337 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8338 InstructionCost IntrinsicCost = ID ?
CM.getVectorIntrinsicCost(CI, VF) : 0; 8339 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8340 return UseVectorIntrinsic || !NeedToScalarize; 8341 }; 8342 8343 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8344 return nullptr; 8345 8346 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8347 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8348 } 8349 8350 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8351 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8352 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8353 // Instruction should be widened, unless it is scalar after vectorization, 8354 // scalarization is profitable or it is predicated. 8355 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8356 return CM.isScalarAfterVectorization(I, VF) || 8357 CM.isProfitableToScalarize(I, VF) || 8358 CM.isScalarWithPredication(I, VF); 8359 }; 8360 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8361 Range); 8362 } 8363 8364 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8365 ArrayRef<VPValue *> Operands) const { 8366 auto IsVectorizableOpcode = [](unsigned Opcode) { 8367 switch (Opcode) { 8368 case Instruction::Add: 8369 case Instruction::And: 8370 case Instruction::AShr: 8371 case Instruction::BitCast: 8372 case Instruction::FAdd: 8373 case Instruction::FCmp: 8374 case Instruction::FDiv: 8375 case Instruction::FMul: 8376 case Instruction::FNeg: 8377 case Instruction::FPExt: 8378 case Instruction::FPToSI: 8379 case Instruction::FPToUI: 8380 case Instruction::FPTrunc: 8381 case Instruction::FRem: 8382 case Instruction::FSub: 8383 case Instruction::ICmp: 8384 case Instruction::IntToPtr: 8385 case Instruction::LShr: 8386 case Instruction::Mul: 8387 case Instruction::Or: 8388 case Instruction::PtrToInt: 8389 case Instruction::SDiv: 8390 case Instruction::Select: 8391 case Instruction::SExt: 8392 case Instruction::Shl: 8393 case Instruction::SIToFP: 8394 case Instruction::SRem: 8395 case Instruction::Sub: 8396 case Instruction::Trunc: 8397 case Instruction::UDiv: 8398 case Instruction::UIToFP: 8399 case Instruction::URem: 8400 case Instruction::Xor: 8401 case Instruction::ZExt: 8402 case Instruction::Freeze: 8403 return true; 8404 } 8405 return false; 8406 }; 8407 8408 if (!IsVectorizableOpcode(I->getOpcode())) 8409 return nullptr; 8410 8411 // Success: widen this instruction. 
8412 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8413 }
8414
8415 void VPRecipeBuilder::fixHeaderPhis() {
8416 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8417 for (VPHeaderPHIRecipe *R : PhisToFix) {
8418 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8419 VPRecipeBase *IncR =
8420 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8421 R->addOperand(IncR->getVPSingleValue());
8422 }
8423 }
8424
8425 VPBasicBlock *VPRecipeBuilder::handleReplication(
8426 Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8427 VPlanPtr &Plan) {
8428 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8429 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8430 Range);
8431
8432 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8433 [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
8434 Range);
8435
8436 // Even if the instruction is not marked as uniform, there are certain
8437 // intrinsic calls that can be effectively treated as such, so we check for
8438 // them here. Conservatively, we only do this for scalable vectors, since
8439 // for fixed-width VFs we can always fall back on full scalarization.
8440 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8441 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8442 case Intrinsic::assume:
8443 case Intrinsic::lifetime_start:
8444 case Intrinsic::lifetime_end:
8445 // For scalable vectors if one of the operands is variant then we still
8446 // want to mark as uniform, which will generate one instruction for just
8447 // the first lane of the vector. We can't scalarize the call in the same
8448 // way as for fixed-width vectors because we don't know how many lanes
8449 // there are.
8450 //
8451 // The reasons for doing it this way for scalable vectors are:
8452 // 1. For the assume intrinsic generating the instruction for the first
8453 // lane is still better than not generating any at all. For
8454 // example, the input may be a splat across all lanes.
8455 // 2. For the lifetime start/end intrinsics the pointer operand only
8456 // does anything useful when the input comes from a stack object,
8457 // which suggests it should always be uniform. For non-stack objects
8458 // the effect is to poison the object, which still allows us to
8459 // remove the call.
8460 IsUniform = true;
8461 break;
8462 default:
8463 break;
8464 }
8465 }
8466
8467 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8468 IsUniform, IsPredicated);
8469 setRecipe(I, Recipe);
8470 Plan->addVPValue(I, Recipe);
8471
8472 // Find if I uses a predicated instruction. If so, it will use its scalar
8473 // value. Avoid hoisting the insert-element which packs the scalar value into
8474 // a vector value, as that happens iff all users use the vector value.
8475 for (VPValue *Op : Recipe->operands()) {
8476 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8477 if (!PredR)
8478 continue;
8479 auto *RepR =
8480 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8481 assert(RepR->isPredicated() &&
8482 "expected Replicate recipe to be predicated");
8483 RepR->setAlsoPack(false);
8484 }
8485
8486 // Finalize the recipe for Instr, first if it is not predicated.
8487 if (!IsPredicated) { 8488 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8489 VPBB->appendRecipe(Recipe); 8490 return VPBB; 8491 } 8492 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8493 8494 VPBlockBase *SingleSucc = VPBB->getSingleSuccessor(); 8495 assert(SingleSucc && "VPBB must have a single successor when handling " 8496 "predicated replication."); 8497 VPBlockUtils::disconnectBlocks(VPBB, SingleSucc); 8498 // Record predicated instructions for above packing optimizations. 8499 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8500 VPBlockUtils::insertBlockAfter(Region, VPBB); 8501 auto *RegSucc = new VPBasicBlock(); 8502 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8503 VPBlockUtils::connectBlocks(RegSucc, SingleSucc); 8504 return RegSucc; 8505 } 8506 8507 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8508 VPRecipeBase *PredRecipe, 8509 VPlanPtr &Plan) { 8510 // Instructions marked for predication are replicated and placed under an 8511 // if-then construct to prevent side-effects. 8512 8513 // Generate recipes to compute the block mask for this region. 8514 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8515 8516 // Build the triangular if-then region. 8517 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8518 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8519 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8520 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8521 auto *PHIRecipe = Instr->getType()->isVoidTy() 8522 ? nullptr 8523 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8524 if (PHIRecipe) { 8525 Plan->removeVPValueFor(Instr); 8526 Plan->addVPValue(Instr, PHIRecipe); 8527 } 8528 auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8529 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8530 VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true); 8531 8532 // Note: first set Entry as region entry and then connect successors starting 8533 // from it in order, to propagate the "parent" of each VPBasicBlock. 8534 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry); 8535 VPBlockUtils::connectBlocks(Pred, Exiting); 8536 8537 return Region; 8538 } 8539 8540 VPRecipeOrVPValueTy 8541 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8542 ArrayRef<VPValue *> Operands, 8543 VFRange &Range, VPlanPtr &Plan) { 8544 // First, check for specific widening recipes that deal with inductions, Phi 8545 // nodes, calls and memory operations. 
8546 VPRecipeBase *Recipe; 8547 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8548 if (Phi->getParent() != OrigLoop->getHeader()) 8549 return tryToBlend(Phi, Operands, Plan); 8550 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) 8551 return toVPRecipeResult(Recipe); 8552 8553 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8554 assert((Legal->isReductionVariable(Phi) || 8555 Legal->isFirstOrderRecurrence(Phi)) && 8556 "can only widen reductions and first-order recurrences here"); 8557 VPValue *StartV = Operands[0]; 8558 if (Legal->isReductionVariable(Phi)) { 8559 const RecurrenceDescriptor &RdxDesc = 8560 Legal->getReductionVars().find(Phi)->second; 8561 assert(RdxDesc.getRecurrenceStartValue() == 8562 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8563 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8564 CM.isInLoopReduction(Phi), 8565 CM.useOrderedReductions(RdxDesc)); 8566 } else { 8567 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8568 } 8569 8570 // Record the incoming value from the backedge, so we can add the incoming 8571 // value from the backedge after all recipes have been created. 8572 recordRecipeOf(cast<Instruction>( 8573 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8574 PhisToFix.push_back(PhiRecipe); 8575 return toVPRecipeResult(PhiRecipe); 8576 } 8577 8578 if (isa<TruncInst>(Instr) && 8579 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8580 Range, *Plan))) 8581 return toVPRecipeResult(Recipe); 8582 8583 // All widen recipes below deal only with VF > 1. 8584 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8585 [&](ElementCount VF) { return VF.isScalar(); }, Range)) 8586 return nullptr; 8587 8588 if (auto *CI = dyn_cast<CallInst>(Instr)) 8589 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8590 8591 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8592 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8593 8594 if (!shouldWiden(Instr, Range)) 8595 return nullptr; 8596 8597 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8598 return toVPRecipeResult(new VPWidenGEPRecipe( 8599 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 8600 8601 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8602 bool InvariantCond = 8603 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8604 return toVPRecipeResult(new VPWidenSelectRecipe( 8605 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 8606 } 8607 8608 return toVPRecipeResult(tryToWiden(Instr, Operands)); 8609 } 8610 8611 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8612 ElementCount MaxVF) { 8613 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8614 8615 // Collect instructions from the original loop that will become trivially dead 8616 // in the vectorized loop. We don't need to vectorize these instructions. For 8617 // example, original induction update instructions can become dead because we 8618 // separately emit induction "steps" when generating code for the new loop. 8619 // Similarly, we create a new latch condition when setting up the structure 8620 // of the new loop, so the old one can become dead. 8621 SmallPtrSet<Instruction *, 4> DeadInstructions; 8622 collectTriviallyDeadInstructions(DeadInstructions); 8623 8624 // Add assume instructions we need to drop to DeadInstructions, to prevent 8625 // them from being added to the VPlan. 
8626 // TODO: We only need to drop assumes in blocks that get flattened. If the
8627 // control flow is preserved, we should keep them.
8628 auto &ConditionalAssumes = Legal->getConditionalAssumes();
8629 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8630
8631 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8632 // Dead instructions do not need sinking. Remove them from SinkAfter.
8633 for (Instruction *I : DeadInstructions)
8634 SinkAfter.erase(I);
8635
8636 // Cannot sink instructions after dead instructions (there won't be any
8637 // recipes for them). Instead, find the first non-dead previous instruction.
8638 for (auto &P : Legal->getSinkAfter()) {
8639 Instruction *SinkTarget = P.second;
8640 Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8641 (void)FirstInst;
8642 while (DeadInstructions.contains(SinkTarget)) {
8643 assert(
8644 SinkTarget != FirstInst &&
8645 "Must find a live instruction (at least the one feeding the "
8646 "first-order recurrence PHI) before reaching beginning of the block");
8647 SinkTarget = SinkTarget->getPrevNode();
8648 assert(SinkTarget != P.first &&
8649 "sink source equals target, no sinking required");
8650 }
8651 P.second = SinkTarget;
8652 }
8653
8654 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8655 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8656 VFRange SubRange = {VF, MaxVFPlusOne};
8657 VPlans.push_back(
8658 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8659 VF = SubRange.End;
8660 }
8661 }
8662
8663 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a
8664 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
8665 // BranchOnCount VPInstruction to the latch.
8666 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8667 bool HasNUW) {
8668 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8669 auto *StartV = Plan.getOrAddVPValue(StartIdx);
8670
8671 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8672 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8673 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8674 Header->insert(CanonicalIVPHI, Header->begin());
8675
8676 auto *CanonicalIVIncrement =
8677 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
8678 : VPInstruction::CanonicalIVIncrement,
8679 {CanonicalIVPHI}, DL);
8680 CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8681
8682 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
8683 EB->appendRecipe(CanonicalIVIncrement);
8684
8685 auto *BranchOnCount =
8686 new VPInstruction(VPInstruction::BranchOnCount,
8687 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8688 EB->appendRecipe(BranchOnCount);
8689 }
8690
8691 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8692 // original exit block.
8693 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
8694 VPBasicBlock *MiddleVPBB, Loop *OrigLoop,
8695 VPlan &Plan) {
8696 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8697 BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8698 // Only handle single-exit loops with unique exit blocks for now.
8699 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8700 return;
8701
8702 // Introduce VPUsers modeling the exit values.
8703 for (PHINode &ExitPhi : ExitBB->phis()) { 8704 Value *IncomingValue = 8705 ExitPhi.getIncomingValueForBlock(ExitingBB); 8706 VPValue *V = Plan.getOrAddVPValue(IncomingValue, true); 8707 Plan.addLiveOut(&ExitPhi, V); 8708 } 8709 } 8710 8711 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8712 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8713 const MapVector<Instruction *, Instruction *> &SinkAfter) { 8714 8715 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8716 8717 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8718 8719 // --------------------------------------------------------------------------- 8720 // Pre-construction: record ingredients whose recipes we'll need to further 8721 // process after constructing the initial VPlan. 8722 // --------------------------------------------------------------------------- 8723 8724 // Mark instructions we'll need to sink later and their targets as 8725 // ingredients whose recipe we'll need to record. 8726 for (auto &Entry : SinkAfter) { 8727 RecipeBuilder.recordRecipeOf(Entry.first); 8728 RecipeBuilder.recordRecipeOf(Entry.second); 8729 } 8730 for (auto &Reduction : CM.getInLoopReductionChains()) { 8731 PHINode *Phi = Reduction.first; 8732 RecurKind Kind = 8733 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); 8734 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8735 8736 RecipeBuilder.recordRecipeOf(Phi); 8737 for (auto &R : ReductionOperations) { 8738 RecipeBuilder.recordRecipeOf(R); 8739 // For min/max reductions, where we have a pair of icmp/select, we also 8740 // need to record the ICmp recipe, so it can be removed later. 8741 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 8742 "Only min/max recurrences allowed for inloop reductions"); 8743 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 8744 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8745 } 8746 } 8747 8748 // For each interleave group which is relevant for this (possibly trimmed) 8749 // Range, add it to the set of groups to be later applied to the VPlan and add 8750 // placeholders for its members' Recipes which we'll be replacing with a 8751 // single VPInterleaveRecipe. 8752 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8753 auto applyIG = [IG, this](ElementCount VF) -> bool { 8754 return (VF.isVector() && // Query is illegal for VF == 1 8755 CM.getWideningDecision(IG->getInsertPos(), VF) == 8756 LoopVectorizationCostModel::CM_Interleave); 8757 }; 8758 if (!getDecisionAndClampRange(applyIG, Range)) 8759 continue; 8760 InterleaveGroups.insert(IG); 8761 for (unsigned i = 0; i < IG->getFactor(); i++) 8762 if (Instruction *Member = IG->getMember(i)) 8763 RecipeBuilder.recordRecipeOf(Member); 8764 }; 8765 8766 // --------------------------------------------------------------------------- 8767 // Build initial VPlan: Scan the body of the loop in a topological order to 8768 // visit each basic block after having visited its predecessor basic blocks. 8769 // --------------------------------------------------------------------------- 8770 8771 // Create initial VPlan skeleton, starting with a block for the pre-header, 8772 // followed by a region for the vector loop, followed by the middle block. The 8773 // skeleton vector loop region contains a header and latch block. 
8774 VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); 8775 auto Plan = std::make_unique<VPlan>(Preheader); 8776 8777 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); 8778 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 8779 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 8780 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); 8781 VPBlockUtils::insertBlockAfter(TopRegion, Preheader); 8782 VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); 8783 VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); 8784 8785 Instruction *DLInst = 8786 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 8787 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), 8788 DLInst ? DLInst->getDebugLoc() : DebugLoc(), 8789 !CM.foldTailByMasking()); 8790 8791 // Scan the body of the loop in a topological order to visit each basic block 8792 // after having visited its predecessor basic blocks. 8793 LoopBlocksDFS DFS(OrigLoop); 8794 DFS.perform(LI); 8795 8796 VPBasicBlock *VPBB = HeaderVPBB; 8797 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 8798 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8799 // Relevant instructions from basic block BB will be grouped into VPRecipe 8800 // ingredients and fill a new VPBasicBlock. 8801 unsigned VPBBsForBB = 0; 8802 if (VPBB != HeaderVPBB) 8803 VPBB->setName(BB->getName()); 8804 Builder.setInsertPoint(VPBB); 8805 8806 // Introduce each ingredient into VPlan. 8807 // TODO: Model and preserve debug intrinsics in VPlan. 8808 for (Instruction &I : BB->instructionsWithoutDebug()) { 8809 Instruction *Instr = &I; 8810 8811 // First filter out irrelevant instructions, to ensure no recipes are 8812 // built for them. 8813 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 8814 continue; 8815 8816 SmallVector<VPValue *, 4> Operands; 8817 auto *Phi = dyn_cast<PHINode>(Instr); 8818 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 8819 Operands.push_back(Plan->getOrAddVPValue( 8820 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 8821 } else { 8822 auto OpRange = Plan->mapToVPValues(Instr->operands()); 8823 Operands = {OpRange.begin(), OpRange.end()}; 8824 } 8825 8826 // Invariant stores inside loop will be deleted and a single store 8827 // with the final reduction value will be added to the exit block 8828 StoreInst *SI; 8829 if ((SI = dyn_cast<StoreInst>(&I)) && 8830 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 8831 continue; 8832 8833 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 8834 Instr, Operands, Range, Plan)) { 8835 // If Instr can be simplified to an existing VPValue, use it. 8836 if (RecipeOrValue.is<VPValue *>()) { 8837 auto *VPV = RecipeOrValue.get<VPValue *>(); 8838 Plan->addVPValue(Instr, VPV); 8839 // If the re-used value is a recipe, register the recipe for the 8840 // instruction, in case the recipe for Instr needs to be recorded. 8841 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 8842 RecipeBuilder.setRecipe(Instr, R); 8843 continue; 8844 } 8845 // Otherwise, add the new recipe. 
8846 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 8847 for (auto *Def : Recipe->definedValues()) { 8848 auto *UV = Def->getUnderlyingValue(); 8849 Plan->addVPValue(UV, Def); 8850 } 8851 8852 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 8853 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 8854 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 8855 // of the header block. That can happen for truncates of induction 8856 // variables. Those recipes are moved to the phi section of the header 8857 // block after applying SinkAfter, which relies on the original 8858 // position of the trunc. 8859 assert(isa<TruncInst>(Instr)); 8860 InductionsToMove.push_back( 8861 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 8862 } 8863 RecipeBuilder.setRecipe(Instr, Recipe); 8864 VPBB->appendRecipe(Recipe); 8865 continue; 8866 } 8867 8868 // Otherwise, if all widening options failed, Instruction is to be 8869 // replicated. This may create a successor for VPBB. 8870 VPBasicBlock *NextVPBB = 8871 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 8872 if (NextVPBB != VPBB) { 8873 VPBB = NextVPBB; 8874 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 8875 : ""); 8876 } 8877 } 8878 8879 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 8880 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 8881 } 8882 8883 HeaderVPBB->setName("vector.body"); 8884 8885 // Fold the last, empty block into its predecessor. 8886 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 8887 assert(VPBB && "expected to fold last (empty) block"); 8888 // After here, VPBB should not be used. 8889 VPBB = nullptr; 8890 8891 addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); 8892 8893 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 8894 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 8895 "entry block must be set to a VPRegionBlock having a non-empty entry " 8896 "VPBasicBlock"); 8897 RecipeBuilder.fixHeaderPhis(); 8898 8899 // --------------------------------------------------------------------------- 8900 // Transform initial VPlan: Apply previously taken decisions, in order, to 8901 // bring the VPlan to its final state. 8902 // --------------------------------------------------------------------------- 8903 8904 // Apply Sink-After legal constraints. 8905 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 8906 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 8907 if (Region && Region->isReplicator()) { 8908 assert(Region->getNumSuccessors() == 1 && 8909 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 8910 assert(R->getParent()->size() == 1 && 8911 "A recipe in an original replicator region must be the only " 8912 "recipe in its block"); 8913 return Region; 8914 } 8915 return nullptr; 8916 }; 8917 for (auto &Entry : SinkAfter) { 8918 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 8919 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 8920 8921 auto *TargetRegion = GetReplicateRegion(Target); 8922 auto *SinkRegion = GetReplicateRegion(Sink); 8923 if (!SinkRegion) { 8924 // If the sink source is not a replicate region, sink the recipe directly. 8925 if (TargetRegion) { 8926 // The target is in a replication region, make sure to move Sink to 8927 // the block after it, not into the replication region itself. 
8928 VPBasicBlock *NextBlock = 8929 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 8930 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8931 } else 8932 Sink->moveAfter(Target); 8933 continue; 8934 } 8935 8936 // The sink source is in a replicate region. Unhook the region from the CFG. 8937 auto *SinkPred = SinkRegion->getSinglePredecessor(); 8938 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 8939 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 8940 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 8941 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 8942 8943 if (TargetRegion) { 8944 // The target recipe is also in a replicate region, move the sink region 8945 // after the target region. 8946 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 8947 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 8948 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 8949 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 8950 } else { 8951 // The sink source is in a replicate region, we need to move the whole 8952 // replicate region, which should only contain a single recipe in the 8953 // main block. 8954 auto *SplitBlock = 8955 Target->getParent()->splitAt(std::next(Target->getIterator())); 8956 8957 auto *SplitPred = SplitBlock->getSinglePredecessor(); 8958 8959 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 8960 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 8961 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 8962 } 8963 } 8964 8965 VPlanTransforms::removeRedundantCanonicalIVs(*Plan); 8966 VPlanTransforms::removeRedundantInductionCasts(*Plan); 8967 8968 // Now that sink-after is done, move induction recipes for optimized truncates 8969 // to the phi section of the header block. 8970 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 8971 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 8972 8973 // Adjust the recipes for any inloop reductions. 8974 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan, 8975 RecipeBuilder, Range.Start); 8976 8977 // Introduce a recipe to combine the incoming and previous values of a 8978 // first-order recurrence. 8979 for (VPRecipeBase &R : 8980 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 8981 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 8982 if (!RecurPhi) 8983 continue; 8984 8985 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 8986 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 8987 auto *Region = GetReplicateRegion(PrevRecipe); 8988 if (Region) 8989 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 8990 if (Region || PrevRecipe->isPhi()) 8991 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 8992 else 8993 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 8994 8995 auto *RecurSplice = cast<VPInstruction>( 8996 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 8997 {RecurPhi, RecurPhi->getBackedgeValue()})); 8998 8999 RecurPhi->replaceAllUsesWith(RecurSplice); 9000 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9001 // all users. 9002 RecurSplice->setOperand(0, RecurPhi); 9003 } 9004 9005 // Interleave memory: for each Interleave Group we marked earlier as relevant 9006 // for this VPlan, replace the Recipes widening its memory instructions with a 9007 // single VPInterleaveRecipe at its insertion point. 
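// For illustration only (hypothetical IR): for an interleave group of factor 2,
//   %even = load i32, i32* %gep.0   ; member 0, insertion point
//   %odd  = load i32, i32* %gep.1   ; member 1
// the two widened-load recipes are erased and a single VPInterleaveRecipe,
// defining one VPValue per non-void member, takes their place.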
9008 for (auto IG : InterleaveGroups) {
9009 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9010 RecipeBuilder.getRecipe(IG->getInsertPos()));
9011 SmallVector<VPValue *, 4> StoredValues;
9012 for (unsigned i = 0; i < IG->getFactor(); ++i)
9013 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9014 auto *StoreR =
9015 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9016 StoredValues.push_back(StoreR->getStoredValue());
9017 }
9018
9019 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9020 Recipe->getMask());
9021 VPIG->insertBefore(Recipe);
9022 unsigned J = 0;
9023 for (unsigned i = 0; i < IG->getFactor(); ++i)
9024 if (Instruction *Member = IG->getMember(i)) {
9025 if (!Member->getType()->isVoidTy()) {
9026 VPValue *OriginalV = Plan->getVPValue(Member);
9027 Plan->removeVPValueFor(Member);
9028 Plan->addVPValue(Member, VPIG->getVPValue(J));
9029 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9030 J++;
9031 }
9032 RecipeBuilder.getRecipe(Member)->eraseFromParent();
9033 }
9034 }
9035
9036 // From this point onwards, VPlan-to-VPlan transformations may change the plan
9037 // in ways that make accessing values using original IR values incorrect.
9038 Plan->disableValue2VPValue();
9039
9040 VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
9041 VPlanTransforms::sinkScalarOperands(*Plan);
9042 VPlanTransforms::mergeReplicateRegions(*Plan);
9043 VPlanTransforms::removeDeadRecipes(*Plan, *OrigLoop);
9044 VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
9045
9046 std::string PlanName;
9047 raw_string_ostream RSO(PlanName);
9048 ElementCount VF = Range.Start;
9049 Plan->addVF(VF);
9050 RSO << "Initial VPlan for VF={" << VF;
9051 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9052 Plan->addVF(VF);
9053 RSO << "," << VF;
9054 }
9055 RSO << "},UF>=1";
9056 RSO.flush();
9057 Plan->setName(PlanName);
9058
9059 // Fold Exit block into its predecessor if possible.
9060 // TODO: Fold block earlier once all VPlan transforms properly maintain a
9061 // VPBasicBlock as exit.
9062 VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting());
9063
9064 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9065 return Plan;
9066 }
9067
9068 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9069 // Outer loop handling: outer loops may require CFG and instruction-level
9070 // transformations before even evaluating whether vectorization is profitable.
9071 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9072 // the vectorization pipeline.
9073 assert(!OrigLoop->isInnermost());
9074 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9075
9076 // Create new empty VPlan
9077 auto Plan = std::make_unique<VPlan>();
9078
9079 // Build hierarchical CFG
9080 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9081 HCFGBuilder.buildHierarchicalCFG();
9082
9083 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9084 VF *= 2)
9085 Plan->addVF(VF);
9086
9087 SmallPtrSet<Instruction *, 1> DeadInstructions;
9088 VPlanTransforms::VPInstructionsToVPRecipes(
9089 OrigLoop, Plan,
9090 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9091 DeadInstructions, *PSE.getSE());
9092
9093 // Remove the existing terminator of the exiting block of the top-most region.
9094 // A BranchOnCount will be added instead when adding the canonical IV recipes.
9095 auto *Term =
9096 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9097 Term->eraseFromParent();
9098
9099 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9100 true);
9101 return Plan;
9102 }
9103
9104 // Adjust the recipes for reductions. For in-loop reductions the chain of
9105 // instructions leading from the loop exit instr to the phi needs to be converted
9106 // to reductions, with one operand being vector and the other being the scalar
9107 // reduction chain. For other reductions, a select is introduced between the phi
9108 // and live-out recipes when folding the tail.
9109 void LoopVectorizationPlanner::adjustRecipesForReductions(
9110 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9111 ElementCount MinVF) {
9112 for (auto &Reduction : CM.getInLoopReductionChains()) {
9113 PHINode *Phi = Reduction.first;
9114 const RecurrenceDescriptor &RdxDesc =
9115 Legal->getReductionVars().find(Phi)->second;
9116 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9117
9118 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9119 continue;
9120
9121 // ReductionOperations are ordered top-down from the phi's use to the
9122 // LoopExitValue. We keep track of the previous item (the Chain) to tell
9123 // which of the two operands will remain scalar and which will be reduced.
9124 // For minmax the chain will be the select instructions.
9125 Instruction *Chain = Phi;
9126 for (Instruction *R : ReductionOperations) {
9127 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9128 RecurKind Kind = RdxDesc.getRecurrenceKind();
9129
9130 VPValue *ChainOp = Plan->getVPValue(Chain);
9131 unsigned FirstOpId;
9132 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9133 "Only min/max recurrences allowed for inloop reductions");
9134 // Recognize a call to the llvm.fmuladd intrinsic.
9135 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9136 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9137 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9138 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9139 assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9140 "Expected to replace a VPWidenSelectSC");
9141 FirstOpId = 1;
9142 } else {
9143 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9144 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9145 "Expected to replace a VPWidenSC");
9146 FirstOpId = 0;
9147 }
9148 unsigned VecOpId =
9149 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9150 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9151
9152 auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
9153 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9154 : nullptr;
9155
9156 if (IsFMulAdd) {
9157 // If the instruction is a call to the llvm.fmuladd intrinsic then we
9158 // need to create an fmul recipe to use as the vector operand for the
9159 // fadd reduction.
9160 VPInstruction *FMulRecipe = new VPInstruction( 9161 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9162 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9163 WidenRecipe->getParent()->insert(FMulRecipe, 9164 WidenRecipe->getIterator()); 9165 VecOp = FMulRecipe; 9166 } 9167 VPReductionRecipe *RedRecipe = 9168 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9169 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9170 Plan->removeVPValueFor(R); 9171 Plan->addVPValue(R, RedRecipe); 9172 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9173 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9174 WidenRecipe->eraseFromParent(); 9175 9176 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9177 VPRecipeBase *CompareRecipe = 9178 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9179 assert(isa<VPWidenRecipe>(CompareRecipe) && 9180 "Expected to replace a VPWidenSC"); 9181 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9182 "Expected no remaining users"); 9183 CompareRecipe->eraseFromParent(); 9184 } 9185 Chain = R; 9186 } 9187 } 9188 9189 // If tail is folded by masking, introduce selects between the phi 9190 // and the live-out instruction of each reduction, at the beginning of the 9191 // dedicated latch block. 9192 if (CM.foldTailByMasking()) { 9193 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9194 for (VPRecipeBase &R : 9195 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9196 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9197 if (!PhiR || PhiR->isInLoop()) 9198 continue; 9199 VPValue *Cond = 9200 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9201 VPValue *Red = PhiR->getBackedgeValue(); 9202 assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB && 9203 "reduction recipe must be defined before latch"); 9204 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9205 } 9206 } 9207 } 9208 9209 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9210 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9211 VPSlotTracker &SlotTracker) const { 9212 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9213 IG->getInsertPos()->printAsOperand(O, false); 9214 O << ", "; 9215 getAddr()->printAsOperand(O, SlotTracker); 9216 VPValue *Mask = getMask(); 9217 if (Mask) { 9218 O << ", "; 9219 Mask->printAsOperand(O, SlotTracker); 9220 } 9221 9222 unsigned OpIdx = 0; 9223 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9224 if (!IG->getMember(i)) 9225 continue; 9226 if (getNumStoreOperands() > 0) { 9227 O << "\n" << Indent << " store "; 9228 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9229 O << " to index " << i; 9230 } else { 9231 O << "\n" << Indent << " "; 9232 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9233 O << " = load from index " << i; 9234 } 9235 ++OpIdx; 9236 } 9237 } 9238 #endif 9239 9240 void VPWidenCallRecipe::execute(VPTransformState &State) { 9241 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9242 *this, State); 9243 } 9244 9245 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9246 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9247 State.ILV->setDebugLocFromInst(&I); 9248 9249 // The condition can be loop invariant but still defined inside the 9250 // loop. This means that we can't just use the original 'cond' value. 9251 // We have to take the 'vectorized' value and pick the first lane. 
9252 // Instcombine will make this a no-op.
9253 auto *InvarCond =
9254 InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
9255
9256 for (unsigned Part = 0; Part < State.UF; ++Part) {
9257 Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
9258 Value *Op0 = State.get(getOperand(1), Part);
9259 Value *Op1 = State.get(getOperand(2), Part);
9260 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
9261 State.set(this, Sel, Part);
9262 State.ILV->addMetadata(Sel, &I);
9263 }
9264 }
9265
9266 void VPWidenRecipe::execute(VPTransformState &State) {
9267 auto &I = *cast<Instruction>(getUnderlyingValue());
9268 auto &Builder = State.Builder;
9269 switch (I.getOpcode()) {
9270 case Instruction::Call:
9271 case Instruction::Br:
9272 case Instruction::PHI:
9273 case Instruction::GetElementPtr:
9274 case Instruction::Select:
9275 llvm_unreachable("This instruction is handled by a different recipe.");
9276 case Instruction::UDiv:
9277 case Instruction::SDiv:
9278 case Instruction::SRem:
9279 case Instruction::URem:
9280 case Instruction::Add:
9281 case Instruction::FAdd:
9282 case Instruction::Sub:
9283 case Instruction::FSub:
9284 case Instruction::FNeg:
9285 case Instruction::Mul:
9286 case Instruction::FMul:
9287 case Instruction::FDiv:
9288 case Instruction::FRem:
9289 case Instruction::Shl:
9290 case Instruction::LShr:
9291 case Instruction::AShr:
9292 case Instruction::And:
9293 case Instruction::Or:
9294 case Instruction::Xor: {
9295 // Just widen unops and binops.
9296 State.ILV->setDebugLocFromInst(&I);
9297
9298 for (unsigned Part = 0; Part < State.UF; ++Part) {
9299 SmallVector<Value *, 2> Ops;
9300 for (VPValue *VPOp : operands())
9301 Ops.push_back(State.get(VPOp, Part));
9302
9303 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
9304
9305 if (auto *VecOp = dyn_cast<Instruction>(V)) {
9306 VecOp->copyIRFlags(&I);
9307
9308 // If the instruction is vectorized and was in a basic block that needed
9309 // predication, we can't propagate poison-generating flags (nuw/nsw,
9310 // exact, etc.). The control flow has been linearized and the
9311 // instruction is no longer guarded by the predicate, which could make
9312 // the flag properties no longer hold.
9313 if (State.MayGeneratePoisonRecipes.contains(this))
9314 VecOp->dropPoisonGeneratingFlags();
9315 }
9316
9317 // Use this vector value for all users of the original instruction.
9318 State.set(this, V, Part);
9319 State.ILV->addMetadata(V, &I);
9320 }
9321
9322 break;
9323 }
9324 case Instruction::Freeze: {
9325 State.ILV->setDebugLocFromInst(&I);
9326
9327 for (unsigned Part = 0; Part < State.UF; ++Part) {
9328 Value *Op = State.get(getOperand(0), Part);
9329
9330 Value *Freeze = Builder.CreateFreeze(Op);
9331 State.set(this, Freeze, Part);
9332 }
9333 break;
9334 }
9335 case Instruction::ICmp:
9336 case Instruction::FCmp: {
9337 // Widen compares. Generate vector compares.
9338 bool FCmp = (I.getOpcode() == Instruction::FCmp);
9339 auto *Cmp = cast<CmpInst>(&I);
9340 State.ILV->setDebugLocFromInst(Cmp);
9341 for (unsigned Part = 0; Part < State.UF; ++Part) {
9342 Value *A = State.get(getOperand(0), Part);
9343 Value *B = State.get(getOperand(1), Part);
9344 Value *C = nullptr;
9345 if (FCmp) {
9346 // Propagate fast math flags.
9347 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9348 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9349 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9350 } else { 9351 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9352 } 9353 State.set(this, C, Part); 9354 State.ILV->addMetadata(C, &I); 9355 } 9356 9357 break; 9358 } 9359 9360 case Instruction::ZExt: 9361 case Instruction::SExt: 9362 case Instruction::FPToUI: 9363 case Instruction::FPToSI: 9364 case Instruction::FPExt: 9365 case Instruction::PtrToInt: 9366 case Instruction::IntToPtr: 9367 case Instruction::SIToFP: 9368 case Instruction::UIToFP: 9369 case Instruction::Trunc: 9370 case Instruction::FPTrunc: 9371 case Instruction::BitCast: { 9372 auto *CI = cast<CastInst>(&I); 9373 State.ILV->setDebugLocFromInst(CI); 9374 9375 /// Vectorize casts. 9376 Type *DestTy = (State.VF.isScalar()) 9377 ? CI->getType() 9378 : VectorType::get(CI->getType(), State.VF); 9379 9380 for (unsigned Part = 0; Part < State.UF; ++Part) { 9381 Value *A = State.get(getOperand(0), Part); 9382 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9383 State.set(this, Cast, Part); 9384 State.ILV->addMetadata(Cast, &I); 9385 } 9386 break; 9387 } 9388 default: 9389 // This instruction is not vectorized by simple widening. 9390 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9391 llvm_unreachable("Unhandled instruction!"); 9392 } // end of switch. 9393 } 9394 9395 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9396 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9397 // Construct a vector GEP by widening the operands of the scalar GEP as 9398 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9399 // results in a vector of pointers when at least one operand of the GEP 9400 // is vector-typed. Thus, to keep the representation compact, we only use 9401 // vector-typed operands for loop-varying values. 9402 9403 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9404 // If we are vectorizing, but the GEP has only loop-invariant operands, 9405 // the GEP we build (by only using vector-typed operands for 9406 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9407 // produce a vector of pointers, we need to either arbitrarily pick an 9408 // operand to broadcast, or broadcast a clone of the original GEP. 9409 // Here, we broadcast a clone of the original. 9410 // 9411 // TODO: If at some point we decide to scalarize instructions having 9412 // loop-invariant operands, this special case will no longer be 9413 // required. We would add the scalarization decision to 9414 // collectLoopScalars() and teach getVectorValue() to broadcast 9415 // the lane-zero scalar value. 9416 auto *Clone = State.Builder.Insert(GEP->clone()); 9417 for (unsigned Part = 0; Part < State.UF; ++Part) { 9418 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9419 State.set(this, EntryPart, Part); 9420 State.ILV->addMetadata(EntryPart, GEP); 9421 } 9422 } else { 9423 // If the GEP has at least one loop-varying operand, we are sure to 9424 // produce a vector of pointers. But if we are only unrolling, we want 9425 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9426 // produce with the code below will be scalar (if VF == 1) or vector 9427 // (otherwise). Note that for the unroll-only case, we still maintain 9428 // values in the vector mapping with initVector, as we do for other 9429 // instructions. 
9430 for (unsigned Part = 0; Part < State.UF; ++Part) {
9431 // The pointer operand of the new GEP. If it's loop-invariant, we
9432 // won't broadcast it.
9433 auto *Ptr = IsPtrLoopInvariant
9434 ? State.get(getOperand(0), VPIteration(0, 0))
9435 : State.get(getOperand(0), Part);
9436
9437 // Collect all the indices for the new GEP. If any index is
9438 // loop-invariant, we won't broadcast it.
9439 SmallVector<Value *, 4> Indices;
9440 for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
9441 VPValue *Operand = getOperand(I);
9442 if (IsIndexLoopInvariant[I - 1])
9443 Indices.push_back(State.get(Operand, VPIteration(0, 0)));
9444 else
9445 Indices.push_back(State.get(Operand, Part));
9446 }
9447
9448 // If the GEP instruction is vectorized and was in a basic block that
9449 // needed predication, we can't propagate the poison-generating 'inbounds'
9450 // flag. The control flow has been linearized and the GEP is no longer
9451 // guarded by the predicate, which could make the 'inbounds' property
9452 // no longer hold.
9453 bool IsInBounds =
9454 GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
9455
9456 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
9457 // but it should be a vector, otherwise.
9458 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
9459 Indices, "", IsInBounds);
9460 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
9461 "NewGEP is not a pointer vector");
9462 State.set(this, NewGEP, Part);
9463 State.ILV->addMetadata(NewGEP, GEP);
9464 }
9465 }
9466 }
9467
9468 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9469 assert(!State.Instance && "Int or FP induction being replicated.");
9470
9471 Value *Start = getStartValue()->getLiveInIRValue();
9472 const InductionDescriptor &ID = getInductionDescriptor();
9473 TruncInst *Trunc = getTruncInst();
9474 IRBuilderBase &Builder = State.Builder;
9475 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
9476 assert(State.VF.isVector() && "must have vector VF");
9477
9478 // The value from the original loop to which we are mapping the new induction
9479 // variable.
9480 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
9481
9482 // Fast-math-flags propagate from the original induction instruction.
9483 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9484 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
9485 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
9486
9487 // Now do the actual transformations, and start with fetching the step value.
9488 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9489 9490 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 9491 "Expected either an induction phi-node or a truncate of it!"); 9492 9493 // Construct the initial value of the vector IV in the vector loop preheader 9494 auto CurrIP = Builder.saveIP(); 9495 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9496 Builder.SetInsertPoint(VectorPH->getTerminator()); 9497 if (isa<TruncInst>(EntryVal)) { 9498 assert(Start->getType()->isIntegerTy() && 9499 "Truncation requires an integer type"); 9500 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 9501 Step = Builder.CreateTrunc(Step, TruncType); 9502 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 9503 } 9504 9505 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 9506 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 9507 Value *SteppedStart = getStepVector( 9508 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); 9509 9510 // We create vector phi nodes for both integer and floating-point induction 9511 // variables. Here, we determine the kind of arithmetic we will perform. 9512 Instruction::BinaryOps AddOp; 9513 Instruction::BinaryOps MulOp; 9514 if (Step->getType()->isIntegerTy()) { 9515 AddOp = Instruction::Add; 9516 MulOp = Instruction::Mul; 9517 } else { 9518 AddOp = ID.getInductionOpcode(); 9519 MulOp = Instruction::FMul; 9520 } 9521 9522 // Multiply the vectorization factor by the step using integer or 9523 // floating-point arithmetic as appropriate. 9524 Type *StepType = Step->getType(); 9525 Value *RuntimeVF; 9526 if (Step->getType()->isFloatingPointTy()) 9527 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 9528 else 9529 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 9530 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 9531 9532 // Create a vector splat to use in the induction update. 9533 // 9534 // FIXME: If the step is non-constant, we create the vector splat with 9535 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 9536 // handle a constant vector splat. 9537 Value *SplatVF = isa<Constant>(Mul) 9538 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 9539 : Builder.CreateVectorSplat(State.VF, Mul); 9540 Builder.restoreIP(CurrIP); 9541 9542 // We may need to add the step a number of times, depending on the unroll 9543 // factor. The last of those goes into the PHI. 9544 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 9545 &*State.CFG.PrevBB->getFirstInsertionPt()); 9546 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 9547 Instruction *LastInduction = VecInd; 9548 for (unsigned Part = 0; Part < State.UF; ++Part) { 9549 State.set(this, LastInduction, Part); 9550 9551 if (isa<TruncInst>(EntryVal)) 9552 State.ILV->addMetadata(LastInduction, EntryVal); 9553 9554 LastInduction = cast<Instruction>( 9555 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 9556 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 9557 } 9558 9559 LastInduction->setName("vec.ind.next"); 9560 VecInd->addIncoming(SteppedStart, VectorPH); 9561 // Add induction update using an incorrect block temporarily. The phi node 9562 // will be fixed after VPlan execution. Note that at this point the latch 9563 // block cannot be used, as it does not exist yet. 9564 // TODO: Model increment value in VPlan, by turning the recipe into a 9565 // multi-def and a subclass of VPHeaderPHIRecipe. 
9566 VecInd->addIncoming(LastInduction, VectorPH); 9567 } 9568 9569 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 9570 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 9571 "Not a pointer induction according to InductionDescriptor!"); 9572 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 9573 "Unexpected type."); 9574 9575 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 9576 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 9577 9578 if (onlyScalarsGenerated(State.VF)) { 9579 // This is the normalized GEP that starts counting at zero. 9580 Value *PtrInd = State.Builder.CreateSExtOrTrunc( 9581 CanonicalIV, IndDesc.getStep()->getType()); 9582 // Determine the number of scalars we need to generate for each unroll 9583 // iteration. If the instruction is uniform, we only need to generate the 9584 // first lane. Otherwise, we generate all VF values. 9585 bool IsUniform = vputils::onlyFirstLaneUsed(this); 9586 assert((IsUniform || !State.VF.isScalable()) && 9587 "Cannot scalarize a scalable VF"); 9588 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 9589 9590 for (unsigned Part = 0; Part < State.UF; ++Part) { 9591 Value *PartStart = 9592 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); 9593 9594 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 9595 Value *Idx = State.Builder.CreateAdd( 9596 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 9597 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); 9598 9599 Value *Step = CreateStepValue(IndDesc.getStep(), SE, 9600 State.CFG.PrevBB->getTerminator()); 9601 Value *SclrGep = emitTransformedIndex( 9602 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); 9603 SclrGep->setName("next.gep"); 9604 State.set(this, SclrGep, VPIteration(Part, Lane)); 9605 } 9606 } 9607 return; 9608 } 9609 9610 assert(isa<SCEVConstant>(IndDesc.getStep()) && 9611 "Induction step not a SCEV constant!"); 9612 Type *PhiType = IndDesc.getStep()->getType(); 9613 9614 // Build a pointer phi 9615 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 9616 Type *ScStValueType = ScalarStartValue->getType(); 9617 PHINode *NewPointerPhi = 9618 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 9619 9620 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9621 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 9622 9623 // A pointer induction, performed by using a gep 9624 const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout(); 9625 Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); 9626 9627 const SCEV *ScalarStep = IndDesc.getStep(); 9628 SCEVExpander Exp(SE, DL, "induction"); 9629 Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 9630 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 9631 Value *NumUnrolledElems = 9632 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 9633 Value *InductionGEP = GetElementPtrInst::Create( 9634 IndDesc.getElementType(), NewPointerPhi, 9635 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 9636 InductionLoc); 9637 // Add induction update using an incorrect block temporarily. The phi node 9638 // will be fixed after VPlan execution. Note that at this point the latch 9639 // block cannot be used, as it does not exist yet. 9640 // TODO: Model increment value in VPlan, by turning the recipe into a 9641 // multi-def and a subclass of VPHeaderPHIRecipe. 
9642 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 9643 9644 // Create UF many actual address geps that use the pointer 9645 // phi as base and a vectorized version of the step value 9646 // (<step*0, ..., step*N>) as offset. 9647 for (unsigned Part = 0; Part < State.UF; ++Part) { 9648 Type *VecPhiType = VectorType::get(PhiType, State.VF); 9649 Value *StartOffsetScalar = 9650 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 9651 Value *StartOffset = 9652 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 9653 // Create a vector of consecutive numbers from zero to VF. 9654 StartOffset = State.Builder.CreateAdd( 9655 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 9656 9657 Value *GEP = State.Builder.CreateGEP( 9658 IndDesc.getElementType(), NewPointerPhi, 9659 State.Builder.CreateMul( 9660 StartOffset, 9661 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), 9662 "vector.gep")); 9663 State.set(this, GEP, Part); 9664 } 9665 } 9666 9667 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 9668 assert(!State.Instance && "VPScalarIVStepsRecipe being replicated."); 9669 9670 // Fast-math-flags propagate from the original induction instruction. 9671 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9672 if (IndDesc.getInductionBinOp() && 9673 isa<FPMathOperator>(IndDesc.getInductionBinOp())) 9674 State.Builder.setFastMathFlags( 9675 IndDesc.getInductionBinOp()->getFastMathFlags()); 9676 9677 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9678 auto CreateScalarIV = [&](Value *&Step) -> Value * { 9679 Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0)); 9680 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); 9681 if (!isCanonical() || CanonicalIV->getType() != Ty) { 9682 ScalarIV = 9683 Ty->isIntegerTy() 9684 ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty) 9685 : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty); 9686 ScalarIV = emitTransformedIndex(State.Builder, ScalarIV, 9687 getStartValue()->getLiveInIRValue(), Step, 9688 IndDesc); 9689 ScalarIV->setName("offset.idx"); 9690 } 9691 if (TruncToTy) { 9692 assert(Step->getType()->isIntegerTy() && 9693 "Truncation requires an integer step"); 9694 ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy); 9695 Step = State.Builder.CreateTrunc(Step, TruncToTy); 9696 } 9697 return ScalarIV; 9698 }; 9699 9700 Value *ScalarIV = CreateScalarIV(Step); 9701 if (State.VF.isVector()) { 9702 buildScalarSteps(ScalarIV, Step, IndDesc, this, State); 9703 return; 9704 } 9705 9706 for (unsigned Part = 0; Part < State.UF; ++Part) { 9707 assert(!State.VF.isScalable() && "scalable vectors not yet supported."); 9708 Value *EntryPart; 9709 if (Step->getType()->isFloatingPointTy()) { 9710 Value *StartIdx = 9711 getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part); 9712 // Floating-point operations inherit FMF via the builder's flags. 
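      // For part P this computes, in FP arithmetic, roughly
      //   EntryPart = ScalarIV <fadd|fsub> ((P * VF) * Step),
      // using the induction's own opcode so reversed (fsub) inductions are
      // handled as well.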
9713 Value *MulOp = State.Builder.CreateFMul(StartIdx, Step); 9714 EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(), 9715 ScalarIV, MulOp); 9716 } else { 9717 Value *StartIdx = 9718 getRuntimeVF(State.Builder, Step->getType(), State.VF * Part); 9719 EntryPart = State.Builder.CreateAdd( 9720 ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction"); 9721 } 9722 State.set(this, EntryPart, Part); 9723 } 9724 } 9725 9726 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9727 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9728 State); 9729 } 9730 9731 void VPBlendRecipe::execute(VPTransformState &State) { 9732 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9733 // We know that all PHIs in non-header blocks are converted into 9734 // selects, so we don't have to worry about the insertion order and we 9735 // can just use the builder. 9736 // At this point we generate the predication tree. There may be 9737 // duplications since this is a simple recursive scan, but future 9738 // optimizations will clean it up. 9739 9740 unsigned NumIncoming = getNumIncomingValues(); 9741 9742 // Generate a sequence of selects of the form: 9743 // SELECT(Mask3, In3, 9744 // SELECT(Mask2, In2, 9745 // SELECT(Mask1, In1, 9746 // In0))) 9747 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9748 // are essentially undef are taken from In0. 9749 InnerLoopVectorizer::VectorParts Entry(State.UF); 9750 for (unsigned In = 0; In < NumIncoming; ++In) { 9751 for (unsigned Part = 0; Part < State.UF; ++Part) { 9752 // We might have single edge PHIs (blocks) - use an identity 9753 // 'select' for the first PHI operand. 9754 Value *In0 = State.get(getIncomingValue(In), Part); 9755 if (In == 0) 9756 Entry[Part] = In0; // Initialize with the first incoming value. 9757 else { 9758 // Select between the current value and the previous incoming edge 9759 // based on the incoming mask. 9760 Value *Cond = State.get(getMask(In), Part); 9761 Entry[Part] = 9762 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9763 } 9764 } 9765 } 9766 for (unsigned Part = 0; Part < State.UF; ++Part) 9767 State.set(this, Entry[Part], Part); 9768 } 9769 9770 void VPInterleaveRecipe::execute(VPTransformState &State) { 9771 assert(!State.Instance && "Interleave group being replicated."); 9772 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9773 getStoredValues(), getMask()); 9774 } 9775 9776 void VPReductionRecipe::execute(VPTransformState &State) { 9777 assert(!State.Instance && "Reduction being replicated."); 9778 Value *PrevInChain = State.get(getChainOp(), 0); 9779 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9780 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9781 // Propagate the fast-math flags carried by the underlying instruction. 
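  // The RAII guard below saves the builder's current fast-math flags and
  // restores them on return, so the reduction's flags do not leak into IR
  // generated for later recipes.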
9782   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9783   State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9784   for (unsigned Part = 0; Part < State.UF; ++Part) {
9785     Value *NewVecOp = State.get(getVecOp(), Part);
9786     if (VPValue *Cond = getCondOp()) {
9787       Value *NewCond = State.get(Cond, Part);
9788       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9789       Value *Iden = RdxDesc->getRecurrenceIdentity(
9790           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9791       Value *IdenVec =
9792           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9793       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9794       NewVecOp = Select;
9795     }
9796     Value *NewRed;
9797     Value *NextInChain;
9798     if (IsOrdered) {
9799       if (State.VF.isVector())
9800         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9801                                         PrevInChain);
9802       else
9803         NewRed = State.Builder.CreateBinOp(
9804             (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9805             NewVecOp);
9806       PrevInChain = NewRed;
9807     } else {
9808       PrevInChain = State.get(getChainOp(), Part);
9809       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9810     }
9811     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9812       NextInChain =
9813           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9814                          NewRed, PrevInChain);
9815     } else if (IsOrdered)
9816       NextInChain = NewRed;
9817     else
9818       NextInChain = State.Builder.CreateBinOp(
9819           (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9820           PrevInChain);
9821     State.set(this, NextInChain, Part);
9822   }
9823 }
9824
9825 void VPReplicateRecipe::execute(VPTransformState &State) {
9826   if (State.Instance) { // Generate a single instance.
9827     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9828     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9829                                     IsPredicated, State);
9830     // Insert the scalar instance, packing it into a vector.
9831     if (AlsoPack && State.VF.isVector()) {
9832       // If we're constructing lane 0, initialize to start from poison.
9833       if (State.Instance->Lane.isFirstLane()) {
9834         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9835         Value *Poison = PoisonValue::get(
9836             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9837         State.set(this, Poison, State.Instance->Part);
9838       }
9839       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9840     }
9841     return;
9842   }
9843
9844   // Generate scalar instances for all VF lanes of all UF parts, unless the
9845   // instruction is uniform, in which case generate only the first lane for
9846   // each of the UF parts.
9847   unsigned EndLane = IsUniform ?
1 : State.VF.getKnownMinValue(); 9848 assert((!State.VF.isScalable() || IsUniform) && 9849 "Can't scalarize a scalable vector"); 9850 for (unsigned Part = 0; Part < State.UF; ++Part) 9851 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9852 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, 9853 VPIteration(Part, Lane), IsPredicated, 9854 State); 9855 } 9856 9857 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 9858 assert(State.Instance && "Branch on Mask works only on single instance."); 9859 9860 unsigned Part = State.Instance->Part; 9861 unsigned Lane = State.Instance->Lane.getKnownLane(); 9862 9863 Value *ConditionBit = nullptr; 9864 VPValue *BlockInMask = getMask(); 9865 if (BlockInMask) { 9866 ConditionBit = State.get(BlockInMask, Part); 9867 if (ConditionBit->getType()->isVectorTy()) 9868 ConditionBit = State.Builder.CreateExtractElement( 9869 ConditionBit, State.Builder.getInt32(Lane)); 9870 } else // Block in mask is all-one. 9871 ConditionBit = State.Builder.getTrue(); 9872 9873 // Replace the temporary unreachable terminator with a new conditional branch, 9874 // whose two destinations will be set later when they are created. 9875 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 9876 assert(isa<UnreachableInst>(CurrentTerminator) && 9877 "Expected to replace unreachable terminator with conditional branch."); 9878 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 9879 CondBr->setSuccessor(0, nullptr); 9880 ReplaceInstWithInst(CurrentTerminator, CondBr); 9881 } 9882 9883 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 9884 assert(State.Instance && "Predicated instruction PHI works per instance."); 9885 Instruction *ScalarPredInst = 9886 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 9887 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 9888 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 9889 assert(PredicatingBB && "Predicated block has no single predecessor."); 9890 assert(isa<VPReplicateRecipe>(getOperand(0)) && 9891 "operand must be VPReplicateRecipe"); 9892 9893 // By current pack/unpack logic we need to generate only a single phi node: if 9894 // a vector value for the predicated instruction exists at this point it means 9895 // the instruction has vector users only, and a phi for the vector value is 9896 // needed. In this case the recipe of the predicated instruction is marked to 9897 // also do that packing, thereby "hoisting" the insert-element sequence. 9898 // Otherwise, a phi node for the scalar value is needed. 9899 unsigned Part = State.Instance->Part; 9900 if (State.hasVectorValue(getOperand(0), Part)) { 9901 Value *VectorValue = State.get(getOperand(0), Part); 9902 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9903 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9904 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9905 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9906 if (State.hasVectorValue(this, Part)) 9907 State.reset(this, VPhi, Part); 9908 else 9909 State.set(this, VPhi, Part); 9910 // NOTE: Currently we need to update the value of the operand, so the next 9911 // predicated iteration inserts its generated value in the correct vector. 
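    // (State.reset() overwrites an entry that already exists, unlike
    // State.set(), which records a new one; in this branch the operand is
    // known to already have a vector value for this part.)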
9912 State.reset(getOperand(0), VPhi, Part); 9913 } else { 9914 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9915 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9916 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9917 PredicatingBB); 9918 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9919 if (State.hasScalarValue(this, *State.Instance)) 9920 State.reset(this, Phi, *State.Instance); 9921 else 9922 State.set(this, Phi, *State.Instance); 9923 // NOTE: Currently we need to update the value of the operand, so the next 9924 // predicated iteration inserts its generated value in the correct vector. 9925 State.reset(getOperand(0), Phi, *State.Instance); 9926 } 9927 } 9928 9929 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9930 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9931 9932 // Attempt to issue a wide load. 9933 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9934 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9935 9936 assert((LI || SI) && "Invalid Load/Store instruction"); 9937 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9938 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9939 9940 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9941 9942 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9943 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9944 bool CreateGatherScatter = !Consecutive; 9945 9946 auto &Builder = State.Builder; 9947 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9948 bool isMaskRequired = getMask(); 9949 if (isMaskRequired) 9950 for (unsigned Part = 0; Part < State.UF; ++Part) 9951 BlockInMaskParts[Part] = State.get(getMask(), Part); 9952 9953 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9954 // Calculate the pointer for the specific unroll-part. 9955 GetElementPtrInst *PartPtr = nullptr; 9956 9957 bool InBounds = false; 9958 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9959 InBounds = gep->isInBounds(); 9960 if (Reverse) { 9961 // If the address is consecutive but reversed, then the 9962 // wide store needs to start at the last vector element. 9963 // RunTimeVF = VScale * VF.getKnownMinValue() 9964 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9965 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 9966 // NumElt = -Part * RunTimeVF 9967 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 9968 // LastLane = 1 - RunTimeVF 9969 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 9970 PartPtr = 9971 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 9972 PartPtr->setIsInBounds(InBounds); 9973 PartPtr = cast<GetElementPtrInst>( 9974 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 9975 PartPtr->setIsInBounds(InBounds); 9976 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
9977 BlockInMaskParts[Part] = 9978 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9979 } else { 9980 Value *Increment = 9981 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 9982 PartPtr = cast<GetElementPtrInst>( 9983 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 9984 PartPtr->setIsInBounds(InBounds); 9985 } 9986 9987 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 9988 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 9989 }; 9990 9991 // Handle Stores: 9992 if (SI) { 9993 State.ILV->setDebugLocFromInst(SI); 9994 9995 for (unsigned Part = 0; Part < State.UF; ++Part) { 9996 Instruction *NewSI = nullptr; 9997 Value *StoredVal = State.get(StoredValue, Part); 9998 if (CreateGatherScatter) { 9999 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 10000 Value *VectorGep = State.get(getAddr(), Part); 10001 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 10002 MaskPart); 10003 } else { 10004 if (Reverse) { 10005 // If we store to reverse consecutive memory locations, then we need 10006 // to reverse the order of elements in the stored value. 10007 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 10008 // We don't want to update the value in the map as it might be used in 10009 // another expression. So don't call resetVectorValue(StoredVal). 10010 } 10011 auto *VecPtr = 10012 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10013 if (isMaskRequired) 10014 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 10015 BlockInMaskParts[Part]); 10016 else 10017 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 10018 } 10019 State.ILV->addMetadata(NewSI, SI); 10020 } 10021 return; 10022 } 10023 10024 // Handle loads. 10025 assert(LI && "Must have a load instruction"); 10026 State.ILV->setDebugLocFromInst(LI); 10027 for (unsigned Part = 0; Part < State.UF; ++Part) { 10028 Value *NewLI; 10029 if (CreateGatherScatter) { 10030 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 10031 Value *VectorGep = State.get(getAddr(), Part); 10032 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 10033 nullptr, "wide.masked.gather"); 10034 State.ILV->addMetadata(NewLI, LI); 10035 } else { 10036 auto *VecPtr = 10037 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10038 if (isMaskRequired) 10039 NewLI = Builder.CreateMaskedLoad( 10040 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 10041 PoisonValue::get(DataTy), "wide.masked.load"); 10042 else 10043 NewLI = 10044 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 10045 10046 // Add metadata to the load, but setVectorValue to the reverse shuffle. 10047 State.ILV->addMetadata(NewLI, LI); 10048 if (Reverse) 10049 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 10050 } 10051 10052 State.set(getVPSingleValue(), NewLI, Part); 10053 } 10054 } 10055 10056 // Determine how to lower the scalar epilogue, which depends on 1) optimising 10057 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 10058 // predication, and 4) a TTI hook that analyses whether the loop is suitable 10059 // for predication. 
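// The checks below are ordered by priority: optimising for size (1) overrides
// the -prefer-predicate-over-epilogue option (2), which overrides the loop
// hints (3), which in turn override the TTI preference (4).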
10060 static ScalarEpilogueLowering getScalarEpilogueLowering( 10061 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 10062 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10063 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 10064 LoopVectorizationLegality &LVL) { 10065 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10066 // don't look at hints or options, and don't request a scalar epilogue. 10067 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10068 // LoopAccessInfo (due to code dependency and not being able to reliably get 10069 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10070 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10071 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10072 // back to the old way and vectorize with versioning when forced. See D81345.) 10073 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10074 PGSOQueryType::IRPass) && 10075 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10076 return CM_ScalarEpilogueNotAllowedOptSize; 10077 10078 // 2) If set, obey the directives 10079 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10080 switch (PreferPredicateOverEpilogue) { 10081 case PreferPredicateTy::ScalarEpilogue: 10082 return CM_ScalarEpilogueAllowed; 10083 case PreferPredicateTy::PredicateElseScalarEpilogue: 10084 return CM_ScalarEpilogueNotNeededUsePredicate; 10085 case PreferPredicateTy::PredicateOrDontVectorize: 10086 return CM_ScalarEpilogueNotAllowedUsePredicate; 10087 }; 10088 } 10089 10090 // 3) If set, obey the hints 10091 switch (Hints.getPredicate()) { 10092 case LoopVectorizeHints::FK_Enabled: 10093 return CM_ScalarEpilogueNotNeededUsePredicate; 10094 case LoopVectorizeHints::FK_Disabled: 10095 return CM_ScalarEpilogueAllowed; 10096 }; 10097 10098 // 4) if the TTI hook indicates this is profitable, request predication. 10099 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10100 LVL.getLAI())) 10101 return CM_ScalarEpilogueNotNeededUsePredicate; 10102 10103 return CM_ScalarEpilogueAllowed; 10104 } 10105 10106 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10107 // If Values have been set for this Def return the one relevant for \p Part. 10108 if (hasVectorValue(Def, Part)) 10109 return Data.PerPartOutput[Def][Part]; 10110 10111 if (!hasScalarValue(Def, {Part, 0})) { 10112 Value *IRV = Def->getLiveInIRValue(); 10113 Value *B = ILV->getBroadcastInstrs(IRV); 10114 set(Def, B, Part); 10115 return B; 10116 } 10117 10118 Value *ScalarValue = get(Def, {Part, 0}); 10119 // If we aren't vectorizing, we can just copy the scalar map values over 10120 // to the vector map. 10121 if (VF.isScalar()) { 10122 set(Def, ScalarValue, Part); 10123 return ScalarValue; 10124 } 10125 10126 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10127 bool IsUniform = RepR && RepR->isUniform(); 10128 10129 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 10130 // Check if there is a scalar value for the selected lane. 10131 if (!hasScalarValue(Def, {Part, LastLane})) { 10132 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 
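    // (VPScalarIVStepsRecipes may likewise define only their first lane, which
    // is why the assert below accepts both recipe kinds before falling back to
    // broadcasting lane zero.)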
10133 assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) || 10134 isa<VPScalarIVStepsRecipe>(Def->getDef())) && 10135 "unexpected recipe found to be invariant"); 10136 IsUniform = true; 10137 LastLane = 0; 10138 } 10139 10140 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10141 // Set the insert point after the last scalarized instruction or after the 10142 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10143 // will directly follow the scalar definitions. 10144 auto OldIP = Builder.saveIP(); 10145 auto NewIP = 10146 isa<PHINode>(LastInst) 10147 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10148 : std::next(BasicBlock::iterator(LastInst)); 10149 Builder.SetInsertPoint(&*NewIP); 10150 10151 // However, if we are vectorizing, we need to construct the vector values. 10152 // If the value is known to be uniform after vectorization, we can just 10153 // broadcast the scalar value corresponding to lane zero for each unroll 10154 // iteration. Otherwise, we construct the vector values using 10155 // insertelement instructions. Since the resulting vectors are stored in 10156 // State, we will only generate the insertelements once. 10157 Value *VectorValue = nullptr; 10158 if (IsUniform) { 10159 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10160 set(Def, VectorValue, Part); 10161 } else { 10162 // Initialize packing with insertelements to start from undef. 10163 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10164 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10165 set(Def, Undef, Part); 10166 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10167 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10168 VectorValue = get(Def, Part); 10169 } 10170 Builder.restoreIP(OldIP); 10171 return VectorValue; 10172 } 10173 10174 // Process the loop in the VPlan-native vectorization path. This path builds 10175 // VPlan upfront in the vectorization pipeline, which allows to apply 10176 // VPlan-to-VPlan transformations from the very beginning without modifying the 10177 // input LLVM IR. 10178 static bool processLoopInVPlanNativePath( 10179 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10180 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10181 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10182 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10183 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10184 LoopVectorizationRequirements &Requirements) { 10185 10186 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10187 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10188 return false; 10189 } 10190 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10191 Function *F = L->getHeader()->getParent(); 10192 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10193 10194 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10195 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10196 10197 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10198 &Hints, IAI); 10199 // Use the planner for outer loop vectorization. 10200 // TODO: CM is not used at this point inside the planner. Turn CM into an 10201 // optional argument if we don't need it in the future. 10202 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10203 Requirements, ORE); 10204 10205 // Get user vectorization factor. 
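  // (A user-provided width typically originates from
  // '#pragma clang loop vectorize_width(N)', recorded as
  // llvm.loop.vectorize.width metadata on the loop.)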
10206 ElementCount UserVF = Hints.getWidth(); 10207 10208 CM.collectElementTypesForWidening(); 10209 10210 // Plan how to best vectorize, return the best VF and its cost. 10211 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10212 10213 // If we are stress testing VPlan builds, do not attempt to generate vector 10214 // code. Masked vector code generation support will follow soon. 10215 // Also, do not attempt to vectorize if no vector code will be produced. 10216 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) 10217 return false; 10218 10219 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10220 10221 { 10222 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10223 F->getParent()->getDataLayout()); 10224 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10225 &CM, BFI, PSI, Checks); 10226 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10227 << L->getHeader()->getParent()->getName() << "\"\n"); 10228 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); 10229 } 10230 10231 // Mark the loop as already vectorized to avoid vectorizing again. 10232 Hints.setAlreadyVectorized(); 10233 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10234 return true; 10235 } 10236 10237 // Emit a remark if there are stores to floats that required a floating point 10238 // extension. If the vectorized loop was generated with floating point there 10239 // will be a performance penalty from the conversion overhead and the change in 10240 // the vector width. 10241 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10242 SmallVector<Instruction *, 4> Worklist; 10243 for (BasicBlock *BB : L->getBlocks()) { 10244 for (Instruction &Inst : *BB) { 10245 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10246 if (S->getValueOperand()->getType()->isFloatTy()) 10247 Worklist.push_back(S); 10248 } 10249 } 10250 } 10251 10252 // Traverse the floating point stores upwards searching, for floating point 10253 // conversions. 10254 SmallPtrSet<const Instruction *, 4> Visited; 10255 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10256 while (!Worklist.empty()) { 10257 auto *I = Worklist.pop_back_val(); 10258 if (!L->contains(I)) 10259 continue; 10260 if (!Visited.insert(I).second) 10261 continue; 10262 10263 // Emit a remark if the floating point store required a floating 10264 // point conversion. 10265 // TODO: More work could be done to identify the root cause such as a 10266 // constant or a function return type and point the user to it. 10267 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10268 ORE->emit([&]() { 10269 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10270 I->getDebugLoc(), L->getHeader()) 10271 << "floating point conversion changes vector width. " 10272 << "Mixed floating point precision requires an up/down " 10273 << "cast that will negatively impact performance."; 10274 }); 10275 10276 for (Use &Op : I->operands()) 10277 if (auto *OpI = dyn_cast<Instruction>(Op)) 10278 Worklist.push_back(OpI); 10279 } 10280 } 10281 10282 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10283 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10284 !EnableLoopInterleaving), 10285 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10286 !EnableLoopVectorization) {} 10287 10288 bool LoopVectorizePass::processLoop(Loop *L) { 10289 assert((EnableVPlanNativePath || L->isInnermost()) && 10290 "VPlan-native path is not enabled. 
Only process inner loops."); 10291 10292 #ifndef NDEBUG 10293 const std::string DebugLocStr = getDebugLocString(L); 10294 #endif /* NDEBUG */ 10295 10296 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 10297 << L->getHeader()->getParent()->getName() << "' from " 10298 << DebugLocStr << "\n"); 10299 10300 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10301 10302 LLVM_DEBUG( 10303 dbgs() << "LV: Loop hints:" 10304 << " force=" 10305 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10306 ? "disabled" 10307 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10308 ? "enabled" 10309 : "?")) 10310 << " width=" << Hints.getWidth() 10311 << " interleave=" << Hints.getInterleave() << "\n"); 10312 10313 // Function containing loop 10314 Function *F = L->getHeader()->getParent(); 10315 10316 // Looking at the diagnostic output is the only way to determine if a loop 10317 // was vectorized (other than looking at the IR or machine code), so it 10318 // is important to generate an optimization remark for each loop. Most of 10319 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10320 // generated as OptimizationRemark and OptimizationRemarkMissed are 10321 // less verbose reporting vectorized loops and unvectorized loops that may 10322 // benefit from vectorization, respectively. 10323 10324 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10325 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10326 return false; 10327 } 10328 10329 PredicatedScalarEvolution PSE(*SE, *L); 10330 10331 // Check if it is legal to vectorize the loop. 10332 LoopVectorizationRequirements Requirements; 10333 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10334 &Requirements, &Hints, DB, AC, BFI, PSI); 10335 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10336 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10337 Hints.emitRemarkWithHints(); 10338 return false; 10339 } 10340 10341 // Check the function attributes and profiles to find out if this function 10342 // should be optimized for size. 10343 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10344 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10345 10346 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10347 // here. They may require CFG and instruction level transformations before 10348 // even evaluating whether vectorization is profitable. Since we cannot modify 10349 // the incoming IR, we need to build VPlan upfront in the vectorization 10350 // pipeline. 10351 if (!L->isInnermost()) 10352 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10353 ORE, BFI, PSI, Hints, Requirements); 10354 10355 assert(L->isInnermost() && "Inner loop expected."); 10356 10357 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10358 // count by optimizing for size, to minimize overheads. 10359 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10360 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10361 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. 
" 10362 << "This loop is worth vectorizing only if no scalar " 10363 << "iteration overheads are incurred."); 10364 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10365 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10366 else { 10367 LLVM_DEBUG(dbgs() << "\n"); 10368 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10369 } 10370 } 10371 10372 // Check the function attributes to see if implicit floats are allowed. 10373 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10374 // an integer loop and the vector instructions selected are purely integer 10375 // vector instructions? 10376 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10377 reportVectorizationFailure( 10378 "Can't vectorize when the NoImplicitFloat attribute is used", 10379 "loop not vectorized due to NoImplicitFloat attribute", 10380 "NoImplicitFloat", ORE, L); 10381 Hints.emitRemarkWithHints(); 10382 return false; 10383 } 10384 10385 // Check if the target supports potentially unsafe FP vectorization. 10386 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10387 // for the target we're vectorizing for, to make sure none of the 10388 // additional fp-math flags can help. 10389 if (Hints.isPotentiallyUnsafe() && 10390 TTI->isFPVectorizationPotentiallyUnsafe()) { 10391 reportVectorizationFailure( 10392 "Potentially unsafe FP op prevents vectorization", 10393 "loop not vectorized due to unsafe FP support.", 10394 "UnsafeFP", ORE, L); 10395 Hints.emitRemarkWithHints(); 10396 return false; 10397 } 10398 10399 bool AllowOrderedReductions; 10400 // If the flag is set, use that instead and override the TTI behaviour. 10401 if (ForceOrderedReductions.getNumOccurrences() > 0) 10402 AllowOrderedReductions = ForceOrderedReductions; 10403 else 10404 AllowOrderedReductions = TTI->enableOrderedReductions(); 10405 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10406 ORE->emit([&]() { 10407 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10408 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10409 ExactFPMathInst->getDebugLoc(), 10410 ExactFPMathInst->getParent()) 10411 << "loop not vectorized: cannot prove it is safe to reorder " 10412 "floating-point operations"; 10413 }); 10414 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10415 "reorder floating-point operations\n"); 10416 Hints.emitRemarkWithHints(); 10417 return false; 10418 } 10419 10420 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10421 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10422 10423 // If an override option has been passed in for interleaved accesses, use it. 10424 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10425 UseInterleaved = EnableInterleavedMemAccesses; 10426 10427 // Analyze interleaved memory accesses. 10428 if (UseInterleaved) { 10429 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10430 } 10431 10432 // Use the cost model. 10433 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10434 F, &Hints, IAI); 10435 CM.collectValuesToIgnore(); 10436 CM.collectElementTypesForWidening(); 10437 10438 // Use the planner for vectorization. 10439 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 10440 Requirements, ORE); 10441 10442 // Get user vectorization factor and interleave count. 
10443   ElementCount UserVF = Hints.getWidth();
10444   unsigned UserIC = Hints.getInterleave();
10445
10446   // Plan how to best vectorize, return the best VF and its cost.
10447   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10448
10449   VectorizationFactor VF = VectorizationFactor::Disabled();
10450   unsigned IC = 1;
10451
10452   if (MaybeVF) {
10453     if (LVP.requiresTooManyRuntimeChecks()) {
10454       ORE->emit([&]() {
10455         return OptimizationRemarkAnalysisAliasing(
10456                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10457                    L->getHeader())
10458                << "loop not vectorized: cannot prove it is safe to reorder "
10459                   "memory operations";
10460       });
10461       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10462       Hints.emitRemarkWithHints();
10463       return false;
10464     }
10465     VF = *MaybeVF;
10466     // Select the interleave count.
10467     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10468   }
10469
10470   // Identify the diagnostic messages that should be produced.
10471   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10472   bool VectorizeLoop = true, InterleaveLoop = true;
10473   if (VF.Width.isScalar()) {
10474     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10475     VecDiagMsg = std::make_pair(
10476         "VectorizationNotBeneficial",
10477         "the cost-model indicates that vectorization is not beneficial");
10478     VectorizeLoop = false;
10479   }
10480
10481   if (!MaybeVF && UserIC > 1) {
10482     // Tell the user interleaving was avoided up-front, despite being explicitly
10483     // requested.
10484     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10485                          "interleaving should be avoided up front\n");
10486     IntDiagMsg = std::make_pair(
10487         "InterleavingAvoided",
10488         "Ignoring UserIC, because interleaving was avoided up front");
10489     InterleaveLoop = false;
10490   } else if (IC == 1 && UserIC <= 1) {
10491     // Tell the user interleaving is not beneficial.
10492     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10493     IntDiagMsg = std::make_pair(
10494         "InterleavingNotBeneficial",
10495         "the cost-model indicates that interleaving is not beneficial");
10496     InterleaveLoop = false;
10497     if (UserIC == 1) {
10498       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10499       IntDiagMsg.second +=
10500           " and is explicitly disabled or interleave count is set to 1";
10501     }
10502   } else if (IC > 1 && UserIC == 1) {
10503     // Tell the user interleaving is beneficial, but it is explicitly disabled.
10504     LLVM_DEBUG(
10505         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10506     IntDiagMsg = std::make_pair(
10507         "InterleavingBeneficialButDisabled",
10508         "the cost-model indicates that interleaving is beneficial "
10509         "but is explicitly disabled or interleave count is set to 1");
10510     InterleaveLoop = false;
10511   }
10512
10513   // Override IC if the user provided an interleave count.
10514   IC = UserIC > 0 ? UserIC : IC;
10515
10516   // Emit diagnostic messages, if any.
10517   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10518   if (!VectorizeLoop && !InterleaveLoop) {
10519     // Do not vectorize or interleave the loop.
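    // Still emit both missed-optimization remarks, so users running with
    // remarks enabled (e.g. -Rpass-missed=loop-vectorize) can see why nothing
    // was done, and then bail out.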
10520     ORE->emit([&]() {
10521       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10522                                       L->getStartLoc(), L->getHeader())
10523              << VecDiagMsg.second;
10524     });
10525     ORE->emit([&]() {
10526       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10527                                       L->getStartLoc(), L->getHeader())
10528              << IntDiagMsg.second;
10529     });
10530     return false;
10531   } else if (!VectorizeLoop && InterleaveLoop) {
10532     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10533     ORE->emit([&]() {
10534       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10535                                         L->getStartLoc(), L->getHeader())
10536              << VecDiagMsg.second;
10537     });
10538   } else if (VectorizeLoop && !InterleaveLoop) {
10539     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10540                       << ") in " << DebugLocStr << '\n');
10541     ORE->emit([&]() {
10542       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10543                                         L->getStartLoc(), L->getHeader())
10544              << IntDiagMsg.second;
10545     });
10546   } else if (VectorizeLoop && InterleaveLoop) {
10547     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10548                       << ") in " << DebugLocStr << '\n');
10549     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10550   }
10551
10552   bool DisableRuntimeUnroll = false;
10553   MDNode *OrigLoopID = L->getLoopID();
10554   {
10555     // Optimistically generate runtime checks. Drop them if they turn out not
10556     // to be profitable. Limit the scope of Checks, so the cleanup happens
10557     // immediately after vector code generation is done.
10558     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10559                              F->getParent()->getDataLayout());
10560     if (!VF.Width.isScalar() || IC > 1)
10561       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, IC);
10562
10563     using namespace ore;
10564     if (!VectorizeLoop) {
10565       assert(IC > 1 && "interleave count should not be 1 or 0");
10566       // If we decided that it is not profitable to vectorize the loop, then
10567       // interleave it.
10568       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10569                                  &CM, BFI, PSI, Checks);
10570
10571       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10572       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
10573
10574       ORE->emit([&]() {
10575         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10576                                   L->getHeader())
10577                << "interleaved loop (interleaved count: "
10578                << NV("InterleaveCount", IC) << ")";
10579       });
10580     } else {
10581       // If we decided that it is profitable to vectorize the loop, then do it.
10582
10583       // Consider vectorizing the epilogue too if it's profitable.
10584       VectorizationFactor EpilogueVF =
10585           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10586       if (EpilogueVF.Width.isVector()) {
10587
10588         // The first pass vectorizes the main loop and creates a scalar epilogue
10589         // to be vectorized by executing the plan (potentially with a different
10590         // factor) again shortly afterwards.
10591         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10592         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10593                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10594
10595         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10596         LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10597                         DT);
10598         ++LoopsVectorized;
10599
10600         // Second pass vectorizes the epilogue and adjusts the control flow
10601         // edges from the first pass.
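        // For that second pass, the "main loop" factors in EPI are overwritten
        // with the epilogue factors, so the same planning and execution
        // machinery can be reused below.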
10602 EPI.MainLoopVF = EPI.EpilogueVF; 10603 EPI.MainLoopUF = EPI.EpilogueUF; 10604 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10605 ORE, EPI, &LVL, &CM, BFI, PSI, 10606 Checks); 10607 10608 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10609 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); 10610 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); 10611 Header->setName("vec.epilog.vector.body"); 10612 10613 // Ensure that the start values for any VPReductionPHIRecipes are 10614 // updated before vectorising the epilogue loop. 10615 for (VPRecipeBase &R : Header->phis()) { 10616 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10617 if (auto *Resume = MainILV.getReductionResumeValue( 10618 ReductionPhi->getRecurrenceDescriptor())) { 10619 VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume); 10620 ReductionPhi->setOperand(0, StartVal); 10621 } 10622 } 10623 } 10624 10625 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10626 DT); 10627 ++LoopsEpilogueVectorized; 10628 10629 if (!MainILV.areSafetyChecksAdded()) 10630 DisableRuntimeUnroll = true; 10631 } else { 10632 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10633 &LVL, &CM, BFI, PSI, Checks); 10634 10635 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10636 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10637 ++LoopsVectorized; 10638 10639 // Add metadata to disable runtime unrolling a scalar loop when there 10640 // are no runtime checks about strides and memory. A scalar loop that is 10641 // rarely used is not worth unrolling. 10642 if (!LB.areSafetyChecksAdded()) 10643 DisableRuntimeUnroll = true; 10644 } 10645 // Report the vectorization decision. 10646 ORE->emit([&]() { 10647 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10648 L->getHeader()) 10649 << "vectorized loop (vectorization width: " 10650 << NV("VectorizationFactor", VF.Width) 10651 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10652 }); 10653 } 10654 10655 if (ORE->allowExtraAnalysis(LV_NAME)) 10656 checkMixedPrecision(L, ORE); 10657 } 10658 10659 Optional<MDNode *> RemainderLoopID = 10660 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10661 LLVMLoopVectorizeFollowupEpilogue}); 10662 if (RemainderLoopID.hasValue()) { 10663 L->setLoopID(RemainderLoopID.getValue()); 10664 } else { 10665 if (DisableRuntimeUnroll) 10666 AddRuntimeUnrollDisableMetaData(L); 10667 10668 // Mark the loop as already vectorized to avoid vectorizing again. 10669 Hints.setAlreadyVectorized(); 10670 } 10671 10672 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10673 return true; 10674 } 10675 10676 LoopVectorizeResult LoopVectorizePass::runImpl( 10677 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10678 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10679 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10680 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10681 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10682 SE = &SE_; 10683 LI = &LI_; 10684 TTI = &TTI_; 10685 DT = &DT_; 10686 BFI = &BFI_; 10687 TLI = TLI_; 10688 AA = &AA_; 10689 AC = &AC_; 10690 GetLAA = &GetLAA_; 10691 DB = &DB_; 10692 ORE = &ORE_; 10693 PSI = PSI_; 10694 10695 // Don't attempt if 10696 // 1. the target claims to have no vector registers, and 10697 // 2. interleaving won't help ILP. 
10698   //
10699   // The second condition is necessary because, even if the target has no
10700   // vector registers, loop vectorization may still enable scalar
10701   // interleaving.
10702   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10703       TTI->getMaxInterleaveFactor(1) < 2)
10704     return LoopVectorizeResult(false, false);
10705
10706   bool Changed = false, CFGChanged = false;
10707
10708   // The vectorizer requires loops to be in simplified form.
10709   // Since simplification may add new inner loops, it has to run before the
10710   // legality and profitability checks. This means running the loop vectorizer
10711   // will simplify all loops, regardless of whether anything ends up being
10712   // vectorized.
10713   for (auto &L : *LI)
10714     Changed |= CFGChanged |=
10715         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10716
10717   // Build up a worklist of inner-loops to vectorize. This is necessary as
10718   // the act of vectorizing or partially unrolling a loop creates new loops
10719   // and can invalidate iterators across the loops.
10720   SmallVector<Loop *, 8> Worklist;
10721
10722   for (Loop *L : *LI)
10723     collectSupportedLoops(*L, LI, ORE, Worklist);
10724
10725   LoopsAnalyzed += Worklist.size();
10726
10727   // Now walk the identified inner loops.
10728   while (!Worklist.empty()) {
10729     Loop *L = Worklist.pop_back_val();
10730
10731     // For the inner loops we actually process, form LCSSA to simplify the
10732     // transform.
10733     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10734
10735     Changed |= CFGChanged |= processLoop(L);
10736   }
10737
10738   // Process each loop nest in the function.
10739   return LoopVectorizeResult(Changed, CFGChanged);
10740 }
10741
10742 PreservedAnalyses LoopVectorizePass::run(Function &F,
10743                                          FunctionAnalysisManager &AM) {
10744   auto &LI = AM.getResult<LoopAnalysis>(F);
10745   // If there are no loops in the function, return before computing other
10746   // expensive analyses.
10747   if (LI.empty())
10748     return PreservedAnalyses::all();
10749   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10750   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10751   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10752   auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10753   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10754   auto &AA = AM.getResult<AAManager>(F);
10755   auto &AC = AM.getResult<AssumptionAnalysis>(F);
10756   auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10757   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10758
10759   auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10760   std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10761       [&](Loop &L) -> const LoopAccessInfo & {
10762     LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
10763                                       TLI, TTI, nullptr, nullptr, nullptr};
10764     return LAM.getResult<LoopAccessAnalysis>(L, AR);
10765   };
10766   auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10767   ProfileSummaryInfo *PSI =
10768       MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10769   LoopVectorizeResult Result =
10770       runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10771   if (!Result.MadeAnyChange)
10772     return PreservedAnalyses::all();
10773   PreservedAnalyses PA;
10774
10775   // We currently do not preserve LoopInfo/dominator analyses with outer-loop
10776   // vectorization. Until this is addressed, mark these analyses as preserved
10777   // only for the non-VPlan-native path.
10778 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 10779 if (!EnableVPlanNativePath) { 10780 PA.preserve<LoopAnalysis>(); 10781 PA.preserve<DominatorTreeAnalysis>(); 10782 } 10783 10784 if (Result.MadeCFGChange) { 10785 // Making CFG changes likely means a loop got vectorized. Indicate that 10786 // extra simplification passes should be run. 10787 // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only 10788 // be run if runtime checks have been added. 10789 AM.getResult<ShouldRunExtraVectorPasses>(F); 10790 PA.preserve<ShouldRunExtraVectorPasses>(); 10791 } else { 10792 PA.preserveSet<CFGAnalyses>(); 10793 } 10794 return PA; 10795 } 10796 10797 void LoopVectorizePass::printPipeline( 10798 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { 10799 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( 10800 OS, MapClassName2PassName); 10801 10802 OS << "<"; 10803 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;"; 10804 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;"; 10805 OS << ">"; 10806 } 10807
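// For reference: with the default options, printing the pass pipeline (e.g.
// via -print-pipeline-passes) is expected to render this pass roughly as
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>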