//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one; see the example at the end of
// this comment.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
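//
// As an illustration (conceptual only, not a transcript of the produced IR),
// with a vectorization factor of 4 a loop such as
//
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + c[i];
//
// is rewritten so that each iteration handles four elements and the induction
// variable is stepped by 4:
//
//   for (i = 0; i < n - n % 4; i += 4)
//     a[i:i+3] = b[i:i+3] + c[i:i+3];   // one wide SIMD add
//
// with the remaining n % 4 iterations executed by a scalar epilogue loop, or
// folded into the vector body under predication.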
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; this enum lists the options.
// I.e., the vectorizer will try to fold the tail loop (epilogue) into the
// vector body and predicate the instructions accordingly.
// If tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));
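
// For illustration only: like the other flags above, these are ordinary
// cl::opt options and can be passed directly to 'opt' alongside the pass,
// e.g. (hypothetical test invocation):
//   opt -passes=loop-vectorize -enable-vplan-native-path -S < input.ll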

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars.
/// This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops.
  virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;
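  // For example, with UF == 2 the widened form of an original value is held
  // as two vector Values, one per unrolled copy of the vector loop body.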

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p RepRecipe instead
  /// of \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Fix the non-induction PHIs in \p Plan.
  void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  // Returns the resume value (bc.merge.rdx) for a reduction as
  // generated by fixReduction.
  PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *VectorTripCount, Value *EndValue,
                    BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
                    VPlan &Plan);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
                               VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(BasicBlock *InsertBlock);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID);

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  /// BFI and PSI are used to check for profile-guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *>
  createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }
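
  // The two derived strategy classes below, EpilogueVectorizerMainLoop and
  // EpilogueVectorizerEpilogueLoop, provide the concrete implementations of
  // createEpilogueVectorizedLoopSkeleton.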

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back
    // to using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

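/// Return the runtime value for VF as a floating-point value of type \p FTy:
/// the integer runtime VF is computed first and then converted with uitofp.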
static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction &UnderlyingInstr = WidenRec->getIngredient();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        if (AddrDef && WidenRec->isConsecutive() &&
            Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}

PHINode *InnerLoopVectorizer::getReductionResumeValue(
    const RecurrenceDescriptor &RdxDesc) {
  auto It = ReductionResumeValues.find(&RdxDesc);
  assert(It != ReductionResumeValues.end() &&
         "Expected to find a resume value for the reduction.");
  return It->second;
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// Memory access instructions may be vectorized in more than one way. The
  /// form of an instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
1342 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1343 assert(VF.isVector() && "Expected VF >=2"); 1344 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1345 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1346 "The cost is not calculated"); 1347 return WideningDecisions[InstOnVF].second; 1348 } 1349 1350 /// Return True if instruction \p I is an optimizable truncate whose operand 1351 /// is an induction variable. Such a truncate will be removed by adding a new 1352 /// induction variable with the destination type. 1353 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1354 // If the instruction is not a truncate, return false. 1355 auto *Trunc = dyn_cast<TruncInst>(I); 1356 if (!Trunc) 1357 return false; 1358 1359 // Get the source and destination types of the truncate. 1360 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1361 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1362 1363 // If the truncate is free for the given types, return false. Replacing a 1364 // free truncate with an induction variable would add an induction variable 1365 // update instruction to each iteration of the loop. We exclude from this 1366 // check the primary induction variable since it will need an update 1367 // instruction regardless. 1368 Value *Op = Trunc->getOperand(0); 1369 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1370 return false; 1371 1372 // If the truncated value is not an induction variable, return false. 1373 return Legal->isInductionPhi(Op); 1374 } 1375 1376 /// Collects the instructions to scalarize for each predicated instruction in 1377 /// the loop. 1378 void collectInstsToScalarize(ElementCount VF); 1379 1380 /// Collect Uniform and Scalar values for the given \p VF. 1381 /// The sets depend on CM decision for Load/Store instructions 1382 /// that may be vectorized as interleave, gather-scatter or scalarized. 1383 void collectUniformsAndScalars(ElementCount VF) { 1384 // Do the analysis once. 1385 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1386 return; 1387 setCostBasedWideningDecision(VF); 1388 collectLoopUniforms(VF); 1389 collectLoopScalars(VF); 1390 } 1391 1392 /// Returns true if the target machine supports masked store operation 1393 /// for the given \p DataType and kind of access to \p Ptr. 1394 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1395 return Legal->isConsecutivePtr(DataType, Ptr) && 1396 TTI.isLegalMaskedStore(DataType, Alignment); 1397 } 1398 1399 /// Returns true if the target machine supports masked load operation 1400 /// for the given \p DataType and kind of access to \p Ptr. 1401 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1402 return Legal->isConsecutivePtr(DataType, Ptr) && 1403 TTI.isLegalMaskedLoad(DataType, Alignment); 1404 } 1405 1406 /// Returns true if the target machine can represent \p V as a masked gather 1407 /// or scatter operation. 
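/// For a load this asks the target whether a masked gather of the access type
/// widened to \p VF is legal (e.g. <4 x i32> for an i32 load with VF = 4); for
/// a store it asks about a masked scatter, both at the access's alignment.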
1408 bool isLegalGatherOrScatter(Value *V, 1409 ElementCount VF = ElementCount::getFixed(1)) { 1410 bool LI = isa<LoadInst>(V); 1411 bool SI = isa<StoreInst>(V); 1412 if (!LI && !SI) 1413 return false; 1414 auto *Ty = getLoadStoreType(V); 1415 Align Align = getLoadStoreAlignment(V); 1416 if (VF.isVector()) 1417 Ty = VectorType::get(Ty, VF); 1418 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1419 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1420 } 1421 1422 /// Returns true if the target machine supports all of the reduction 1423 /// variables found for the given VF. 1424 bool canVectorizeReductions(ElementCount VF) const { 1425 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1426 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1427 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1428 })); 1429 } 1430 1431 /// Returns true if \p I is an instruction that will be scalarized with 1432 /// predication when vectorizing \p I with vectorization factor \p VF. Such 1433 /// instructions include conditional stores and instructions that may divide 1434 /// by zero. 1435 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1436 1437 // Returns true if \p I is an instruction that will be predicated either 1438 // through scalar predication or masked load/store or masked gather/scatter. 1439 // \p VF is the vectorization factor that will be used to vectorize \p I. 1440 // Superset of instructions that return true for isScalarWithPredication. 1441 bool isPredicatedInst(Instruction *I, ElementCount VF, 1442 bool IsKnownUniform = false) { 1443 // When we know the load is uniform and the original scalar loop was not 1444 // predicated we don't need to mark it as a predicated instruction. Any 1445 // vectorised blocks created when tail-folding are something artificial we 1446 // have introduced and we know there is always at least one active lane. 1447 // That's why we call Legal->blockNeedsPredication here because it doesn't 1448 // query tail-folding. 1449 if (IsKnownUniform && isa<LoadInst>(I) && 1450 !Legal->blockNeedsPredication(I->getParent())) 1451 return false; 1452 if (!blockNeedsPredicationForAnyReason(I->getParent())) 1453 return false; 1454 // Loads and stores that need some form of masked operation are predicated 1455 // instructions. 1456 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1457 return Legal->isMaskRequired(I); 1458 return isScalarWithPredication(I, VF); 1459 } 1460 1461 /// Returns true if \p I is a memory instruction with consecutive memory 1462 /// access that can be widened. 1463 bool 1464 memoryInstructionCanBeWidened(Instruction *I, 1465 ElementCount VF = ElementCount::getFixed(1)); 1466 1467 /// Returns true if \p I is a memory instruction in an interleaved-group 1468 /// of memory accesses that can be vectorized with wide vector loads/stores 1469 /// and shuffles. 1470 bool 1471 interleavedAccessCanBeWidened(Instruction *I, 1472 ElementCount VF = ElementCount::getFixed(1)); 1473 1474 /// Check if \p Instr belongs to any interleaved access group. 1475 bool isAccessInterleaved(Instruction *Instr) { 1476 return InterleaveInfo.isInterleaved(Instr); 1477 } 1478 1479 /// Get the interleaved access group that \p Instr belongs to. 1480 const InterleaveGroup<Instruction> * 1481 getInterleavedAccessGroup(Instruction *Instr) { 1482 return InterleaveInfo.getInterleaveGroup(Instr); 1483 } 1484 1485 /// Returns true if we're required to use a scalar epilogue for at least 1486 /// the final iteration of the original loop. 
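/// This is the case, for example, when the loop may exit from a block other
/// than the latch, or when an interleave group has gaps, so the final
/// iteration(s) must run scalar to avoid accessing memory the original loop
/// would not have touched.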
1487 bool requiresScalarEpilogue(ElementCount VF) const { 1488 if (!isScalarEpilogueAllowed()) 1489 return false; 1490 // If we might exit from anywhere but the latch, must run the exiting 1491 // iteration in scalar form. 1492 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1493 return true; 1494 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); 1495 } 1496 1497 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1498 /// loop hint annotation. 1499 bool isScalarEpilogueAllowed() const { 1500 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1501 } 1502 1503 /// Returns true if all loop blocks should be masked to fold tail loop. 1504 bool foldTailByMasking() const { return FoldTailByMasking; } 1505 1506 /// Returns true if the instructions in this block requires predication 1507 /// for any reason, e.g. because tail folding now requires a predicate 1508 /// or because the block in the original loop was predicated. 1509 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1510 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1511 } 1512 1513 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1514 /// nodes to the chain of instructions representing the reductions. Uses a 1515 /// MapVector to ensure deterministic iteration order. 1516 using ReductionChainMap = 1517 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1518 1519 /// Return the chain of instructions representing an inloop reduction. 1520 const ReductionChainMap &getInLoopReductionChains() const { 1521 return InLoopReductionChains; 1522 } 1523 1524 /// Returns true if the Phi is part of an inloop reduction. 1525 bool isInLoopReduction(PHINode *Phi) const { 1526 return InLoopReductionChains.count(Phi); 1527 } 1528 1529 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1530 /// with factor VF. Return the cost of the instruction, including 1531 /// scalarization overhead if it's needed. 1532 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1533 1534 /// Estimate cost of a call instruction CI if it were vectorized with factor 1535 /// VF. Return the cost of the instruction, including scalarization overhead 1536 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1537 /// scalarized - 1538 /// i.e. either vector version isn't available, or is too expensive. 1539 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1540 bool &NeedToScalarize) const; 1541 1542 /// Returns true if the per-lane cost of VectorizationFactor A is lower than 1543 /// that of B. 1544 bool isMoreProfitable(const VectorizationFactor &A, 1545 const VectorizationFactor &B) const; 1546 1547 /// Invalidates decisions already taken by the cost model. 1548 void invalidateCostModelingDecisions() { 1549 WideningDecisions.clear(); 1550 Uniforms.clear(); 1551 Scalars.clear(); 1552 } 1553 1554 private: 1555 unsigned NumPredStores = 0; 1556 1557 /// Convenience function that returns the value of vscale_range iff 1558 /// vscale_range.min == vscale_range.max or otherwise returns the value 1559 /// returned by the corresponding TLI method. 1560 Optional<unsigned> getVScaleForTuning() const; 1561 1562 /// \return An upper bound for the vectorization factors for both 1563 /// fixed and scalable vectorization, where the minimum-known number of 1564 /// elements is a power-of-2 larger than zero. 
If scalable vectorization is 1565 /// disabled or unsupported, then the scalable part will be equal to 1566 /// ElementCount::getScalable(0). 1567 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, 1568 ElementCount UserVF, 1569 bool FoldTailByMasking); 1570 1571 /// \return the maximized element count based on the targets vector 1572 /// registers and the loop trip-count, but limited to a maximum safe VF. 1573 /// This is a helper function of computeFeasibleMaxVF. 1574 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, 1575 unsigned SmallestType, 1576 unsigned WidestType, 1577 ElementCount MaxSafeVF, 1578 bool FoldTailByMasking); 1579 1580 /// \return the maximum legal scalable VF, based on the safe max number 1581 /// of elements. 1582 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1583 1584 /// The vectorization cost is a combination of the cost itself and a boolean 1585 /// indicating whether any of the contributing operations will actually 1586 /// operate on vector values after type legalization in the backend. If this 1587 /// latter value is false, then all operations will be scalarized (i.e. no 1588 /// vectorization has actually taken place). 1589 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1590 1591 /// Returns the expected execution cost. The unit of the cost does 1592 /// not matter because we use the 'cost' units to compare different 1593 /// vector widths. The cost that is returned is *not* normalized by 1594 /// the factor width. If \p Invalid is not nullptr, this function 1595 /// will add a pair(Instruction*, ElementCount) to \p Invalid for 1596 /// each instruction that has an Invalid cost for the given VF. 1597 using InstructionVFPair = std::pair<Instruction *, ElementCount>; 1598 VectorizationCostTy 1599 expectedCost(ElementCount VF, 1600 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); 1601 1602 /// Returns the execution time cost of an instruction for a given vector 1603 /// width. Vector width of one means scalar. 1604 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1605 1606 /// The cost-computation logic from getInstructionCost which provides 1607 /// the vector type as an output parameter. 1608 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1609 Type *&VectorTy); 1610 1611 /// Return the cost of instructions in an inloop reduction pattern, if I is 1612 /// part of that pattern. 1613 Optional<InstructionCost> 1614 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, 1615 TTI::TargetCostKind CostKind); 1616 1617 /// Calculate vectorization cost of memory instruction \p I. 1618 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1619 1620 /// The cost computation for scalarized memory instruction. 1621 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1622 1623 /// The cost computation for interleaving group of memory instructions. 1624 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1625 1626 /// The cost computation for Gather/Scatter instruction. 1627 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1628 1629 /// The cost computation for widening instruction \p I with consecutive 1630 /// memory access. 1631 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1632 1633 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1634 /// Load: scalar load + broadcast. 
1635 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1636 /// element) 1637 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1638 1639 /// Estimate the overhead of scalarizing an instruction. This is a 1640 /// convenience wrapper for the type-based getScalarizationOverhead API. 1641 InstructionCost getScalarizationOverhead(Instruction *I, 1642 ElementCount VF) const; 1643 1644 /// Returns whether the instruction is a load or store and will be emitted 1645 /// as a vector operation. 1646 bool isConsecutiveLoadOrStore(Instruction *I); 1647 1648 /// Returns true if an artificially high cost for emulated masked memrefs 1649 /// should be used. 1650 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); 1651 1652 /// Map of scalar integer values to the smallest bitwidth they can be legally 1653 /// represented as. The vector equivalents of these values should be truncated 1654 /// to this type. 1655 MapVector<Instruction *, uint64_t> MinBWs; 1656 1657 /// A type representing the costs for instructions if they were to be 1658 /// scalarized rather than vectorized. The entries are Instruction-Cost 1659 /// pairs. 1660 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; 1661 1662 /// A set containing all BasicBlocks that are known to be present after 1663 /// vectorization as predicated blocks. 1664 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; 1665 1666 /// Records whether it is allowed to have the original scalar loop execute at 1667 /// least once. This may be needed as a fallback loop in case runtime 1668 /// aliasing/dependence checks fail, or to handle the tail/remainder 1669 /// iterations when the trip count is unknown or doesn't divide by the VF, 1670 /// or as a peel-loop to handle gaps in interleave-groups. 1671 /// Under optsize and when the trip count is very small we don't allow any 1672 /// iterations to execute in the scalar loop. 1673 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1674 1675 /// All blocks of the loop are to be masked to fold the tail of the scalar iterations. 1676 bool FoldTailByMasking = false; 1677 1678 /// A map holding scalar costs for different vectorization factors. The 1679 /// presence of a cost for an instruction in the mapping indicates that the 1680 /// instruction will be scalarized when vectorizing with the associated 1681 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1682 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1683 1684 /// Holds the instructions known to be uniform after vectorization. 1685 /// The data is collected per VF. 1686 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1687 1688 /// Holds the instructions known to be scalar after vectorization. 1689 /// The data is collected per VF. 1690 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1691 1692 /// Holds the instructions (address computations) that are forced to be 1693 /// scalarized. 1694 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1695 1696 /// PHINodes of the reductions that should be expanded in-loop along with 1697 /// their associated chains of reduction operations, in program order from top 1698 /// (PHI) to bottom. 1699 ReductionChainMap InLoopReductionChains; 1700 1701 /// A map of inloop reduction operations and their immediate chain operand. 1702 /// FIXME: This can be removed once reductions can be costed correctly in 1703 /// vplan.
This was added to allow quick lookup to the inloop operations, 1704 /// without having to loop through InLoopReductionChains. 1705 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1706 1707 /// Returns the expected difference in cost from scalarizing the expression 1708 /// feeding a predicated instruction \p PredInst. The instructions to 1709 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1710 /// non-negative return value implies the expression will be scalarized. 1711 /// Currently, only single-use chains are considered for scalarization. 1712 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1713 ElementCount VF); 1714 1715 /// Collect the instructions that are uniform after vectorization. An 1716 /// instruction is uniform if we represent it with a single scalar value in 1717 /// the vectorized loop corresponding to each vector iteration. Examples of 1718 /// uniform instructions include pointer operands of consecutive or 1719 /// interleaved memory accesses. Note that although uniformity implies an 1720 /// instruction will be scalar, the reverse is not true. In general, a 1721 /// scalarized instruction will be represented by VF scalar values in the 1722 /// vectorized loop, each corresponding to an iteration of the original 1723 /// scalar loop. 1724 void collectLoopUniforms(ElementCount VF); 1725 1726 /// Collect the instructions that are scalar after vectorization. An 1727 /// instruction is scalar if it is known to be uniform or will be scalarized 1728 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1729 /// to the list if they are used by a load/store instruction that is marked as 1730 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1731 /// VF values in the vectorized loop, each corresponding to an iteration of 1732 /// the original scalar loop. 1733 void collectLoopScalars(ElementCount VF); 1734 1735 /// Keeps cost model vectorization decision and cost for instructions. 1736 /// Right now it is used for memory instructions only. 1737 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1738 std::pair<InstWidening, InstructionCost>>; 1739 1740 DecisionList WideningDecisions; 1741 1742 /// Returns true if \p V is expected to be vectorized and it needs to be 1743 /// extracted. 1744 bool needsExtract(Value *V, ElementCount VF) const { 1745 Instruction *I = dyn_cast<Instruction>(V); 1746 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1747 TheLoop->isLoopInvariant(I)) 1748 return false; 1749 1750 // Assume we can vectorize V (and hence we need extraction) if the 1751 // scalars are not computed yet. This can happen, because it is called 1752 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1753 // the scalars are collected. That should be a safe assumption in most 1754 // cases, because we check if the operands have vectorizable types 1755 // beforehand in LoopVectorizationLegality. 1756 return Scalars.find(VF) == Scalars.end() || 1757 !isScalarAfterVectorization(I, VF); 1758 }; 1759 1760 /// Returns a range containing only operands needing to be extracted. 
1761 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1762 ElementCount VF) const { 1763 return SmallVector<Value *, 4>(make_filter_range( 1764 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1765 } 1766 1767 /// Determines if we have the infrastructure to vectorize loop \p L and its 1768 /// epilogue, assuming the main loop is vectorized by \p VF. 1769 bool isCandidateForEpilogueVectorization(const Loop &L, 1770 const ElementCount VF) const; 1771 1772 /// Returns true if epilogue vectorization is considered profitable, and 1773 /// false otherwise. 1774 /// \p VF is the vectorization factor chosen for the original loop. 1775 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1776 1777 public: 1778 /// The loop that we evaluate. 1779 Loop *TheLoop; 1780 1781 /// Predicated scalar evolution analysis. 1782 PredicatedScalarEvolution &PSE; 1783 1784 /// Loop Info analysis. 1785 LoopInfo *LI; 1786 1787 /// Vectorization legality. 1788 LoopVectorizationLegality *Legal; 1789 1790 /// Vector target information. 1791 const TargetTransformInfo &TTI; 1792 1793 /// Target Library Info. 1794 const TargetLibraryInfo *TLI; 1795 1796 /// Demanded bits analysis. 1797 DemandedBits *DB; 1798 1799 /// Assumption cache. 1800 AssumptionCache *AC; 1801 1802 /// Interface to emit optimization remarks. 1803 OptimizationRemarkEmitter *ORE; 1804 1805 const Function *TheFunction; 1806 1807 /// Loop Vectorize Hint. 1808 const LoopVectorizeHints *Hints; 1809 1810 /// The interleave access information contains groups of interleaved accesses 1811 /// with the same stride and close to each other. 1812 InterleavedAccessInfo &InterleaveInfo; 1813 1814 /// Values to ignore in the cost model. 1815 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1816 1817 /// Values to ignore in the cost model when VF > 1. 1818 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1819 1820 /// All element types found in the loop. 1821 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1822 1823 /// Profitable vector factors. 1824 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1825 }; 1826 } // end namespace llvm 1827 1828 /// Helper struct to manage generating runtime checks for vectorization. 1829 /// 1830 /// The runtime checks are created up-front in temporary blocks to allow better 1831 /// estimating the cost and un-linked from the existing IR. After deciding to 1832 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1833 /// temporary blocks are completely removed. 1834 class GeneratedRTChecks { 1835 /// Basic block which contains the generated SCEV checks, if any. 1836 BasicBlock *SCEVCheckBlock = nullptr; 1837 1838 /// The value representing the result of the generated SCEV checks. If it is 1839 /// nullptr, either no SCEV checks have been generated or they have been used. 1840 Value *SCEVCheckCond = nullptr; 1841 1842 /// Basic block which contains the generated memory runtime checks, if any. 1843 BasicBlock *MemCheckBlock = nullptr; 1844 1845 /// The value representing the result of the generated memory runtime checks. 1846 /// If it is nullptr, either no memory runtime checks have been generated or 1847 /// they have been used. 
1848 Value *MemRuntimeCheckCond = nullptr; 1849 1850 DominatorTree *DT; 1851 LoopInfo *LI; 1852 1853 SCEVExpander SCEVExp; 1854 SCEVExpander MemCheckExp; 1855 1856 public: 1857 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1858 const DataLayout &DL) 1859 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), 1860 MemCheckExp(SE, DL, "scev.check") {} 1861 1862 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1863 /// accurately estimate the cost of the runtime checks. The blocks are 1864 /// un-linked from the IR and is added back during vector code generation. If 1865 /// there is no vector code generation, the check blocks are removed 1866 /// completely. 1867 void Create(Loop *L, const LoopAccessInfo &LAI, 1868 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { 1869 1870 BasicBlock *LoopHeader = L->getHeader(); 1871 BasicBlock *Preheader = L->getLoopPreheader(); 1872 1873 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1874 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1875 // may be used by SCEVExpander. The blocks will be un-linked from their 1876 // predecessors and removed from LI & DT at the end of the function. 1877 if (!UnionPred.isAlwaysTrue()) { 1878 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1879 nullptr, "vector.scevcheck"); 1880 1881 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1882 &UnionPred, SCEVCheckBlock->getTerminator()); 1883 } 1884 1885 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1886 if (RtPtrChecking.Need) { 1887 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1888 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1889 "vector.memcheck"); 1890 1891 auto DiffChecks = RtPtrChecking.getDiffChecks(); 1892 if (DiffChecks) { 1893 MemRuntimeCheckCond = addDiffRuntimeChecks( 1894 MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp, 1895 [VF](IRBuilderBase &B, unsigned Bits) { 1896 return getRuntimeVF(B, B.getIntNTy(Bits), VF); 1897 }, 1898 IC); 1899 } else { 1900 MemRuntimeCheckCond = 1901 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1902 RtPtrChecking.getChecks(), MemCheckExp); 1903 } 1904 assert(MemRuntimeCheckCond && 1905 "no RT checks generated although RtPtrChecking " 1906 "claimed checks are required"); 1907 } 1908 1909 if (!MemCheckBlock && !SCEVCheckBlock) 1910 return; 1911 1912 // Unhook the temporary block with the checks, update various places 1913 // accordingly. 1914 if (SCEVCheckBlock) 1915 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1916 if (MemCheckBlock) 1917 MemCheckBlock->replaceAllUsesWith(Preheader); 1918 1919 if (SCEVCheckBlock) { 1920 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1921 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1922 Preheader->getTerminator()->eraseFromParent(); 1923 } 1924 if (MemCheckBlock) { 1925 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1926 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1927 Preheader->getTerminator()->eraseFromParent(); 1928 } 1929 1930 DT->changeImmediateDominator(LoopHeader, Preheader); 1931 if (MemCheckBlock) { 1932 DT->eraseNode(MemCheckBlock); 1933 LI->removeBlock(MemCheckBlock); 1934 } 1935 if (SCEVCheckBlock) { 1936 DT->eraseNode(SCEVCheckBlock); 1937 LI->removeBlock(SCEVCheckBlock); 1938 } 1939 } 1940 1941 /// Remove the created SCEV & memory runtime check blocks & instructions, if 1942 /// unused. 
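/// A check is considered used once its condition has been handed out by
/// emitSCEVChecks or emitMemRuntimeChecks (which reset SCEVCheckCond or
/// MemRuntimeCheckCond to nullptr); used blocks stay in the IR, while unused
/// ones are erased here together with the instructions expanded for them.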
1943 ~GeneratedRTChecks() { 1944 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 1945 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 1946 if (!SCEVCheckCond) 1947 SCEVCleaner.markResultUsed(); 1948 1949 if (!MemRuntimeCheckCond) 1950 MemCheckCleaner.markResultUsed(); 1951 1952 if (MemRuntimeCheckCond) { 1953 auto &SE = *MemCheckExp.getSE(); 1954 // Memory runtime check generation creates compares that use expanded 1955 // values. Remove them before running the SCEVExpanderCleaners. 1956 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 1957 if (MemCheckExp.isInsertedInstruction(&I)) 1958 continue; 1959 SE.forgetValue(&I); 1960 I.eraseFromParent(); 1961 } 1962 } 1963 MemCheckCleaner.cleanup(); 1964 SCEVCleaner.cleanup(); 1965 1966 if (SCEVCheckCond) 1967 SCEVCheckBlock->eraseFromParent(); 1968 if (MemRuntimeCheckCond) 1969 MemCheckBlock->eraseFromParent(); 1970 } 1971 1972 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 1973 /// adjusts the branches to branch to the vector preheader or \p Bypass, 1974 /// depending on the generated condition. 1975 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 1976 BasicBlock *LoopVectorPreHeader, 1977 BasicBlock *LoopExitBlock) { 1978 if (!SCEVCheckCond) 1979 return nullptr; 1980 1981 Value *Cond = SCEVCheckCond; 1982 // Mark the check as used, to prevent it from being removed during cleanup. 1983 SCEVCheckCond = nullptr; 1984 if (auto *C = dyn_cast<ConstantInt>(Cond)) 1985 if (C->isZero()) 1986 return nullptr; 1987 1988 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 1989 1990 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 1991 // Create new preheader for vector loop. 1992 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 1993 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 1994 1995 SCEVCheckBlock->getTerminator()->eraseFromParent(); 1996 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 1997 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 1998 SCEVCheckBlock); 1999 2000 DT->addNewBlock(SCEVCheckBlock, Pred); 2001 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2002 2003 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), 2004 BranchInst::Create(Bypass, LoopVectorPreHeader, Cond)); 2005 return SCEVCheckBlock; 2006 } 2007 2008 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2009 /// the branches to branch to the vector preheader or \p Bypass, depending on 2010 /// the generated condition. 2011 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2012 BasicBlock *LoopVectorPreHeader) { 2013 // Check if we generated code that checks in runtime if arrays overlap. 2014 if (!MemRuntimeCheckCond) 2015 return nullptr; 2016 2017 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2018 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2019 MemCheckBlock); 2020 2021 DT->addNewBlock(MemCheckBlock, Pred); 2022 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2023 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2024 2025 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2026 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2027 2028 ReplaceInstWithInst( 2029 MemCheckBlock->getTerminator(), 2030 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2031 MemCheckBlock->getTerminator()->setDebugLoc( 2032 Pred->getTerminator()->getDebugLoc()); 2033 2034 // Mark the check as used, to prevent it from being removed during cleanup. 
2035 MemRuntimeCheckCond = nullptr; 2036 return MemCheckBlock; 2037 } 2038 }; 2039 2040 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2041 // vectorization. The loop needs to be annotated with #pragma omp simd 2042 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2043 // vector length information is not provided, vectorization is not considered 2044 // explicit. Interleave hints are not allowed either. These limitations will be 2045 // relaxed in the future. 2046 // Please, note that we are currently forced to abuse the pragma 'clang 2047 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2048 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2049 // provides *explicit vectorization hints* (LV can bypass legal checks and 2050 // assume that vectorization is legal). However, both hints are implemented 2051 // using the same metadata (llvm.loop.vectorize, processed by 2052 // LoopVectorizeHints). This will be fixed in the future when the native IR 2053 // representation for pragma 'omp simd' is introduced. 2054 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2055 OptimizationRemarkEmitter *ORE) { 2056 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2057 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2058 2059 // Only outer loops with an explicit vectorization hint are supported. 2060 // Unannotated outer loops are ignored. 2061 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2062 return false; 2063 2064 Function *Fn = OuterLp->getHeader()->getParent(); 2065 if (!Hints.allowVectorization(Fn, OuterLp, 2066 true /*VectorizeOnlyWhenForced*/)) { 2067 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2068 return false; 2069 } 2070 2071 if (Hints.getInterleave() > 1) { 2072 // TODO: Interleave support is future work. 2073 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2074 "outer loops.\n"); 2075 Hints.emitRemarkWithHints(); 2076 return false; 2077 } 2078 2079 return true; 2080 } 2081 2082 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2083 OptimizationRemarkEmitter *ORE, 2084 SmallVectorImpl<Loop *> &V) { 2085 // Collect inner loops and outer loops without irreducible control flow. For 2086 // now, only collect outer loops that have explicit vectorization hints. If we 2087 // are stress testing the VPlan H-CFG construction, we collect the outermost 2088 // loop of every loop nest. 2089 if (L.isInnermost() || VPlanBuildStressTest || 2090 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2091 LoopBlocksRPO RPOT(&L); 2092 RPOT.perform(LI); 2093 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2094 V.push_back(&L); 2095 // TODO: Collect inner loops inside marked outer loops in case 2096 // vectorization fails for the outer loop. Do not invoke 2097 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2098 // already known to be reducible. We can use an inherited attribute for 2099 // that. 2100 return; 2101 } 2102 } 2103 for (Loop *InnerL : L) 2104 collectSupportedLoops(*InnerL, LI, ORE, V); 2105 } 2106 2107 namespace { 2108 2109 /// The LoopVectorize Pass. 
2110 struct LoopVectorize : public FunctionPass { 2111 /// Pass identification, replacement for typeid 2112 static char ID; 2113 2114 LoopVectorizePass Impl; 2115 2116 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2117 bool VectorizeOnlyWhenForced = false) 2118 : FunctionPass(ID), 2119 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2120 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2121 } 2122 2123 bool runOnFunction(Function &F) override { 2124 if (skipFunction(F)) 2125 return false; 2126 2127 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2128 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2129 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2130 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2131 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2132 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2133 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2134 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2135 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2136 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2137 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2138 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2139 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2140 2141 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2142 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2143 2144 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2145 GetLAA, *ORE, PSI).MadeAnyChange; 2146 } 2147 2148 void getAnalysisUsage(AnalysisUsage &AU) const override { 2149 AU.addRequired<AssumptionCacheTracker>(); 2150 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2151 AU.addRequired<DominatorTreeWrapperPass>(); 2152 AU.addRequired<LoopInfoWrapperPass>(); 2153 AU.addRequired<ScalarEvolutionWrapperPass>(); 2154 AU.addRequired<TargetTransformInfoWrapperPass>(); 2155 AU.addRequired<AAResultsWrapperPass>(); 2156 AU.addRequired<LoopAccessLegacyAnalysis>(); 2157 AU.addRequired<DemandedBitsWrapperPass>(); 2158 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2159 AU.addRequired<InjectTLIMappingsLegacy>(); 2160 2161 // We currently do not preserve loopinfo/dominator analyses with outer loop 2162 // vectorization. Until this is addressed, mark these analyses as preserved 2163 // only for non-VPlan-native path. 2164 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2165 if (!EnableVPlanNativePath) { 2166 AU.addPreserved<LoopInfoWrapperPass>(); 2167 AU.addPreserved<DominatorTreeWrapperPass>(); 2168 } 2169 2170 AU.addPreserved<BasicAAWrapperPass>(); 2171 AU.addPreserved<GlobalsAAWrapperPass>(); 2172 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2173 } 2174 }; 2175 2176 } // end anonymous namespace 2177 2178 //===----------------------------------------------------------------------===// 2179 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2180 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2181 //===----------------------------------------------------------------------===// 2182 2183 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2184 // We need to place the broadcast of invariant variables outside the loop, 2185 // but only if it's proven safe to do so. Else, broadcast will be inside 2186 // vector loop body. 
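// For example, with VF = 4 and an invariant i32 value %x, the splat created
// by CreateVectorSplat below is roughly:
//   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i64 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> poison, <4 x i32> zeroinitializer
// Hoisting it into the preheader materializes the splat once instead of on
// every vector iteration.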
2187 Instruction *Instr = dyn_cast<Instruction>(V); 2188 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2189 (!Instr || 2190 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2191 // Place the code for broadcasting invariant variables in the new preheader. 2192 IRBuilder<>::InsertPointGuard Guard(Builder); 2193 if (SafeToHoist) 2194 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2195 2196 // Broadcast the scalar into all locations in the vector. 2197 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2198 2199 return Shuf; 2200 } 2201 2202 /// This function adds 2203 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2204 /// to each vector element of Val. The sequence starts at StartIndex. 2205 /// \p Opcode is relevant for FP induction variable. 2206 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2207 Instruction::BinaryOps BinOp, ElementCount VF, 2208 IRBuilderBase &Builder) { 2209 assert(VF.isVector() && "only vector VFs are supported"); 2210 2211 // Create and check the types. 2212 auto *ValVTy = cast<VectorType>(Val->getType()); 2213 ElementCount VLen = ValVTy->getElementCount(); 2214 2215 Type *STy = Val->getType()->getScalarType(); 2216 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2217 "Induction Step must be an integer or FP"); 2218 assert(Step->getType() == STy && "Step has wrong type"); 2219 2220 SmallVector<Constant *, 8> Indices; 2221 2222 // Create a vector of consecutive numbers from zero to VF. 2223 VectorType *InitVecValVTy = ValVTy; 2224 if (STy->isFloatingPointTy()) { 2225 Type *InitVecValSTy = 2226 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2227 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2228 } 2229 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2230 2231 // Splat the StartIdx 2232 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2233 2234 if (STy->isIntegerTy()) { 2235 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2236 Step = Builder.CreateVectorSplat(VLen, Step); 2237 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2238 // FIXME: The newly created binary instructions should contain nsw/nuw 2239 // flags, which can be found from the original scalar operations. 2240 Step = Builder.CreateMul(InitVec, Step); 2241 return Builder.CreateAdd(Val, Step, "induction"); 2242 } 2243 2244 // Floating point induction. 2245 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2246 "Binary Opcode should be specified for FP induction"); 2247 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2248 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2249 2250 Step = Builder.CreateVectorSplat(VLen, Step); 2251 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2252 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2253 } 2254 2255 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 2256 /// variable on which to base the steps, \p Step is the size of the step. 2257 static void buildScalarSteps(Value *ScalarIV, Value *Step, 2258 const InductionDescriptor &ID, VPValue *Def, 2259 VPTransformState &State) { 2260 IRBuilderBase &Builder = State.Builder; 2261 // We shouldn't have to build scalar steps if we aren't vectorizing. 2262 assert(State.VF.isVector() && "VF should be greater than one"); 2263 // Get the value type and ensure it and the step have the same integer type. 
2264 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2265 assert(ScalarIVTy == Step->getType() && 2266 "Val and Step should have the same type"); 2267 2268 // We build scalar steps for both integer and floating-point induction 2269 // variables. Here, we determine the kind of arithmetic we will perform. 2270 Instruction::BinaryOps AddOp; 2271 Instruction::BinaryOps MulOp; 2272 if (ScalarIVTy->isIntegerTy()) { 2273 AddOp = Instruction::Add; 2274 MulOp = Instruction::Mul; 2275 } else { 2276 AddOp = ID.getInductionOpcode(); 2277 MulOp = Instruction::FMul; 2278 } 2279 2280 // Determine the number of scalars we need to generate for each unroll 2281 // iteration. 2282 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); 2283 unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 2284 // Compute the scalar steps and save the results in State. 2285 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2286 ScalarIVTy->getScalarSizeInBits()); 2287 Type *VecIVTy = nullptr; 2288 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2289 if (!FirstLaneOnly && State.VF.isScalable()) { 2290 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2291 UnitStepVec = 2292 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2293 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2294 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2295 } 2296 2297 for (unsigned Part = 0; Part < State.UF; ++Part) { 2298 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2299 2300 if (!FirstLaneOnly && State.VF.isScalable()) { 2301 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2302 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2303 if (ScalarIVTy->isFloatingPointTy()) 2304 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2305 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2306 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2307 State.set(Def, Add, Part); 2308 // It's useful to record the lane values too for the known minimum number 2309 // of elements so we do those below. This improves the code quality when 2310 // trying to extract the first element, for example. 2311 } 2312 2313 if (ScalarIVTy->isFloatingPointTy()) 2314 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2315 2316 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2317 Value *StartIdx = Builder.CreateBinOp( 2318 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2319 // The step returned by `createStepForVF` is a runtime-evaluated value 2320 // when VF is scalable. Otherwise, it should be folded into a Constant. 2321 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2322 "Expected StartIdx to be folded to a constant when VF is not " 2323 "scalable"); 2324 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2325 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2326 State.set(Def, Add, VPIteration(Part, Lane)); 2327 } 2328 } 2329 } 2330 2331 // Generate code for the induction step. 
Note that induction steps are 2332 // required to be loop-invariant 2333 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, 2334 Instruction *InsertBefore, 2335 Loop *OrigLoop = nullptr) { 2336 const DataLayout &DL = SE.getDataLayout(); 2337 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && 2338 "Induction step should be loop invariant"); 2339 if (auto *E = dyn_cast<SCEVUnknown>(Step)) 2340 return E->getValue(); 2341 2342 SCEVExpander Exp(SE, DL, "induction"); 2343 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); 2344 } 2345 2346 /// Compute the transformed value of Index at offset StartValue using step 2347 /// StepValue. 2348 /// For integer induction, returns StartValue + Index * StepValue. 2349 /// For pointer induction, returns StartValue[Index * StepValue]. 2350 /// FIXME: The newly created binary instructions should contain nsw/nuw 2351 /// flags, which can be found from the original scalar operations. 2352 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, 2353 Value *StartValue, Value *Step, 2354 const InductionDescriptor &ID) { 2355 assert(Index->getType()->getScalarType() == Step->getType() && 2356 "Index scalar type does not match StepValue type"); 2357 2358 // Note: the IR at this point is broken. We cannot use SE to create any new 2359 // SCEV and then expand it, hoping that SCEV's simplification will give us 2360 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2361 // lead to various SCEV crashes. So all we can do is to use builder and rely 2362 // on InstCombine for future simplifications. Here we handle some trivial 2363 // cases only. 2364 auto CreateAdd = [&B](Value *X, Value *Y) { 2365 assert(X->getType() == Y->getType() && "Types don't match!"); 2366 if (auto *CX = dyn_cast<ConstantInt>(X)) 2367 if (CX->isZero()) 2368 return Y; 2369 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2370 if (CY->isZero()) 2371 return X; 2372 return B.CreateAdd(X, Y); 2373 }; 2374 2375 // We allow X to be a vector type, in which case Y will potentially be 2376 // splatted into a vector with the same element count. 
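// For example, for a pointer induction with a <4 x i64> Index and a scalar
// i64 Step, the Step is splatted to <4 x i64> before the element-wise multiply.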
2377 auto CreateMul = [&B](Value *X, Value *Y) { 2378 assert(X->getType()->getScalarType() == Y->getType() && 2379 "Types don't match!"); 2380 if (auto *CX = dyn_cast<ConstantInt>(X)) 2381 if (CX->isOne()) 2382 return Y; 2383 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2384 if (CY->isOne()) 2385 return X; 2386 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2387 if (XVTy && !isa<VectorType>(Y->getType())) 2388 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2389 return B.CreateMul(X, Y); 2390 }; 2391 2392 switch (ID.getKind()) { 2393 case InductionDescriptor::IK_IntInduction: { 2394 assert(!isa<VectorType>(Index->getType()) && 2395 "Vector indices not supported for integer inductions yet"); 2396 assert(Index->getType() == StartValue->getType() && 2397 "Index type does not match StartValue type"); 2398 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2399 return B.CreateSub(StartValue, Index); 2400 auto *Offset = CreateMul(Index, Step); 2401 return CreateAdd(StartValue, Offset); 2402 } 2403 case InductionDescriptor::IK_PtrInduction: { 2404 assert(isa<Constant>(Step) && 2405 "Expected constant step for pointer induction"); 2406 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); 2407 } 2408 case InductionDescriptor::IK_FpInduction: { 2409 assert(!isa<VectorType>(Index->getType()) && 2410 "Vector indices not supported for FP inductions yet"); 2411 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2412 auto InductionBinOp = ID.getInductionBinOp(); 2413 assert(InductionBinOp && 2414 (InductionBinOp->getOpcode() == Instruction::FAdd || 2415 InductionBinOp->getOpcode() == Instruction::FSub) && 2416 "Original bin op should be defined for FP induction"); 2417 2418 Value *MulExp = B.CreateFMul(Step, Index); 2419 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2420 "induction"); 2421 } 2422 case InductionDescriptor::IK_NoInduction: 2423 return nullptr; 2424 } 2425 llvm_unreachable("invalid enum"); 2426 } 2427 2428 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2429 const VPIteration &Instance, 2430 VPTransformState &State) { 2431 Value *ScalarInst = State.get(Def, Instance); 2432 Value *VectorValue = State.get(Def, Instance.Part); 2433 VectorValue = Builder.CreateInsertElement( 2434 VectorValue, ScalarInst, 2435 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2436 State.set(Def, VectorValue, Instance.Part); 2437 } 2438 2439 // Return whether we allow using masked interleave-groups (for dealing with 2440 // strided loads/stores that reside in predicated blocks, or for dealing 2441 // with gaps). 2442 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2443 // If an override option has been passed in for interleaved accesses, use it. 2444 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2445 return EnableMaskedInterleavedMemAccesses; 2446 2447 return TTI.enableMaskedInterleavedAccessVectorization(); 2448 } 2449 2450 // Try to vectorize the interleave group that \p Instr belongs to. 2451 // 2452 // E.g. Translate following interleaved load group (factor = 3): 2453 // for (i = 0; i < N; i+=3) { 2454 // R = Pic[i]; // Member of index 0 2455 // G = Pic[i+1]; // Member of index 1 2456 // B = Pic[i+2]; // Member of index 2 2457 // ... 
// do something to R, G, B 2458 // } 2459 // To: 2460 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2461 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2462 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2463 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2464 // 2465 // Or translate following interleaved store group (factor = 3): 2466 // for (i = 0; i < N; i+=3) { 2467 // ... do something to R, G, B 2468 // Pic[i] = R; // Member of index 0 2469 // Pic[i+1] = G; // Member of index 1 2470 // Pic[i+2] = B; // Member of index 2 2471 // } 2472 // To: 2473 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2474 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2475 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2476 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2477 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2478 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2479 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2480 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2481 VPValue *BlockInMask) { 2482 Instruction *Instr = Group->getInsertPos(); 2483 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2484 2485 // Prepare for the vector type of the interleaved load/store. 2486 Type *ScalarTy = getLoadStoreType(Instr); 2487 unsigned InterleaveFactor = Group->getFactor(); 2488 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2489 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2490 2491 // Prepare for the new pointers. 2492 SmallVector<Value *, 2> AddrParts; 2493 unsigned Index = Group->getIndex(Instr); 2494 2495 // TODO: extend the masked interleaved-group support to reversed access. 2496 assert((!BlockInMask || !Group->isReverse()) && 2497 "Reversed masked interleave-group not supported."); 2498 2499 // If the group is reverse, adjust the index to refer to the last vector lane 2500 // instead of the first. We adjust the index from the first vector lane, 2501 // rather than directly getting the pointer for lane VF - 1, because the 2502 // pointer operand of the interleaved access is supposed to be uniform. For 2503 // uniform instructions, we're only required to generate a value for the 2504 // first vector lane in each unroll iteration. 2505 if (Group->isReverse()) 2506 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2507 2508 for (unsigned Part = 0; Part < UF; Part++) { 2509 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2510 State.setDebugLocFromInst(AddrPart); 2511 2512 // Notice current instruction could be any index. Need to adjust the address 2513 // to the member of index 0. 2514 // 2515 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2516 // b = A[i]; // Member of index 0 2517 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2518 // 2519 // E.g. A[i+1] = a; // Member of index 1 2520 // A[i] = b; // Member of index 0 2521 // A[i+2] = c; // Member of index 2 (Current instruction) 2522 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2523 2524 bool InBounds = false; 2525 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2526 InBounds = gep->isInBounds(); 2527 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2528 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2529 2530 // Cast to the vector pointer type. 
2531 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2532 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2533 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2534 } 2535 2536 State.setDebugLocFromInst(Instr); 2537 Value *PoisonVec = PoisonValue::get(VecTy); 2538 2539 Value *MaskForGaps = nullptr; 2540 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2541 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2542 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2543 } 2544 2545 // Vectorize the interleaved load group. 2546 if (isa<LoadInst>(Instr)) { 2547 // For each unroll part, create a wide load for the group. 2548 SmallVector<Value *, 2> NewLoads; 2549 for (unsigned Part = 0; Part < UF; Part++) { 2550 Instruction *NewLoad; 2551 if (BlockInMask || MaskForGaps) { 2552 assert(useMaskedInterleavedAccesses(*TTI) && 2553 "masked interleaved groups are not allowed."); 2554 Value *GroupMask = MaskForGaps; 2555 if (BlockInMask) { 2556 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2557 Value *ShuffledMask = Builder.CreateShuffleVector( 2558 BlockInMaskPart, 2559 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2560 "interleaved.mask"); 2561 GroupMask = MaskForGaps 2562 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2563 MaskForGaps) 2564 : ShuffledMask; 2565 } 2566 NewLoad = 2567 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2568 GroupMask, PoisonVec, "wide.masked.vec"); 2569 } 2570 else 2571 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2572 Group->getAlign(), "wide.vec"); 2573 Group->addMetadata(NewLoad); 2574 NewLoads.push_back(NewLoad); 2575 } 2576 2577 // For each member in the group, shuffle out the appropriate data from the 2578 // wide loads. 2579 unsigned J = 0; 2580 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2581 Instruction *Member = Group->getMember(I); 2582 2583 // Skip the gaps in the group. 2584 if (!Member) 2585 continue; 2586 2587 auto StrideMask = 2588 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2589 for (unsigned Part = 0; Part < UF; Part++) { 2590 Value *StridedVec = Builder.CreateShuffleVector( 2591 NewLoads[Part], StrideMask, "strided.vec"); 2592 2593 // If this member has different type, cast the result type. 2594 if (Member->getType() != ScalarTy) { 2595 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2596 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2597 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2598 } 2599 2600 if (Group->isReverse()) 2601 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2602 2603 State.set(VPDefs[J], StridedVec, Part); 2604 } 2605 ++J; 2606 } 2607 return; 2608 } 2609 2610 // The sub vector type for current instruction. 2611 auto *SubVT = VectorType::get(ScalarTy, VF); 2612 2613 // Vectorize the interleaved store group. 2614 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2615 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2616 "masked interleaved groups are not allowed."); 2617 assert((!MaskForGaps || !VF.isScalable()) && 2618 "masking gaps for scalable vectors is not yet supported."); 2619 for (unsigned Part = 0; Part < UF; Part++) { 2620 // Collect the stored vector from each member. 
2621 SmallVector<Value *, 4> StoredVecs; 2622 for (unsigned i = 0; i < InterleaveFactor; i++) { 2623 assert((Group->getMember(i) || MaskForGaps) && 2624 "Fail to get a member from an interleaved store group"); 2625 Instruction *Member = Group->getMember(i); 2626 2627 // Skip the gaps in the group. 2628 if (!Member) { 2629 Value *Undef = PoisonValue::get(SubVT); 2630 StoredVecs.push_back(Undef); 2631 continue; 2632 } 2633 2634 Value *StoredVec = State.get(StoredValues[i], Part); 2635 2636 if (Group->isReverse()) 2637 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2638 2639 // If this member has different type, cast it to a unified type. 2640 2641 if (StoredVec->getType() != SubVT) 2642 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2643 2644 StoredVecs.push_back(StoredVec); 2645 } 2646 2647 // Concatenate all vectors into a wide vector. 2648 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2649 2650 // Interleave the elements in the wide vector. 2651 Value *IVec = Builder.CreateShuffleVector( 2652 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2653 "interleaved.vec"); 2654 2655 Instruction *NewStoreInstr; 2656 if (BlockInMask || MaskForGaps) { 2657 Value *GroupMask = MaskForGaps; 2658 if (BlockInMask) { 2659 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2660 Value *ShuffledMask = Builder.CreateShuffleVector( 2661 BlockInMaskPart, 2662 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2663 "interleaved.mask"); 2664 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2665 ShuffledMask, MaskForGaps) 2666 : ShuffledMask; 2667 } 2668 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2669 Group->getAlign(), GroupMask); 2670 } else 2671 NewStoreInstr = 2672 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2673 2674 Group->addMetadata(NewStoreInstr); 2675 } 2676 } 2677 2678 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2679 VPReplicateRecipe *RepRecipe, 2680 const VPIteration &Instance, 2681 bool IfPredicateInstr, 2682 VPTransformState &State) { 2683 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2684 2685 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2686 // the first lane and part. 2687 if (isa<NoAliasScopeDeclInst>(Instr)) 2688 if (!Instance.isFirstIteration()) 2689 return; 2690 2691 // Does this instruction return a value ? 2692 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2693 2694 Instruction *Cloned = Instr->clone(); 2695 if (!IsVoidRetTy) 2696 Cloned->setName(Instr->getName() + ".cloned"); 2697 2698 // If the scalarized instruction contributes to the address computation of a 2699 // widen masked load/store which was in a basic block that needed predication 2700 // and is not predicated after vectorization, we can't propagate 2701 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2702 // instruction could feed a poison value to the base address of the widen 2703 // load/store. 2704 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 2705 Cloned->dropPoisonGeneratingFlags(); 2706 2707 if (Instr->getDebugLoc()) 2708 State.setDebugLocFromInst(Instr); 2709 2710 // Replace the operands of the cloned instructions with their scalar 2711 // equivalents in the new loop. 
2712 for (auto &I : enumerate(RepRecipe->operands())) { 2713 auto InputInstance = Instance; 2714 VPValue *Operand = I.value(); 2715 VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand); 2716 if (OperandR && OperandR->isUniform()) 2717 InputInstance.Lane = VPLane::getFirstLane(); 2718 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2719 } 2720 State.addNewMetadata(Cloned, Instr); 2721 2722 // Place the cloned scalar in the new loop. 2723 State.Builder.Insert(Cloned); 2724 2725 State.set(RepRecipe, Cloned, Instance); 2726 2727 // If we just cloned a new assumption, add it the assumption cache. 2728 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2729 AC->registerAssumption(II); 2730 2731 // End if-block. 2732 if (IfPredicateInstr) 2733 PredicatedInstructions.push_back(Cloned); 2734 } 2735 2736 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) { 2737 if (TripCount) 2738 return TripCount; 2739 2740 assert(InsertBlock); 2741 IRBuilder<> Builder(InsertBlock->getTerminator()); 2742 // Find the loop boundaries. 2743 ScalarEvolution *SE = PSE.getSE(); 2744 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2745 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 2746 "Invalid loop count"); 2747 2748 Type *IdxTy = Legal->getWidestInductionType(); 2749 assert(IdxTy && "No type for induction"); 2750 2751 // The exit count might have the type of i64 while the phi is i32. This can 2752 // happen if we have an induction variable that is sign extended before the 2753 // compare. The only way that we get a backedge taken count is that the 2754 // induction variable was signed and as such will not overflow. In such a case 2755 // truncation is legal. 2756 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2757 IdxTy->getPrimitiveSizeInBits()) 2758 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2759 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2760 2761 // Get the total trip count from the count by adding 1. 2762 const SCEV *ExitCount = SE->getAddExpr( 2763 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2764 2765 const DataLayout &DL = InsertBlock->getModule()->getDataLayout(); 2766 2767 // Expand the trip count and place the new instructions in the preheader. 2768 // Notice that the pre-header does not change, only the loop body. 2769 SCEVExpander Exp(*SE, DL, "induction"); 2770 2771 // Count holds the overall loop count (N). 2772 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2773 InsertBlock->getTerminator()); 2774 2775 if (TripCount->getType()->isPointerTy()) 2776 TripCount = 2777 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2778 InsertBlock->getTerminator()); 2779 2780 return TripCount; 2781 } 2782 2783 Value * 2784 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2785 if (VectorTripCount) 2786 return VectorTripCount; 2787 2788 Value *TC = getOrCreateTripCount(InsertBlock); 2789 IRBuilder<> Builder(InsertBlock->getTerminator()); 2790 2791 Type *Ty = TC->getType(); 2792 // This is where we can make the step a runtime constant. 2793 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2794 2795 // If the tail is to be folded by masking, round the number of iterations N 2796 // up to a multiple of Step instead of rounding down. This is done by first 2797 // adding Step-1 and then rounding down. 
Note that it's ok if this addition 2798 // overflows: the vector induction variable will eventually wrap to zero given 2799 // that it starts at zero and its Step is a power of two; the loop will then 2800 // exit, with the last early-exit vector comparison also producing all-true. 2801 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2802 // is accounted for in emitIterationCountCheck that adds an overflow check. 2803 if (Cost->foldTailByMasking()) { 2804 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2805 "VF*UF must be a power of 2 when folding tail by masking"); 2806 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 2807 TC = Builder.CreateAdd( 2808 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 2809 } 2810 2811 // Now we need to generate the expression for the part of the loop that the 2812 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2813 // iterations are not required for correctness, or N - Step, otherwise. Step 2814 // is equal to the vectorization factor (number of SIMD elements) times the 2815 // unroll factor (number of SIMD instructions). 2816 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2817 2818 // There are cases where we *must* run at least one iteration in the remainder 2819 // loop. See the cost model for when this can happen. If the step evenly 2820 // divides the trip count, we set the remainder to be equal to the step. If 2821 // the step does not evenly divide the trip count, no adjustment is necessary 2822 // since there will already be scalar iterations. Note that the minimum 2823 // iterations check ensures that N >= Step. 2824 if (Cost->requiresScalarEpilogue(VF)) { 2825 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2826 R = Builder.CreateSelect(IsZero, Step, R); 2827 } 2828 2829 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2830 2831 return VectorTripCount; 2832 } 2833 2834 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2835 const DataLayout &DL) { 2836 // Verify that V is a vector type with same number of elements as DstVTy. 2837 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 2838 unsigned VF = DstFVTy->getNumElements(); 2839 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 2840 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2841 Type *SrcElemTy = SrcVecTy->getElementType(); 2842 Type *DstElemTy = DstFVTy->getElementType(); 2843 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2844 "Vector elements must have same size"); 2845 2846 // Do a direct cast if element types are castable. 2847 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2848 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2849 } 2850 // V cannot be directly casted to desired vector type. 2851 // May happen when V is a floating point vector but DstVTy is a vector of 2852 // pointers or vice-versa. Handle this using a two-step bitcast using an 2853 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 
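// As an illustrative sketch: on a target with 64-bit pointers, casting
// <4 x double> to a vector of pointers cannot be done with a single bitcast;
// the value is first bitcast to <4 x i64> and then converted with inttoptr
// (the opposite direction uses ptrtoint followed by a bitcast).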
2854 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2855 "Only one type should be a pointer type"); 2856 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2857 "Only one type should be a floating point type"); 2858 Type *IntTy = 2859 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2860 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2861 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2862 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2863 } 2864 2865 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2866 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 2867 // Reuse existing vector loop preheader for TC checks. 2868 // Note that new preheader block is generated for vector loop. 2869 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2870 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2871 2872 // Generate code to check if the loop's trip count is less than VF * UF, or 2873 // equal to it in case a scalar epilogue is required; this implies that the 2874 // vector trip count is zero. This check also covers the case where adding one 2875 // to the backedge-taken count overflowed leading to an incorrect trip count 2876 // of zero. In this case we will also jump to the scalar loop. 2877 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 2878 : ICmpInst::ICMP_ULT; 2879 2880 // If tail is to be folded, vector loop takes care of all iterations. 2881 Type *CountTy = Count->getType(); 2882 Value *CheckMinIters = Builder.getFalse(); 2883 Value *Step = createStepForVF(Builder, CountTy, VF, UF); 2884 if (!Cost->foldTailByMasking()) 2885 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 2886 else if (VF.isScalable()) { 2887 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2888 // an overflow to zero when updating induction variables and so an 2889 // additional overflow check is required before entering the vector loop. 2890 2891 // Get the maximum unsigned value for the type. 2892 Value *MaxUIntTripCount = 2893 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 2894 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 2895 2896 // Don't execute the vector loop if (UMax - n) < (VF * UF). 2897 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step); 2898 } 2899 // Create new preheader for vector loop. 2900 LoopVectorPreHeader = 2901 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2902 "vector.ph"); 2903 2904 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2905 DT->getNode(Bypass)->getIDom()) && 2906 "TC check is expected to dominate Bypass"); 2907 2908 // Update dominator for Bypass & LoopExit (if needed). 2909 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2910 if (!Cost->requiresScalarEpilogue(VF)) 2911 // If there is an epilogue which must run, there's no edge from the 2912 // middle block to exit blocks and thus no need to update the immediate 2913 // dominator of the exit blocks. 
2914 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2915 2916 ReplaceInstWithInst( 2917 TCCheckBlock->getTerminator(), 2918 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2919 LoopBypassBlocks.push_back(TCCheckBlock); 2920 } 2921 2922 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 2923 2924 BasicBlock *const SCEVCheckBlock = 2925 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); 2926 if (!SCEVCheckBlock) 2927 return nullptr; 2928 2929 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 2930 (OptForSizeBasedOnProfile && 2931 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 2932 "Cannot SCEV check stride or overflow when optimizing for size"); 2933 2934 2935 // Update dominator only if this is first RT check. 2936 if (LoopBypassBlocks.empty()) { 2937 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2938 if (!Cost->requiresScalarEpilogue(VF)) 2939 // If there is an epilogue which must run, there's no edge from the 2940 // middle block to exit blocks and thus no need to update the immediate 2941 // dominator of the exit blocks. 2942 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2943 } 2944 2945 LoopBypassBlocks.push_back(SCEVCheckBlock); 2946 AddedSafetyChecks = true; 2947 return SCEVCheckBlock; 2948 } 2949 2950 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { 2951 // VPlan-native path does not do any analysis for runtime checks currently. 2952 if (EnableVPlanNativePath) 2953 return nullptr; 2954 2955 BasicBlock *const MemCheckBlock = 2956 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); 2957 2958 // Check if we generated code that checks in runtime if arrays overlap. We put 2959 // the checks into a separate block to make the more common case of few 2960 // elements faster. 2961 if (!MemCheckBlock) 2962 return nullptr; 2963 2964 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 2965 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2966 "Cannot emit memory checks when optimizing for size, unless forced " 2967 "to vectorize."); 2968 ORE->emit([&]() { 2969 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2970 OrigLoop->getStartLoc(), 2971 OrigLoop->getHeader()) 2972 << "Code-size may be reduced by not forcing " 2973 "vectorization, or by source-code modifications " 2974 "eliminating the need for runtime checks " 2975 "(e.g., adding 'restrict')."; 2976 }); 2977 } 2978 2979 LoopBypassBlocks.push_back(MemCheckBlock); 2980 2981 AddedSafetyChecks = true; 2982 2983 return MemCheckBlock; 2984 } 2985 2986 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 2987 LoopScalarBody = OrigLoop->getHeader(); 2988 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 2989 assert(LoopVectorPreHeader && "Invalid loop structure"); 2990 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 2991 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && 2992 "multiple exit loop without required epilogue?"); 2993 2994 LoopMiddleBlock = 2995 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2996 LI, nullptr, Twine(Prefix) + "middle.block"); 2997 LoopScalarPreHeader = 2998 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 2999 nullptr, Twine(Prefix) + "scalar.ph"); 3000 3001 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3002 3003 // Set up the middle block terminator. 
Two cases:
3004 // 1) If we know that we must execute the scalar epilogue, emit an
3005 // unconditional branch.
3006 // 2) Otherwise, we must have a single unique exit block (due to how we
3007 // implement the multiple exit case). In this case, set up a conditional
3008 // branch from the middle block to the loop scalar preheader, and the
3009 // exit block. completeLoopSkeleton will update the condition to use an
3010 // iteration check, if required to decide whether to execute the remainder.
3011 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3012 BranchInst::Create(LoopScalarPreHeader) :
3013 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3014 Builder.getTrue());
3015 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3016 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3017
3018 // Update dominator for loop exit. During skeleton creation, only the vector
3019 // pre-header and the middle block are created. The vector loop is entirely
3020 // created during VPlan execution.
3021 if (!Cost->requiresScalarEpilogue(VF))
3022 // If there is an epilogue which must run, there's no edge from the
3023 // middle block to exit blocks and thus no need to update the immediate
3024 // dominator of the exit blocks.
3025 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3026 }
3027
3028 void InnerLoopVectorizer::createInductionResumeValues(
3029 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3030 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3031 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3032 "Inconsistent information about additional bypass.");
3033
3034 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3035 assert(VectorTripCount && "Expected valid arguments");
3036 // We are going to resume the execution of the scalar loop.
3037 // Go over all of the induction variables that we found and fix the
3038 // PHIs that are left in the scalar version of the loop.
3039 // The starting values of PHI nodes depend on the counter of the last
3040 // iteration in the vectorized loop.
3041 // If we come from a bypass edge then we need to start from the original
3042 // start value.
3043 Instruction *OldInduction = Legal->getPrimaryInduction();
3044 for (auto &InductionEntry : Legal->getInductionVars()) {
3045 PHINode *OrigPhi = InductionEntry.first;
3046 InductionDescriptor II = InductionEntry.second;
3047
3048 Value *&EndValue = IVEndValues[OrigPhi];
3049 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3050 if (OrigPhi == OldInduction) {
3051 // We know what the end value is.
3052 EndValue = VectorTripCount;
3053 } else {
3054 IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3055
3056 // Fast-math-flags propagate from the original induction instruction.
3057 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3058 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3059
3060 Type *StepType = II.getStep()->getType();
3061 Instruction::CastOps CastOp =
3062 CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3063 Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc");
3064 Value *Step =
3065 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3066 EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
3067 EndValue->setName("ind.end");
3068
3069 // Compute the end value for the additional bypass (if applicable).
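// As an illustrative sketch: for an integer induction with start value 100
// and step 3, emitTransformedIndex above yields 100 + 3 * %cast.vtc as the
// resume value; the same computation is repeated below with the additional
// bypass value when one is provided.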
3070 if (AdditionalBypass.first) { 3071 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3072 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3073 StepType, true); 3074 Value *Step = 3075 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3076 VTC = 3077 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc"); 3078 EndValueFromAdditionalBypass = 3079 emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); 3080 EndValueFromAdditionalBypass->setName("ind.end"); 3081 } 3082 } 3083 3084 // Create phi nodes to merge from the backedge-taken check block. 3085 PHINode *BCResumeVal = 3086 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3087 LoopScalarPreHeader->getTerminator()); 3088 // Copy original phi DL over to the new one. 3089 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3090 3091 // The new PHI merges the original incoming value, in case of a bypass, 3092 // or the value at the end of the vectorized loop. 3093 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3094 3095 // Fix the scalar body counter (PHI node). 3096 // The old induction's phi node in the scalar body needs the truncated 3097 // value. 3098 for (BasicBlock *BB : LoopBypassBlocks) 3099 BCResumeVal->addIncoming(II.getStartValue(), BB); 3100 3101 if (AdditionalBypass.first) 3102 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3103 EndValueFromAdditionalBypass); 3104 3105 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3106 } 3107 } 3108 3109 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) { 3110 // The trip counts should be cached by now. 3111 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 3112 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3113 3114 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3115 3116 // Add a check in the middle block to see if we have completed 3117 // all of the iterations in the first vector loop. Three cases: 3118 // 1) If we require a scalar epilogue, there is no conditional branch as 3119 // we unconditionally branch to the scalar preheader. Do nothing. 3120 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3121 // Thus if tail is to be folded, we know we don't need to run the 3122 // remainder and we can use the previous value for the condition (true). 3123 // 3) Otherwise, construct a runtime check. 3124 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3125 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3126 Count, VectorTripCount, "cmp.n", 3127 LoopMiddleBlock->getTerminator()); 3128 3129 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3130 // of the corresponding compare because they may have ended up with 3131 // different line numbers and we want to avoid awkward line stepping while 3132 // debugging. Eg. if the compare has got a line number inside the loop. 3133 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3134 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3135 } 3136 3137 #ifdef EXPENSIVE_CHECKS 3138 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3139 #endif 3140 3141 return LoopVectorPreHeader; 3142 } 3143 3144 std::pair<BasicBlock *, Value *> 3145 InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3146 /* 3147 In this function we generate a new loop. 
The new loop will contain 3148 the vectorized instructions while the old loop will continue to run the 3149 scalar remainder. 3150 3151 [ ] <-- loop iteration number check. 3152 / | 3153 / v 3154 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3155 | / | 3156 | / v 3157 || [ ] <-- vector pre header. 3158 |/ | 3159 | v 3160 | [ ] \ 3161 | [ ]_| <-- vector loop (created during VPlan execution). 3162 | | 3163 | v 3164 \ -[ ] <--- middle-block. 3165 \/ | 3166 /\ v 3167 | ->[ ] <--- new preheader. 3168 | | 3169 (opt) v <-- edge from middle to exit iff epilogue is not required. 3170 | [ ] \ 3171 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3172 \ | 3173 \ v 3174 >[ ] <-- exit block(s). 3175 ... 3176 */ 3177 3178 // Get the metadata of the original loop before it gets modified. 3179 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3180 3181 // Workaround! Compute the trip count of the original loop and cache it 3182 // before we start modifying the CFG. This code has a systemic problem 3183 // wherein it tries to run analysis over partially constructed IR; this is 3184 // wrong, and not simply for SCEV. The trip count of the original loop 3185 // simply happens to be prone to hitting this in practice. In theory, we 3186 // can hit the same issue for any SCEV, or ValueTracking query done during 3187 // mutation. See PR49900. 3188 getOrCreateTripCount(OrigLoop->getLoopPreheader()); 3189 3190 // Create an empty vector loop, and prepare basic blocks for the runtime 3191 // checks. 3192 createVectorLoopSkeleton(""); 3193 3194 // Now, compare the new count to zero. If it is zero skip the vector loop and 3195 // jump to the scalar loop. This check also covers the case where the 3196 // backedge-taken count is uint##_max: adding one to it will overflow leading 3197 // to an incorrect trip count of zero. In this (rare) case we will also jump 3198 // to the scalar loop. 3199 emitIterationCountCheck(LoopScalarPreHeader); 3200 3201 // Generate the code to check any assumptions that we've made for SCEV 3202 // expressions. 3203 emitSCEVChecks(LoopScalarPreHeader); 3204 3205 // Generate the code that checks in runtime if arrays overlap. We put the 3206 // checks into a separate block to make the more common case of few elements 3207 // faster. 3208 emitMemRuntimeChecks(LoopScalarPreHeader); 3209 3210 // Emit phis for the new starting index of the scalar loop. 3211 createInductionResumeValues(); 3212 3213 return {completeLoopSkeleton(OrigLoopID), nullptr}; 3214 } 3215 3216 // Fix up external users of the induction variable. At this point, we are 3217 // in LCSSA form, with all external PHIs that use the IV having one input value, 3218 // coming from the remainder loop. We need those PHIs to also have a correct 3219 // value for the IV when arriving directly from the middle block. 3220 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3221 const InductionDescriptor &II, 3222 Value *VectorTripCount, Value *EndValue, 3223 BasicBlock *MiddleBlock, 3224 BasicBlock *VectorHeader, VPlan &Plan) { 3225 // There are two kinds of external IV usages - those that use the value 3226 // computed in the last iteration (the PHI) and those that use the penultimate 3227 // value (the value that feeds into the phi from the loop latch). 3228 // We allow both, but they, obviously, have different values. 
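// As an illustrative sketch, in shorthand LCSSA IR:
//
//   loop:
//     %iv      = phi [ 0, %ph ], [ %iv.next, %loop ]
//     %iv.next = add %iv, 1
//     ...
//   exit:
//     %use.last = phi [ %iv.next, %loop ] ; receives EndValue
//     %use.prev = phi [ %iv, %loop ]      ; receives Start + Step * (CRD - 1)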
3229
3230 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3231
3232 DenseMap<Value *, Value *> MissingVals;
3233
3234 // An external user of the last iteration's value should see the value that
3235 // the remainder loop uses to initialize its own IV.
3236 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3237 for (User *U : PostInc->users()) {
3238 Instruction *UI = cast<Instruction>(U);
3239 if (!OrigLoop->contains(UI)) {
3240 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3241 MissingVals[UI] = EndValue;
3242 }
3243 }
3244
3245 // An external user of the penultimate value needs to see EndValue - Step.
3246 // The simplest way to get this is to recompute it from the constituent SCEVs,
3247 // that is Start + (Step * (CRD - 1)).
3248 for (User *U : OrigPhi->users()) {
3249 auto *UI = cast<Instruction>(U);
3250 if (!OrigLoop->contains(UI)) {
3251 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3252
3253 IRBuilder<> B(MiddleBlock->getTerminator());
3254
3255 // Fast-math-flags propagate from the original induction instruction.
3256 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3257 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3258
3259 Value *CountMinusOne = B.CreateSub(
3260 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3261 Value *CMO =
3262 !II.getStep()->getType()->isIntegerTy()
3263 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3264 II.getStep()->getType())
3265 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3266 CMO->setName("cast.cmo");
3267
3268 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3269 VectorHeader->getTerminator());
3270 Value *Escape =
3271 emitTransformedIndex(B, CMO, II.getStartValue(), Step, II);
3272 Escape->setName("ind.escape");
3273 MissingVals[UI] = Escape;
3274 }
3275 }
3276
3277 for (auto &I : MissingVals) {
3278 PHINode *PHI = cast<PHINode>(I.first);
3279 // One corner case we have to handle is two IVs "chasing" each other,
3280 // that is %IV2 = phi [...], [ %IV1, %latch ]
3281 // In this case, if IV1 has an external use, we need to avoid adding both
3282 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3283 // don't already have an incoming value for the middle block.
3284 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) { 3285 PHI->addIncoming(I.second, MiddleBlock); 3286 Plan.removeLiveOut(PHI); 3287 } 3288 } 3289 } 3290 3291 namespace { 3292 3293 struct CSEDenseMapInfo { 3294 static bool canHandle(const Instruction *I) { 3295 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3296 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3297 } 3298 3299 static inline Instruction *getEmptyKey() { 3300 return DenseMapInfo<Instruction *>::getEmptyKey(); 3301 } 3302 3303 static inline Instruction *getTombstoneKey() { 3304 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3305 } 3306 3307 static unsigned getHashValue(const Instruction *I) { 3308 assert(canHandle(I) && "Unknown instruction!"); 3309 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3310 I->value_op_end())); 3311 } 3312 3313 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3314 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3315 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3316 return LHS == RHS; 3317 return LHS->isIdenticalTo(RHS); 3318 } 3319 }; 3320 3321 } // end anonymous namespace 3322 3323 ///Perform cse of induction variable instructions. 3324 static void cse(BasicBlock *BB) { 3325 // Perform simple cse. 3326 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3327 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3328 if (!CSEDenseMapInfo::canHandle(&In)) 3329 continue; 3330 3331 // Check if we can replace this instruction with any of the 3332 // visited instructions. 3333 if (Instruction *V = CSEMap.lookup(&In)) { 3334 In.replaceAllUsesWith(V); 3335 In.eraseFromParent(); 3336 continue; 3337 } 3338 3339 CSEMap[&In] = &In; 3340 } 3341 } 3342 3343 InstructionCost 3344 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3345 bool &NeedToScalarize) const { 3346 Function *F = CI->getCalledFunction(); 3347 Type *ScalarRetTy = CI->getType(); 3348 SmallVector<Type *, 4> Tys, ScalarTys; 3349 for (auto &ArgOp : CI->args()) 3350 ScalarTys.push_back(ArgOp->getType()); 3351 3352 // Estimate cost of scalarized vector call. The source operands are assumed 3353 // to be vectors, so we need to extract individual elements from there, 3354 // execute VF scalar calls, and then gather the result into the vector return 3355 // value. 3356 InstructionCost ScalarCallCost = 3357 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3358 if (VF.isScalar()) 3359 return ScalarCallCost; 3360 3361 // Compute corresponding vector type for return value and arguments. 3362 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3363 for (Type *ScalarTy : ScalarTys) 3364 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3365 3366 // Compute costs of unpacking argument values for the scalar calls and 3367 // packing the return values to a vector. 3368 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3369 3370 InstructionCost Cost = 3371 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3372 3373 // If we can't emit a vector call for this function, then the currently found 3374 // cost is the cost we need to return. 3375 NeedToScalarize = true; 3376 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3377 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3378 3379 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3380 return Cost; 3381 3382 // If the corresponding vector cost is cheaper, return its cost. 
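// As an illustrative sketch with made-up numbers: for VF = 4, a scalar call
// cost of 10 and a scalarization overhead of 12 give a scalarized cost of
// 10 * 4 + 12 = 52; if a vector variant of the callee is available for 20,
// the code below returns 20 and clears NeedToScalarize.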
3383 InstructionCost VectorCallCost = 3384 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3385 if (VectorCallCost < Cost) { 3386 NeedToScalarize = false; 3387 Cost = VectorCallCost; 3388 } 3389 return Cost; 3390 } 3391 3392 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3393 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3394 return Elt; 3395 return VectorType::get(Elt, VF); 3396 } 3397 3398 InstructionCost 3399 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3400 ElementCount VF) const { 3401 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3402 assert(ID && "Expected intrinsic call!"); 3403 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3404 FastMathFlags FMF; 3405 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3406 FMF = FPMO->getFastMathFlags(); 3407 3408 SmallVector<const Value *> Arguments(CI->args()); 3409 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3410 SmallVector<Type *> ParamTys; 3411 std::transform(FTy->param_begin(), FTy->param_end(), 3412 std::back_inserter(ParamTys), 3413 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3414 3415 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3416 dyn_cast<IntrinsicInst>(CI)); 3417 return TTI.getIntrinsicInstrCost(CostAttrs, 3418 TargetTransformInfo::TCK_RecipThroughput); 3419 } 3420 3421 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3422 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3423 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3424 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3425 } 3426 3427 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3428 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3429 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3430 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3431 } 3432 3433 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3434 // For every instruction `I` in MinBWs, truncate the operands, create a 3435 // truncated version of `I` and reextend its result. InstCombine runs 3436 // later and will remove any ext/trunc pairs. 3437 SmallPtrSet<Value *, 4> Erased; 3438 for (const auto &KV : Cost->getMinimalBitwidths()) { 3439 // If the value wasn't vectorized, we must maintain the original scalar 3440 // type. The absence of the value from State indicates that it 3441 // wasn't vectorized. 3442 // FIXME: Should not rely on getVPValue at this point. 3443 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3444 if (!State.hasAnyVectorValue(Def)) 3445 continue; 3446 for (unsigned Part = 0; Part < UF; ++Part) { 3447 Value *I = State.get(Def, Part); 3448 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3449 continue; 3450 Type *OriginalTy = I->getType(); 3451 Type *ScalarTruncatedTy = 3452 IntegerType::get(OriginalTy->getContext(), KV.second); 3453 auto *TruncatedTy = VectorType::get( 3454 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3455 if (TruncatedTy == OriginalTy) 3456 continue; 3457 3458 IRBuilder<> B(cast<Instruction>(I)); 3459 auto ShrinkOperand = [&](Value *V) -> Value * { 3460 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3461 if (ZI->getSrcTy() == TruncatedTy) 3462 return ZI->getOperand(0); 3463 return B.CreateZExtOrTrunc(V, TruncatedTy); 3464 }; 3465 3466 // The actual instruction modification depends on the instruction type, 3467 // unfortunately. 
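// As an illustrative sketch: if MinBWs records that an i32 add only needs 8
// bits, then for VF = 4
//   %a = add <4 x i32> %x, %y
// is rebuilt as an add of its operands truncated to <4 x i8>, and the result
// is zero-extended back to <4 x i32> below; InstCombine later removes the
// redundant trunc/ext pairs.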
3468 Value *NewI = nullptr; 3469 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3470 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3471 ShrinkOperand(BO->getOperand(1))); 3472 3473 // Any wrapping introduced by shrinking this operation shouldn't be 3474 // considered undefined behavior. So, we can't unconditionally copy 3475 // arithmetic wrapping flags to NewI. 3476 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3477 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3478 NewI = 3479 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3480 ShrinkOperand(CI->getOperand(1))); 3481 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3482 NewI = B.CreateSelect(SI->getCondition(), 3483 ShrinkOperand(SI->getTrueValue()), 3484 ShrinkOperand(SI->getFalseValue())); 3485 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3486 switch (CI->getOpcode()) { 3487 default: 3488 llvm_unreachable("Unhandled cast!"); 3489 case Instruction::Trunc: 3490 NewI = ShrinkOperand(CI->getOperand(0)); 3491 break; 3492 case Instruction::SExt: 3493 NewI = B.CreateSExtOrTrunc( 3494 CI->getOperand(0), 3495 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3496 break; 3497 case Instruction::ZExt: 3498 NewI = B.CreateZExtOrTrunc( 3499 CI->getOperand(0), 3500 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3501 break; 3502 } 3503 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3504 auto Elements0 = 3505 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3506 auto *O0 = B.CreateZExtOrTrunc( 3507 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3508 auto Elements1 = 3509 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3510 auto *O1 = B.CreateZExtOrTrunc( 3511 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3512 3513 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3514 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3515 // Don't do anything with the operands, just extend the result. 3516 continue; 3517 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3518 auto Elements = 3519 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3520 auto *O0 = B.CreateZExtOrTrunc( 3521 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3522 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3523 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3524 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3525 auto Elements = 3526 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3527 auto *O0 = B.CreateZExtOrTrunc( 3528 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3529 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3530 } else { 3531 // If we don't know what to do, be conservative and don't do anything. 3532 continue; 3533 } 3534 3535 // Lastly, extend the result. 3536 NewI->takeName(cast<Instruction>(I)); 3537 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3538 I->replaceAllUsesWith(Res); 3539 cast<Instruction>(I)->eraseFromParent(); 3540 Erased.insert(I); 3541 State.reset(Def, Res, Part); 3542 } 3543 } 3544 3545 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3546 for (const auto &KV : Cost->getMinimalBitwidths()) { 3547 // If the value wasn't vectorized, we must maintain the original scalar 3548 // type. The absence of the value from State indicates that it 3549 // wasn't vectorized. 3550 // FIXME: Should not rely on getVPValue at this point. 
3551 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3552 if (!State.hasAnyVectorValue(Def)) 3553 continue; 3554 for (unsigned Part = 0; Part < UF; ++Part) { 3555 Value *I = State.get(Def, Part); 3556 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3557 if (Inst && Inst->use_empty()) { 3558 Value *NewI = Inst->getOperand(0); 3559 Inst->eraseFromParent(); 3560 State.reset(Def, NewI, Part); 3561 } 3562 } 3563 } 3564 } 3565 3566 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, 3567 VPlan &Plan) { 3568 // Insert truncates and extends for any truncated instructions as hints to 3569 // InstCombine. 3570 if (VF.isVector()) 3571 truncateToMinimalBitwidths(State); 3572 3573 // Fix widened non-induction PHIs by setting up the PHI operands. 3574 if (EnableVPlanNativePath) 3575 fixNonInductionPHIs(Plan, State); 3576 3577 // At this point every instruction in the original loop is widened to a 3578 // vector form. Now we need to fix the recurrences in the loop. These PHI 3579 // nodes are currently empty because we did not want to introduce cycles. 3580 // This is the second stage of vectorizing recurrences. 3581 fixCrossIterationPHIs(State); 3582 3583 // Forget the original basic block. 3584 PSE.getSE()->forgetLoop(OrigLoop); 3585 3586 VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); 3587 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); 3588 if (Cost->requiresScalarEpilogue(VF)) { 3589 // No edge from the middle block to the unique exit block has been inserted 3590 // and there is nothing to fix from vector loop; phis should have incoming 3591 // from scalar loop only. 3592 Plan.clearLiveOuts(); 3593 } else { 3594 // If we inserted an edge from the middle block to the unique exit block, 3595 // update uses outside the loop (phis) to account for the newly inserted 3596 // edge. 3597 3598 // Fix-up external users of the induction variables. 3599 for (auto &Entry : Legal->getInductionVars()) 3600 fixupIVUsers(Entry.first, Entry.second, 3601 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), 3602 IVEndValues[Entry.first], LoopMiddleBlock, 3603 VectorLoop->getHeader(), Plan); 3604 } 3605 3606 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated 3607 // in the exit block, so update the builder. 3608 State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI()); 3609 for (auto &KV : Plan.getLiveOuts()) 3610 KV.second->fixPhi(Plan, State); 3611 3612 for (Instruction *PI : PredicatedInstructions) 3613 sinkScalarOperands(&*PI); 3614 3615 // Remove redundant induction instructions. 3616 cse(VectorLoop->getHeader()); 3617 3618 // Set/update profile weights for the vector and remainder loops as original 3619 // loop iterations are now distributed among them. Note that original loop 3620 // represented by LoopScalarBody becomes remainder loop after vectorization. 3621 // 3622 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3623 // end up getting slightly roughened result but that should be OK since 3624 // profile is not inherently precise anyway. Note also possible bypass of 3625 // vector code caused by legality checks is ignored, assigning all the weight 3626 // to the vector loop, optimistically. 3627 // 3628 // For scalable vectorization we can't know at compile time how many iterations 3629 // of the loop are handled in one vector iteration, so instead assume a pessimistic 3630 // vscale of '1'. 
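// As an illustrative sketch: with VF = 4, UF = 2 and an original trip count
// of roughly 1024, the vector loop receives about 1024 / 8 = 128 iterations
// worth of weight and the scalar remainder loop the rest; for scalable
// vectors the known minimum VF is used, i.e. vscale is assumed to be 1.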
3631 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop, 3632 LI->getLoopFor(LoopScalarBody), 3633 VF.getKnownMinValue() * UF); 3634 } 3635 3636 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 3637 // In order to support recurrences we need to be able to vectorize Phi nodes. 3638 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3639 // stage #2: We now need to fix the recurrences by adding incoming edges to 3640 // the currently empty PHI nodes. At this point every instruction in the 3641 // original loop is widened to a vector form so we can use them to construct 3642 // the incoming edges. 3643 VPBasicBlock *Header = 3644 State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); 3645 for (VPRecipeBase &R : Header->phis()) { 3646 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 3647 fixReduction(ReductionPhi, State); 3648 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 3649 fixFirstOrderRecurrence(FOR, State); 3650 } 3651 } 3652 3653 void InnerLoopVectorizer::fixFirstOrderRecurrence( 3654 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { 3655 // This is the second phase of vectorizing first-order recurrences. An 3656 // overview of the transformation is described below. Suppose we have the 3657 // following loop. 3658 // 3659 // for (int i = 0; i < n; ++i) 3660 // b[i] = a[i] - a[i - 1]; 3661 // 3662 // There is a first-order recurrence on "a". For this loop, the shorthand 3663 // scalar IR looks like: 3664 // 3665 // scalar.ph: 3666 // s_init = a[-1] 3667 // br scalar.body 3668 // 3669 // scalar.body: 3670 // i = phi [0, scalar.ph], [i+1, scalar.body] 3671 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3672 // s2 = a[i] 3673 // b[i] = s2 - s1 3674 // br cond, scalar.body, ... 3675 // 3676 // In this example, s1 is a recurrence because it's value depends on the 3677 // previous iteration. In the first phase of vectorization, we created a 3678 // vector phi v1 for s1. We now complete the vectorization and produce the 3679 // shorthand vector IR shown below (for VF = 4, UF = 1). 3680 // 3681 // vector.ph: 3682 // v_init = vector(..., ..., ..., a[-1]) 3683 // br vector.body 3684 // 3685 // vector.body 3686 // i = phi [0, vector.ph], [i+4, vector.body] 3687 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3688 // v2 = a[i, i+1, i+2, i+3]; 3689 // v3 = vector(v1(3), v2(0, 1, 2)) 3690 // b[i, i+1, i+2, i+3] = v2 - v3 3691 // br cond, vector.body, middle.block 3692 // 3693 // middle.block: 3694 // x = v2(3) 3695 // br scalar.ph 3696 // 3697 // scalar.ph: 3698 // s_init = phi [x, middle.block], [a[-1], otherwise] 3699 // br scalar.body 3700 // 3701 // After execution completes the vector loop, we extract the next value of 3702 // the recurrence (x) to use as the initial value in the scalar loop. 3703 3704 // Extract the last vector element in the middle block. This will be the 3705 // initial value for the recurrence when jumping to the scalar loop. 
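// As an illustrative sketch for fixed VF = 4, UF = 1: lane 3 of the final v2
// seeds the scalar loop's recurrence phi, while lane 2 (extracted further
// below) serves phis outside the loop, which need the value of the phi itself
// rather than its update.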
3706 VPValue *PreviousDef = PhiR->getBackedgeValue();
3707 Value *Incoming = State.get(PreviousDef, UF - 1);
3708 auto *ExtractForScalar = Incoming;
3709 auto *IdxTy = Builder.getInt32Ty();
3710 if (VF.isVector()) {
3711 auto *One = ConstantInt::get(IdxTy, 1);
3712 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3713 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3714 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3715 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3716 "vector.recur.extract");
3717 }
3718 // Extract the second last element in the middle block if the
3719 // Phi is used outside the loop. We need to extract the phi itself
3720 // and not the last element (the phi update in the current iteration). This
3721 // will be the value when jumping to the exit block from the LoopMiddleBlock,
3722 // when the scalar loop is not run at all.
3723 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3724 if (VF.isVector()) {
3725 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3726 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3727 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3728 Incoming, Idx, "vector.recur.extract.for.phi");
3729 } else if (UF > 1)
3730 // When the loop is unrolled without vectorizing, initialize
3731 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value
3732 // of `Incoming`. This is analogous to the vectorized case above: extracting
3733 // the second last element when VF > 1.
3734 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3735
3736 // Fix the initial value of the original recurrence in the scalar loop.
3737 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3738 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3739 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3740 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3741 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3742 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3743 Start->addIncoming(Incoming, BB);
3744 }
3745
3746 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3747 Phi->setName("scalar.recur");
3748
3749 // Finally, fix users of the recurrence outside the loop. The users will need
3750 // either the last value of the scalar recurrence or the last value of the
3751 // vector recurrence we extracted in the middle block. Since the loop is in
3752 // LCSSA form, we just need to find all the phi nodes for the original scalar
3753 // recurrence in the exit block, and then add an edge for the middle block.
3754 // Note that LCSSA does not imply single entry when the original scalar loop
3755 // had multiple exiting edges (as we always run the last iteration in the
3756 // scalar epilogue); in that case, there is no edge from middle to exit and
3757 // thus no phis which need to be updated.
3758 if (!Cost->requiresScalarEpilogue(VF))
3759 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3760 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3761 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3762 State.Plan->removeLiveOut(&LCSSAPhi);
3763 }
3764 }
3765
3766 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3767 VPTransformState &State) {
3768 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
3769 // Get its reduction variable descriptor.
3770 assert(Legal->isReductionVariable(OrigPhi) && 3771 "Unable to find the reduction variable"); 3772 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 3773 3774 RecurKind RK = RdxDesc.getRecurrenceKind(); 3775 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3776 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3777 State.setDebugLocFromInst(ReductionStartValue); 3778 3779 VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); 3780 // This is the vector-clone of the value that leaves the loop. 3781 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 3782 3783 // Wrap flags are in general invalid after vectorization, clear them. 3784 clearReductionWrapFlags(PhiR, State); 3785 3786 // Before each round, move the insertion point right between 3787 // the PHIs and the values we are going to write. 3788 // This allows us to write both PHINodes and the extractelement 3789 // instructions. 3790 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3791 3792 State.setDebugLocFromInst(LoopExitInst); 3793 3794 Type *PhiTy = OrigPhi->getType(); 3795 3796 VPBasicBlock *LatchVPBB = 3797 PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock(); 3798 BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB]; 3799 // If tail is folded by masking, the vector value to leave the loop should be 3800 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3801 // instead of the former. For an inloop reduction the reduction will already 3802 // be predicated, and does not need to be handled here. 3803 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 3804 for (unsigned Part = 0; Part < UF; ++Part) { 3805 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 3806 SelectInst *Sel = nullptr; 3807 for (User *U : VecLoopExitInst->users()) { 3808 if (isa<SelectInst>(U)) { 3809 assert(!Sel && "Reduction exit feeding two selects"); 3810 Sel = cast<SelectInst>(U); 3811 } else 3812 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3813 } 3814 assert(Sel && "Reduction exit feeds no select"); 3815 State.reset(LoopExitInstDef, Sel, Part); 3816 3817 if (isa<FPMathOperator>(Sel)) 3818 Sel->setFastMathFlags(RdxDesc.getFastMathFlags()); 3819 3820 // If the target can create a predicated operator for the reduction at no 3821 // extra cost in the loop (for example a predicated vadd), it can be 3822 // cheaper for the select to remain in the loop than be sunk out of it, 3823 // and so use the select value for the phi instead of the old 3824 // LoopExitValue. 3825 if (PreferPredicatedReductionSelect || 3826 TTI->preferPredicatedReductionSelect( 3827 RdxDesc.getOpcode(), PhiTy, 3828 TargetTransformInfo::ReductionFlags())) { 3829 auto *VecRdxPhi = 3830 cast<PHINode>(State.get(PhiR, Part)); 3831 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel); 3832 } 3833 } 3834 } 3835 3836 // If the vector reduction can be performed in a smaller type, we truncate 3837 // then extend the loop exit value to enable InstCombine to evaluate the 3838 // entire expression in the smaller type. 
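// As an illustrative sketch: for an i32 reduction that only needs 8 bits,
// each unrolled part is truncated to <VF x i8> and immediately sign- or
// zero-extended back, so existing users keep seeing the wide type; in the
// middle block the parts are truncated once more before the final reduction,
// and the reduced value is extended back to i32 after the loop.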
3839 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 3840 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 3841 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3842 Builder.SetInsertPoint(VectorLoopLatch->getTerminator()); 3843 VectorParts RdxParts(UF); 3844 for (unsigned Part = 0; Part < UF; ++Part) { 3845 RdxParts[Part] = State.get(LoopExitInstDef, Part); 3846 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3847 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3848 : Builder.CreateZExt(Trunc, VecTy); 3849 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 3850 if (U != Trunc) { 3851 U->replaceUsesOfWith(RdxParts[Part], Extnd); 3852 RdxParts[Part] = Extnd; 3853 } 3854 } 3855 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3856 for (unsigned Part = 0; Part < UF; ++Part) { 3857 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3858 State.reset(LoopExitInstDef, RdxParts[Part], Part); 3859 } 3860 } 3861 3862 // Reduce all of the unrolled parts into a single vector. 3863 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 3864 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 3865 3866 // The middle block terminator has already been assigned a DebugLoc here (the 3867 // OrigLoop's single latch terminator). We want the whole middle block to 3868 // appear to execute on this line because: (a) it is all compiler generated, 3869 // (b) these instructions are always executed after evaluating the latch 3870 // conditional branch, and (c) other passes may add new predecessors which 3871 // terminate on this line. This is the easiest way to ensure we don't 3872 // accidentally cause an extra step back into the loop while debugging. 3873 State.setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 3874 if (PhiR->isOrdered()) 3875 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 3876 else { 3877 // Floating-point operations should have some FMF to enable the reduction. 3878 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 3879 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 3880 for (unsigned Part = 1; Part < UF; ++Part) { 3881 Value *RdxPart = State.get(LoopExitInstDef, Part); 3882 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 3883 ReducedPartRdx = Builder.CreateBinOp( 3884 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 3885 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 3886 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 3887 ReducedPartRdx, RdxPart); 3888 else 3889 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 3890 } 3891 } 3892 3893 // Create the reduction after the loop. Note that inloop reductions create the 3894 // target reduction in the loop using a Reduction recipe. 3895 if (VF.isVector() && !PhiR->isInLoop()) { 3896 ReducedPartRdx = 3897 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 3898 // If the reduction can be performed in a smaller type, we need to extend 3899 // the reduction to the wider type before we branch to the original loop. 3900 if (PhiTy != RdxDesc.getRecurrenceType()) 3901 ReducedPartRdx = RdxDesc.isSigned() 3902 ? 
Builder.CreateSExt(ReducedPartRdx, PhiTy)
3903 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
3904 }
3905
3906 PHINode *ResumePhi =
3907 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
3908
3909 // Create a phi node that merges control-flow from the backedge-taken check
3910 // block and the middle block.
3911 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
3912 LoopScalarPreHeader->getTerminator());
3913
3914 // If we are fixing reductions in the epilogue loop then we should already
3915 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
3916 // we carry over the incoming values correctly.
3917 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
3918 if (Incoming == LoopMiddleBlock)
3919 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
3920 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
3921 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
3922 Incoming);
3923 else
3924 BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
3925 }
3926
3927 // Set the resume value for this reduction.
3928 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
3929
3930 // If there were stores of the reduction value to a uniform memory address
3931 // inside the loop, create the final store here.
3932 if (StoreInst *SI = RdxDesc.IntermediateStore) {
3933 StoreInst *NewSI =
3934 Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
3935 propagateMetadata(NewSI, SI);
3936
3937 // If the reduction value is used in other places,
3938 // then let the code below create PHIs for that.
3939 }
3940
3941 // Now, we need to fix the users of the reduction variable
3942 // inside and outside of the scalar remainder loop.
3943
3944 // We know that the loop is in LCSSA form. We need to update the PHI nodes
3945 // in the exit blocks. See comment on analogous loop in
3946 // fixFirstOrderRecurrence for a more complete explanation of the logic.
3947 if (!Cost->requiresScalarEpilogue(VF))
3948 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3949 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
3950 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3951 State.Plan->removeLiveOut(&LCSSAPhi);
3952 }
3953
3954 // Fix the scalar loop reduction variable with the incoming reduction sum
3955 // from the vector body and from the backedge value.
3956 int IncomingEdgeBlockIdx =
3957 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3958 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3959 // Pick the other block.
3960 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ?
0 : 1); 3961 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3962 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3963 } 3964 3965 void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, 3966 VPTransformState &State) { 3967 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 3968 RecurKind RK = RdxDesc.getRecurrenceKind(); 3969 if (RK != RecurKind::Add && RK != RecurKind::Mul) 3970 return; 3971 3972 SmallVector<VPValue *, 8> Worklist; 3973 SmallPtrSet<VPValue *, 8> Visited; 3974 Worklist.push_back(PhiR); 3975 Visited.insert(PhiR); 3976 3977 while (!Worklist.empty()) { 3978 VPValue *Cur = Worklist.pop_back_val(); 3979 for (unsigned Part = 0; Part < UF; ++Part) { 3980 Value *V = State.get(Cur, Part); 3981 if (!isa<OverflowingBinaryOperator>(V)) 3982 break; 3983 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 3984 } 3985 3986 for (VPUser *U : Cur->users()) { 3987 auto *UserRecipe = dyn_cast<VPRecipeBase>(U); 3988 if (!UserRecipe) 3989 continue; 3990 for (VPValue *V : UserRecipe->definedValues()) 3991 if (Visited.insert(V).second) 3992 Worklist.push_back(V); 3993 } 3994 } 3995 } 3996 3997 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 3998 // The basic block and loop containing the predicated instruction. 3999 auto *PredBB = PredInst->getParent(); 4000 auto *VectorLoop = LI->getLoopFor(PredBB); 4001 4002 // Initialize a worklist with the operands of the predicated instruction. 4003 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4004 4005 // Holds instructions that we need to analyze again. An instruction may be 4006 // reanalyzed if we don't yet know if we can sink it or not. 4007 SmallVector<Instruction *, 8> InstsToReanalyze; 4008 4009 // Returns true if a given use occurs in the predicated block. Phi nodes use 4010 // their operands in their corresponding predecessor blocks. 4011 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4012 auto *I = cast<Instruction>(U.getUser()); 4013 BasicBlock *BB = I->getParent(); 4014 if (auto *Phi = dyn_cast<PHINode>(I)) 4015 BB = Phi->getIncomingBlock( 4016 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4017 return BB == PredBB; 4018 }; 4019 4020 // Iteratively sink the scalarized operands of the predicated instruction 4021 // into the block we created for it. When an instruction is sunk, it's 4022 // operands are then added to the worklist. The algorithm ends after one pass 4023 // through the worklist doesn't sink a single instruction. 4024 bool Changed; 4025 do { 4026 // Add the instructions that need to be reanalyzed to the worklist, and 4027 // reset the changed indicator. 4028 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4029 InstsToReanalyze.clear(); 4030 Changed = false; 4031 4032 while (!Worklist.empty()) { 4033 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4034 4035 // We can't sink an instruction if it is a phi node, is not in the loop, 4036 // or may have side effects. 4037 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4038 I->mayHaveSideEffects()) 4039 continue; 4040 4041 // If the instruction is already in PredBB, check if we can sink its 4042 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4043 // sinking the scalar instruction I, hence it appears in PredBB; but it 4044 // may have failed to sink I's operands (recursively), which we try 4045 // (again) here. 
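// As an illustrative sketch: a scalarized getelementptr already sunk into
// PredBB may still be fed by an add that stayed outside the block;
// re-queueing the getelementptr's operands lets a later pass over the
// worklist sink that add too, once all of its uses are in the predicated
// block.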
4046 if (I->getParent() == PredBB) { 4047 Worklist.insert(I->op_begin(), I->op_end()); 4048 continue; 4049 } 4050 4051 // It's legal to sink the instruction if all its uses occur in the 4052 // predicated block. Otherwise, there's nothing to do yet, and we may 4053 // need to reanalyze the instruction. 4054 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4055 InstsToReanalyze.push_back(I); 4056 continue; 4057 } 4058 4059 // Move the instruction to the beginning of the predicated block, and add 4060 // it's operands to the worklist. 4061 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4062 Worklist.insert(I->op_begin(), I->op_end()); 4063 4064 // The sinking may have enabled other instructions to be sunk, so we will 4065 // need to iterate. 4066 Changed = true; 4067 } 4068 } while (Changed); 4069 } 4070 4071 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, 4072 VPTransformState &State) { 4073 auto Iter = depth_first( 4074 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry())); 4075 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 4076 for (VPRecipeBase &P : VPBB->phis()) { 4077 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P); 4078 if (!VPPhi) 4079 continue; 4080 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4081 // Make sure the builder has a valid insert point. 4082 Builder.SetInsertPoint(NewPhi); 4083 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4084 VPValue *Inc = VPPhi->getIncomingValue(i); 4085 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4086 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4087 } 4088 } 4089 } 4090 } 4091 4092 bool InnerLoopVectorizer::useOrderedReductions( 4093 const RecurrenceDescriptor &RdxDesc) { 4094 return Cost->useOrderedReductions(RdxDesc); 4095 } 4096 4097 /// A helper function for checking whether an integer division-related 4098 /// instruction may divide by zero (in which case it must be predicated if 4099 /// executed conditionally in the scalar code). 4100 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4101 /// Non-zero divisors that are non compile-time constants will not be 4102 /// converted into multiplication, so we will still end up scalarizing 4103 /// the division, but can do so w/o predication. 4104 static bool mayDivideByZero(Instruction &I) { 4105 assert((I.getOpcode() == Instruction::UDiv || 4106 I.getOpcode() == Instruction::SDiv || 4107 I.getOpcode() == Instruction::URem || 4108 I.getOpcode() == Instruction::SRem) && 4109 "Unexpected instruction"); 4110 Value *Divisor = I.getOperand(1); 4111 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4112 return !CInt || CInt->isZero(); 4113 } 4114 4115 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4116 VPUser &ArgOperands, 4117 VPTransformState &State) { 4118 assert(!isa<DbgInfoIntrinsic>(I) && 4119 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4120 State.setDebugLocFromInst(&I); 4121 4122 Module *M = I.getParent()->getParent()->getParent(); 4123 auto *CI = cast<CallInst>(&I); 4124 4125 SmallVector<Type *, 4> Tys; 4126 for (Value *ArgOperand : CI->args()) 4127 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4128 4129 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4130 4131 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4132 // version of the instruction. 4133 // Is it beneficial to perform intrinsic call compared to lib call? 
4134 bool NeedToScalarize = false; 4135 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4136 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4137 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4138 assert((UseVectorIntrinsic || !NeedToScalarize) && 4139 "Instruction should be scalarized elsewhere."); 4140 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4141 "Either the intrinsic cost or vector call cost must be valid"); 4142 4143 for (unsigned Part = 0; Part < UF; ++Part) { 4144 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4145 SmallVector<Value *, 4> Args; 4146 for (auto &I : enumerate(ArgOperands.operands())) { 4147 // Some intrinsics have a scalar argument - don't replace it with a 4148 // vector. 4149 Value *Arg; 4150 if (!UseVectorIntrinsic || 4151 !isVectorIntrinsicWithScalarOpAtArg(ID, I.index())) 4152 Arg = State.get(I.value(), Part); 4153 else 4154 Arg = State.get(I.value(), VPIteration(0, 0)); 4155 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index())) 4156 TysForDecl.push_back(Arg->getType()); 4157 Args.push_back(Arg); 4158 } 4159 4160 Function *VectorF; 4161 if (UseVectorIntrinsic) { 4162 // Use vector version of the intrinsic. 4163 if (VF.isVector()) 4164 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4165 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4166 assert(VectorF && "Can't retrieve vector intrinsic."); 4167 } else { 4168 // Use vector version of the function call. 4169 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4170 #ifndef NDEBUG 4171 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4172 "Can't create vector function."); 4173 #endif 4174 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4175 } 4176 SmallVector<OperandBundleDef, 1> OpBundles; 4177 CI->getOperandBundlesAsDefs(OpBundles); 4178 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4179 4180 if (isa<FPMathOperator>(V)) 4181 V->copyFastMathFlags(CI); 4182 4183 State.set(Def, V, Part); 4184 State.addMetadata(V, &I); 4185 } 4186 } 4187 4188 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4189 // We should not collect Scalars more than once per VF. Right now, this 4190 // function is called from collectUniformsAndScalars(), which already does 4191 // this check. Collecting Scalars for VF=1 does not make any sense. 4192 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4193 "This function should not be visited twice for the same VF"); 4194 4195 // This avoids any chances of creating a REPLICATE recipe during planning 4196 // since that would result in generation of scalarized code during execution, 4197 // which is not supported for scalable vectors. 4198 if (VF.isScalable()) { 4199 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4200 return; 4201 } 4202 4203 SmallSetVector<Instruction *, 8> Worklist; 4204 4205 // These sets are used to seed the analysis with pointers used by memory 4206 // accesses that will remain scalar. 4207 SmallSetVector<Instruction *, 8> ScalarPtrs; 4208 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4209 auto *Latch = TheLoop->getLoopLatch(); 4210 4211 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4212 // The pointer operands of loads and stores will be scalar as long as the 4213 // memory access is not a gather or scatter operation. The value operand of a 4214 // store will remain scalar if the store is scalarized. 
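  // For example (hypothetical loop): in `a[i] = b[i]`, the GEPs feeding the
  // pointer operands of the widened load and store only need their lane-0
  // value, so those uses are scalar; if the same access were widened as a
  // gather/scatter, each lane would need its own address and the pointer use
  // would not be scalar. A scalarized store likewise keeps its value operand
  // scalar, one copy per lane.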
4215 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4216 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4217 assert(WideningDecision != CM_Unknown && 4218 "Widening decision should be ready at this moment"); 4219 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4220 if (Ptr == Store->getValueOperand()) 4221 return WideningDecision == CM_Scalarize; 4222 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4223 "Ptr is neither a value nor a pointer operand"); 4224 return WideningDecision != CM_GatherScatter; 4225 }; 4226 4227 // A helper that returns true if the given value is a bitcast or 4228 // getelementptr instruction contained in the loop. 4229 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4230 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4231 isa<GetElementPtrInst>(V)) && 4232 !TheLoop->isLoopInvariant(V); 4233 }; 4234 4235 // A helper that evaluates a memory access's use of a pointer. If the use will 4236 // be a scalar use and the pointer is only used by memory accesses, we place 4237 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4238 // PossibleNonScalarPtrs. 4239 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4240 // We only care about bitcast and getelementptr instructions contained in 4241 // the loop. 4242 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4243 return; 4244 4245 // If the pointer has already been identified as scalar (e.g., if it was 4246 // also identified as uniform), there's nothing to do. 4247 auto *I = cast<Instruction>(Ptr); 4248 if (Worklist.count(I)) 4249 return; 4250 4251 // If the use of the pointer will be a scalar use, and all users of the 4252 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4253 // place the pointer in PossibleNonScalarPtrs. 4254 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4255 return isa<LoadInst>(U) || isa<StoreInst>(U); 4256 })) 4257 ScalarPtrs.insert(I); 4258 else 4259 PossibleNonScalarPtrs.insert(I); 4260 }; 4261 4262 // We seed the scalars analysis with two classes of instructions: (1) 4263 // instructions marked uniform-after-vectorization and (2) bitcast, 4264 // getelementptr and (pointer) phi instructions used by memory accesses 4265 // requiring a scalar use. 4266 // 4267 // (1) Add to the worklist all instructions that have been identified as 4268 // uniform-after-vectorization. 4269 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4270 4271 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4272 // memory accesses requiring a scalar use. The pointer operands of loads and 4273 // stores will be scalar as long as the memory access is not a gather or 4274 // scatter operation. The value operand of a store will remain scalar if the 4275 // store is scalarized. 4276 for (auto *BB : TheLoop->blocks()) 4277 for (auto &I : *BB) { 4278 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4279 evaluatePtrUse(Load, Load->getPointerOperand()); 4280 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4281 evaluatePtrUse(Store, Store->getPointerOperand()); 4282 evaluatePtrUse(Store, Store->getValueOperand()); 4283 } 4284 } 4285 for (auto *I : ScalarPtrs) 4286 if (!PossibleNonScalarPtrs.count(I)) { 4287 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4288 Worklist.insert(I); 4289 } 4290 4291 // Insert the forced scalars.
4292 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector 4293 // induction variable when the PHI user is scalarized. 4294 auto ForcedScalar = ForcedScalars.find(VF); 4295 if (ForcedScalar != ForcedScalars.end()) 4296 for (auto *I : ForcedScalar->second) 4297 Worklist.insert(I); 4298 4299 // Expand the worklist by looking through any bitcasts and getelementptr 4300 // instructions we've already identified as scalar. This is similar to the 4301 // expansion step in collectLoopUniforms(); however, here we're only 4302 // expanding to include additional bitcasts and getelementptr instructions. 4303 unsigned Idx = 0; 4304 while (Idx != Worklist.size()) { 4305 Instruction *Dst = Worklist[Idx++]; 4306 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4307 continue; 4308 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4309 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4310 auto *J = cast<Instruction>(U); 4311 return !TheLoop->contains(J) || Worklist.count(J) || 4312 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4313 isScalarUse(J, Src)); 4314 })) { 4315 Worklist.insert(Src); 4316 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4317 } 4318 } 4319 4320 // An induction variable will remain scalar if all users of the induction 4321 // variable and induction variable update remain scalar. 4322 for (auto &Induction : Legal->getInductionVars()) { 4323 auto *Ind = Induction.first; 4324 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4325 4326 // If tail-folding is applied, the primary induction variable will be used 4327 // to feed a vector compare. 4328 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4329 continue; 4330 4331 // Returns true if \p Indvar is a pointer induction that is used directly by 4332 // load/store instruction \p I. 4333 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4334 Instruction *I) { 4335 return Induction.second.getKind() == 4336 InductionDescriptor::IK_PtrInduction && 4337 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4338 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4339 }; 4340 4341 // Determine if all users of the induction variable are scalar after 4342 // vectorization. 4343 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4344 auto *I = cast<Instruction>(U); 4345 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4346 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4347 }); 4348 if (!ScalarInd) 4349 continue; 4350 4351 // Determine if all users of the induction variable update instruction are 4352 // scalar after vectorization. 4353 auto ScalarIndUpdate = 4354 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4355 auto *I = cast<Instruction>(U); 4356 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4357 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4358 }); 4359 if (!ScalarIndUpdate) 4360 continue; 4361 4362 // The induction variable and its update instruction will remain scalar. 
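    // Illustrative example (hypothetical source loop):
    //   for (i = 0; i < n; ++i) a[i] = b[i];
    // Here `i` and its update `i + 1` only feed address computations that are
    // themselves scalar, so both remain scalar. If the loop instead stored the
    // induction value itself (e.g. a[i] = i), the widened store would demand a
    // vector of lane values and the induction could not be kept scalar.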
4363 Worklist.insert(Ind); 4364 Worklist.insert(IndUpdate); 4365 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4366 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4367 << "\n"); 4368 } 4369 4370 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4371 } 4372 4373 bool LoopVectorizationCostModel::isScalarWithPredication( 4374 Instruction *I, ElementCount VF) const { 4375 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4376 return false; 4377 switch(I->getOpcode()) { 4378 default: 4379 break; 4380 case Instruction::Load: 4381 case Instruction::Store: { 4382 if (!Legal->isMaskRequired(I)) 4383 return false; 4384 auto *Ptr = getLoadStorePointerOperand(I); 4385 auto *Ty = getLoadStoreType(I); 4386 Type *VTy = Ty; 4387 if (VF.isVector()) 4388 VTy = VectorType::get(Ty, VF); 4389 const Align Alignment = getLoadStoreAlignment(I); 4390 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4391 TTI.isLegalMaskedGather(VTy, Alignment)) 4392 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4393 TTI.isLegalMaskedScatter(VTy, Alignment)); 4394 } 4395 case Instruction::UDiv: 4396 case Instruction::SDiv: 4397 case Instruction::SRem: 4398 case Instruction::URem: 4399 return mayDivideByZero(*I); 4400 } 4401 return false; 4402 } 4403 4404 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4405 Instruction *I, ElementCount VF) { 4406 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4407 assert(getWideningDecision(I, VF) == CM_Unknown && 4408 "Decision should not be set yet."); 4409 auto *Group = getInterleavedAccessGroup(I); 4410 assert(Group && "Must have a group."); 4411 4412 // If the instruction's allocated size doesn't equal it's type size, it 4413 // requires padding and will be scalarized. 4414 auto &DL = I->getModule()->getDataLayout(); 4415 auto *ScalarTy = getLoadStoreType(I); 4416 if (hasIrregularType(ScalarTy, DL)) 4417 return false; 4418 4419 // If the group involves a non-integral pointer, we may not be able to 4420 // losslessly cast all values to a common type. 4421 unsigned InterleaveFactor = Group->getFactor(); 4422 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); 4423 for (unsigned i = 0; i < InterleaveFactor; i++) { 4424 Instruction *Member = Group->getMember(i); 4425 if (!Member) 4426 continue; 4427 auto *MemberTy = getLoadStoreType(Member); 4428 bool MemberNI = DL.isNonIntegralPointerType(MemberTy); 4429 // Don't coerce non-integral pointers to integers or vice versa. 4430 if (MemberNI != ScalarNI) { 4431 // TODO: Consider adding special nullptr value case here 4432 return false; 4433 } else if (MemberNI && ScalarNI && 4434 ScalarTy->getPointerAddressSpace() != 4435 MemberTy->getPointerAddressSpace()) { 4436 return false; 4437 } 4438 } 4439 4440 // Check if masking is required. 4441 // A Group may need masking for one of two reasons: it resides in a block that 4442 // needs predication, or it was decided to use masking to deal with gaps 4443 // (either a gap at the end of a load-access that may result in a speculative 4444 // load, or any gaps in a store-access). 
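  // Example (hypothetical interleave group): loads of a[3*i] and a[3*i+1]
  // form a factor-3 group with a gap at the end, so the last wide load of the
  // loop may read past the final member; that is only allowed if a scalar
  // epilogue peels the last iteration or the access is masked. A store group
  // with a missing member always needs masking so the gap lanes are never
  // written.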
4445 bool PredicatedAccessRequiresMasking = 4446 blockNeedsPredicationForAnyReason(I->getParent()) && 4447 Legal->isMaskRequired(I); 4448 bool LoadAccessWithGapsRequiresEpilogMasking = 4449 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4450 !isScalarEpilogueAllowed(); 4451 bool StoreAccessWithGapsRequiresMasking = 4452 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4453 if (!PredicatedAccessRequiresMasking && 4454 !LoadAccessWithGapsRequiresEpilogMasking && 4455 !StoreAccessWithGapsRequiresMasking) 4456 return true; 4457 4458 // If masked interleaving is required, we expect that the user/target had 4459 // enabled it, because otherwise it either wouldn't have been created or 4460 // it should have been invalidated by the CostModel. 4461 assert(useMaskedInterleavedAccesses(TTI) && 4462 "Masked interleave-groups for predicated accesses are not enabled."); 4463 4464 if (Group->isReverse()) 4465 return false; 4466 4467 auto *Ty = getLoadStoreType(I); 4468 const Align Alignment = getLoadStoreAlignment(I); 4469 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4470 : TTI.isLegalMaskedStore(Ty, Alignment); 4471 } 4472 4473 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4474 Instruction *I, ElementCount VF) { 4475 // Get and ensure we have a valid memory instruction. 4476 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4477 4478 auto *Ptr = getLoadStorePointerOperand(I); 4479 auto *ScalarTy = getLoadStoreType(I); 4480 4481 // In order to be widened, the pointer should be consecutive, first of all. 4482 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4483 return false; 4484 4485 // If the instruction is a store located in a predicated block, it will be 4486 // scalarized. 4487 if (isScalarWithPredication(I, VF)) 4488 return false; 4489 4490 // If the instruction's allocated size doesn't equal it's type size, it 4491 // requires padding and will be scalarized. 4492 auto &DL = I->getModule()->getDataLayout(); 4493 if (hasIrregularType(ScalarTy, DL)) 4494 return false; 4495 4496 return true; 4497 } 4498 4499 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4500 // We should not collect Uniforms more than once per VF. Right now, 4501 // this function is called from collectUniformsAndScalars(), which 4502 // already does this check. Collecting Uniforms for VF=1 does not make any 4503 // sense. 4504 4505 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4506 "This function should not be visited twice for the same VF"); 4507 4508 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4509 // not analyze again. Uniforms.count(VF) will return 1. 4510 Uniforms[VF].clear(); 4511 4512 // We now know that the loop is vectorizable! 4513 // Collect instructions inside the loop that will remain uniform after 4514 // vectorization. 4515 4516 // Global values, params and instructions outside of current loop are out of 4517 // scope. 4518 auto isOutOfScope = [&](Value *V) -> bool { 4519 Instruction *I = dyn_cast<Instruction>(V); 4520 return (!I || !TheLoop->contains(I)); 4521 }; 4522 4523 // Worklist containing uniform instructions demanding lane 0. 4524 SetVector<Instruction *> Worklist; 4525 BasicBlock *Latch = TheLoop->getLoopLatch(); 4526 4527 // Add uniform instructions demanding lane 0 to the worklist. 
Instructions 4528 // that are scalar with predication must not be considered uniform after 4529 // vectorization, because that would create an erroneous replicating region 4530 // where only a single instance out of VF should be formed. 4531 // TODO: optimize such seldom cases if found important, see PR40816. 4532 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4533 if (isOutOfScope(I)) { 4534 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 4535 << *I << "\n"); 4536 return; 4537 } 4538 if (isScalarWithPredication(I, VF)) { 4539 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4540 << *I << "\n"); 4541 return; 4542 } 4543 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4544 Worklist.insert(I); 4545 }; 4546 4547 // Start with the conditional branch. If the branch condition is an 4548 // instruction contained in the loop that is only used by the branch, it is 4549 // uniform. 4550 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4551 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4552 addToWorklistIfAllowed(Cmp); 4553 4554 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 4555 InstWidening WideningDecision = getWideningDecision(I, VF); 4556 assert(WideningDecision != CM_Unknown && 4557 "Widening decision should be ready at this moment"); 4558 4559 // A uniform memory op is itself uniform. We exclude uniform stores 4560 // here as they demand the last lane, not the first one. 4561 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 4562 assert(WideningDecision == CM_Scalarize); 4563 return true; 4564 } 4565 4566 return (WideningDecision == CM_Widen || 4567 WideningDecision == CM_Widen_Reverse || 4568 WideningDecision == CM_Interleave); 4569 }; 4570 4571 4572 // Returns true if Ptr is the pointer operand of a memory access instruction 4573 // I, and I is known to not require scalarization. 4574 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4575 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4576 }; 4577 4578 // Holds a list of values which are known to have at least one uniform use. 4579 // Note that there may be other uses which aren't uniform. A "uniform use" 4580 // here is something which only demands lane 0 of the unrolled iterations; 4581 // it does not imply that all lanes produce the same value (e.g. this is not 4582 // the usual meaning of uniform) 4583 SetVector<Value *> HasUniformUse; 4584 4585 // Scan the loop for instructions which are either a) known to have only 4586 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 4587 for (auto *BB : TheLoop->blocks()) 4588 for (auto &I : *BB) { 4589 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 4590 switch (II->getIntrinsicID()) { 4591 case Intrinsic::sideeffect: 4592 case Intrinsic::experimental_noalias_scope_decl: 4593 case Intrinsic::assume: 4594 case Intrinsic::lifetime_start: 4595 case Intrinsic::lifetime_end: 4596 if (TheLoop->hasLoopInvariantOperands(&I)) 4597 addToWorklistIfAllowed(&I); 4598 break; 4599 default: 4600 break; 4601 } 4602 } 4603 4604 // ExtractValue instructions must be uniform, because the operands are 4605 // known to be loop-invariant. 
4606 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 4607 assert(isOutOfScope(EVI->getAggregateOperand()) && 4608 "Expected aggregate value to be loop invariant"); 4609 addToWorklistIfAllowed(EVI); 4610 continue; 4611 } 4612 4613 // If there's no pointer operand, there's nothing to do. 4614 auto *Ptr = getLoadStorePointerOperand(&I); 4615 if (!Ptr) 4616 continue; 4617 4618 // A uniform memory op is itself uniform. We exclude uniform stores 4619 // here as they demand the last lane, not the first one. 4620 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 4621 addToWorklistIfAllowed(&I); 4622 4623 if (isUniformDecision(&I, VF)) { 4624 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 4625 HasUniformUse.insert(Ptr); 4626 } 4627 } 4628 4629 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 4630 // demanding) users. Since loops are assumed to be in LCSSA form, this 4631 // disallows uses outside the loop as well. 4632 for (auto *V : HasUniformUse) { 4633 if (isOutOfScope(V)) 4634 continue; 4635 auto *I = cast<Instruction>(V); 4636 auto UsersAreMemAccesses = 4637 llvm::all_of(I->users(), [&](User *U) -> bool { 4638 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 4639 }); 4640 if (UsersAreMemAccesses) 4641 addToWorklistIfAllowed(I); 4642 } 4643 4644 // Expand Worklist in topological order: whenever a new instruction 4645 // is added , its users should be already inside Worklist. It ensures 4646 // a uniform instruction will only be used by uniform instructions. 4647 unsigned idx = 0; 4648 while (idx != Worklist.size()) { 4649 Instruction *I = Worklist[idx++]; 4650 4651 for (auto OV : I->operand_values()) { 4652 // isOutOfScope operands cannot be uniform instructions. 4653 if (isOutOfScope(OV)) 4654 continue; 4655 // First order recurrence Phi's should typically be considered 4656 // non-uniform. 4657 auto *OP = dyn_cast<PHINode>(OV); 4658 if (OP && Legal->isFirstOrderRecurrence(OP)) 4659 continue; 4660 // If all the users of the operand are uniform, then add the 4661 // operand into the uniform worklist. 4662 auto *OI = cast<Instruction>(OV); 4663 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4664 auto *J = cast<Instruction>(U); 4665 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 4666 })) 4667 addToWorklistIfAllowed(OI); 4668 } 4669 } 4670 4671 // For an instruction to be added into Worklist above, all its users inside 4672 // the loop should also be in Worklist. However, this condition cannot be 4673 // true for phi nodes that form a cyclic dependence. We must process phi 4674 // nodes separately. An induction variable will remain uniform if all users 4675 // of the induction variable and induction variable update remain uniform. 4676 // The code below handles both pointer and non-pointer induction variables. 4677 for (auto &Induction : Legal->getInductionVars()) { 4678 auto *Ind = Induction.first; 4679 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4680 4681 // Determine if all users of the induction variable are uniform after 4682 // vectorization. 4683 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4684 auto *I = cast<Instruction>(U); 4685 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4686 isVectorizedMemAccessUse(I, Ind); 4687 }); 4688 if (!UniformInd) 4689 continue; 4690 4691 // Determine if all users of the induction variable update instruction are 4692 // uniform after vectorization. 
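    // For example (hypothetical): if `i` is only used by its own increment and
    // to form the addresses of consecutive loads/stores such as a[i], then
    // only lane 0 of each unrolled part is ever demanded and both `i` and
    // `i + 1` can stay uniform. A use like `b[i] = i`, which needs every
    // lane's value, would keep the induction out of the uniform set.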
4693 auto UniformIndUpdate = 4694 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4695 auto *I = cast<Instruction>(U); 4696 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4697 isVectorizedMemAccessUse(I, IndUpdate); 4698 }); 4699 if (!UniformIndUpdate) 4700 continue; 4701 4702 // The induction variable and its update instruction will remain uniform. 4703 addToWorklistIfAllowed(Ind); 4704 addToWorklistIfAllowed(IndUpdate); 4705 } 4706 4707 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4708 } 4709 4710 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4711 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4712 4713 if (Legal->getRuntimePointerChecking()->Need) { 4714 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4715 "runtime pointer checks needed. Enable vectorization of this " 4716 "loop with '#pragma clang loop vectorize(enable)' when " 4717 "compiling with -Os/-Oz", 4718 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4719 return true; 4720 } 4721 4722 if (!PSE.getPredicate().isAlwaysTrue()) { 4723 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4724 "runtime SCEV checks needed. Enable vectorization of this " 4725 "loop with '#pragma clang loop vectorize(enable)' when " 4726 "compiling with -Os/-Oz", 4727 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4728 return true; 4729 } 4730 4731 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4732 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4733 reportVectorizationFailure("Runtime stride check for small trip count", 4734 "runtime stride == 1 checks needed. Enable vectorization of " 4735 "this loop without such check by compiling with -Os/-Oz", 4736 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4737 return true; 4738 } 4739 4740 return false; 4741 } 4742 4743 ElementCount 4744 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 4745 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 4746 return ElementCount::getScalable(0); 4747 4748 if (Hints->isScalableVectorizationDisabled()) { 4749 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 4750 "ScalableVectorizationDisabled", ORE, TheLoop); 4751 return ElementCount::getScalable(0); 4752 } 4753 4754 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 4755 4756 auto MaxScalableVF = ElementCount::getScalable( 4757 std::numeric_limits<ElementCount::ScalarTy>::max()); 4758 4759 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 4760 // FIXME: While for scalable vectors this is currently sufficient, this should 4761 // be replaced by a more detailed mechanism that filters out specific VFs, 4762 // instead of invalidating vectorization for a whole set of VFs based on the 4763 // MaxVF. 4764 4765 // Disable scalable vectorization if the loop contains unsupported reductions. 4766 if (!canVectorizeReductions(MaxScalableVF)) { 4767 reportVectorizationInfo( 4768 "Scalable vectorization not supported for the reduction " 4769 "operations found in this loop.", 4770 "ScalableVFUnfeasible", ORE, TheLoop); 4771 return ElementCount::getScalable(0); 4772 } 4773 4774 // Disable scalable vectorization if the loop contains any instructions 4775 // with element types not supported for scalable vectors. 
4776 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 4777 return !Ty->isVoidTy() && 4778 !this->TTI.isElementTypeLegalForScalableVector(Ty); 4779 })) { 4780 reportVectorizationInfo("Scalable vectorization is not supported " 4781 "for all element types found in this loop.", 4782 "ScalableVFUnfeasible", ORE, TheLoop); 4783 return ElementCount::getScalable(0); 4784 } 4785 4786 if (Legal->isSafeForAnyVectorWidth()) 4787 return MaxScalableVF; 4788 4789 // Limit MaxScalableVF by the maximum safe dependence distance. 4790 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 4791 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 4792 MaxVScale = 4793 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 4794 MaxScalableVF = ElementCount::getScalable( 4795 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 4796 if (!MaxScalableVF) 4797 reportVectorizationInfo( 4798 "Max legal vector width too small, scalable vectorization " 4799 "unfeasible.", 4800 "ScalableVFUnfeasible", ORE, TheLoop); 4801 4802 return MaxScalableVF; 4803 } 4804 4805 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 4806 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 4807 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4808 unsigned SmallestType, WidestType; 4809 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4810 4811 // Get the maximum safe dependence distance in bits computed by LAA. 4812 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4813 // the memory accesses that is most restrictive (involved in the smallest 4814 // dependence distance). 4815 unsigned MaxSafeElements = 4816 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 4817 4818 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 4819 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 4820 4821 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 4822 << ".\n"); 4823 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 4824 << ".\n"); 4825 4826 // First analyze the UserVF, fall back if the UserVF should be ignored. 4827 if (UserVF) { 4828 auto MaxSafeUserVF = 4829 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 4830 4831 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 4832 // If `VF=vscale x N` is safe, then so is `VF=N` 4833 if (UserVF.isScalable()) 4834 return FixedScalableVFPair( 4835 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 4836 else 4837 return UserVF; 4838 } 4839 4840 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 4841 4842 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 4843 // is better to ignore the hint and let the compiler choose a suitable VF. 
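    // E.g. (hypothetical values): with MaxSafeFixedVF = 8, a fixed UserVF of
    // 16 is clamped to 8 below, whereas an unsafe scalable hint such as
    // `vscale x 16` is dropped entirely and the compiler picks the VF itself.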
4844 if (!UserVF.isScalable()) { 4845 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4846 << " is unsafe, clamping to max safe VF=" 4847 << MaxSafeFixedVF << ".\n"); 4848 ORE->emit([&]() { 4849 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4850 TheLoop->getStartLoc(), 4851 TheLoop->getHeader()) 4852 << "User-specified vectorization factor " 4853 << ore::NV("UserVectorizationFactor", UserVF) 4854 << " is unsafe, clamping to maximum safe vectorization factor " 4855 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 4856 }); 4857 return MaxSafeFixedVF; 4858 } 4859 4860 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 4861 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4862 << " is ignored because scalable vectors are not " 4863 "available.\n"); 4864 ORE->emit([&]() { 4865 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4866 TheLoop->getStartLoc(), 4867 TheLoop->getHeader()) 4868 << "User-specified vectorization factor " 4869 << ore::NV("UserVectorizationFactor", UserVF) 4870 << " is ignored because the target does not support scalable " 4871 "vectors. The compiler will pick a more suitable value."; 4872 }); 4873 } else { 4874 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4875 << " is unsafe. Ignoring scalable UserVF.\n"); 4876 ORE->emit([&]() { 4877 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4878 TheLoop->getStartLoc(), 4879 TheLoop->getHeader()) 4880 << "User-specified vectorization factor " 4881 << ore::NV("UserVectorizationFactor", UserVF) 4882 << " is unsafe. Ignoring the hint to let the compiler pick a " 4883 "more suitable value."; 4884 }); 4885 } 4886 } 4887 4888 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 4889 << " / " << WidestType << " bits.\n"); 4890 4891 FixedScalableVFPair Result(ElementCount::getFixed(1), 4892 ElementCount::getScalable(0)); 4893 if (auto MaxVF = 4894 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 4895 MaxSafeFixedVF, FoldTailByMasking)) 4896 Result.FixedVF = MaxVF; 4897 4898 if (auto MaxVF = 4899 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 4900 MaxSafeScalableVF, FoldTailByMasking)) 4901 if (MaxVF.isScalable()) { 4902 Result.ScalableVF = MaxVF; 4903 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 4904 << "\n"); 4905 } 4906 4907 return Result; 4908 } 4909 4910 FixedScalableVFPair 4911 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 4912 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4913 // TODO: It may by useful to do since it's still likely to be dynamically 4914 // uniform if the target can skip. 4915 reportVectorizationFailure( 4916 "Not inserting runtime ptr check for divergent target", 4917 "runtime pointer checks needed. 
Not enabled for divergent target", 4918 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4919 return FixedScalableVFPair::getNone(); 4920 } 4921 4922 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4923 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4924 if (TC == 1) { 4925 reportVectorizationFailure("Single iteration (non) loop", 4926 "loop trip count is one, irrelevant for vectorization", 4927 "SingleIterationLoop", ORE, TheLoop); 4928 return FixedScalableVFPair::getNone(); 4929 } 4930 4931 switch (ScalarEpilogueStatus) { 4932 case CM_ScalarEpilogueAllowed: 4933 return computeFeasibleMaxVF(TC, UserVF, false); 4934 case CM_ScalarEpilogueNotAllowedUsePredicate: 4935 LLVM_FALLTHROUGH; 4936 case CM_ScalarEpilogueNotNeededUsePredicate: 4937 LLVM_DEBUG( 4938 dbgs() << "LV: vector predicate hint/switch found.\n" 4939 << "LV: Not allowing scalar epilogue, creating predicated " 4940 << "vector loop.\n"); 4941 break; 4942 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4943 // fallthrough as a special case of OptForSize 4944 case CM_ScalarEpilogueNotAllowedOptSize: 4945 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4946 LLVM_DEBUG( 4947 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4948 else 4949 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4950 << "count.\n"); 4951 4952 // Bail if runtime checks are required, which are not good when optimising 4953 // for size. 4954 if (runtimeChecksRequired()) 4955 return FixedScalableVFPair::getNone(); 4956 4957 break; 4958 } 4959 4960 // The only loops we can vectorize without a scalar epilogue are loops with 4961 // a bottom-test and a single exiting block. We'd have to handle the fact 4962 // that not every instruction executes on the last iteration. This will 4963 // require a lane mask which varies through the vector loop body. (TODO) 4964 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 4965 // If there was a tail-folding hint/switch, but we can't fold the tail by 4966 // masking, fall back to a vectorization with a scalar epilogue. 4967 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 4968 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 4969 "scalar epilogue instead.\n"); 4970 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 4971 return computeFeasibleMaxVF(TC, UserVF, false); 4972 } 4973 return FixedScalableVFPair::getNone(); 4974 } 4975 4976 // Now try the tail folding 4977 4978 // Invalidate interleave groups that require an epilogue if we can't mask 4979 // the interleave-group. 4980 if (!useMaskedInterleavedAccesses(TTI)) { 4981 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 4982 "No decisions should have been taken at this point"); 4983 // Note: There is no need to invalidate any cost modeling decisions here, as 4984 // none were taken so far. 4985 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 4986 } 4987 4988 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true); 4989 // Avoid tail folding if the trip count is known to be a multiple of any VF 4990 // we chose. 4991 // FIXME: The condition below pessimises the case for fixed-width vectors, 4992 // when scalable VFs are also candidates for vectorization.
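  // Worked example (hypothetical numbers): with MaxFixedVF = 8 and UserIC = 2,
  // a trip count that is provably a multiple of 16 leaves no scalar tail, so
  // the computed factors are returned as-is and tail folding is skipped.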
4993 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 4994 ElementCount MaxFixedVF = MaxFactors.FixedVF; 4995 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 4996 "MaxFixedVF must be a power of 2"); 4997 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 4998 : MaxFixedVF.getFixedValue(); 4999 ScalarEvolution *SE = PSE.getSE(); 5000 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5001 const SCEV *ExitCount = SE->getAddExpr( 5002 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5003 const SCEV *Rem = SE->getURemExpr( 5004 SE->applyLoopGuards(ExitCount, TheLoop), 5005 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5006 if (Rem->isZero()) { 5007 // Accept MaxFixedVF if we do not have a tail. 5008 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5009 return MaxFactors; 5010 } 5011 } 5012 5013 // If we don't know the precise trip count, or if the trip count that we 5014 // found modulo the vectorization factor is not zero, try to fold the tail 5015 // by masking. 5016 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5017 if (Legal->prepareToFoldTailByMasking()) { 5018 FoldTailByMasking = true; 5019 return MaxFactors; 5020 } 5021 5022 // If there was a tail-folding hint/switch, but we can't fold the tail by 5023 // masking, fallback to a vectorization with a scalar epilogue. 5024 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5025 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5026 "scalar epilogue instead.\n"); 5027 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5028 return MaxFactors; 5029 } 5030 5031 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5032 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5033 return FixedScalableVFPair::getNone(); 5034 } 5035 5036 if (TC == 0) { 5037 reportVectorizationFailure( 5038 "Unable to calculate the loop count due to complex control flow", 5039 "unable to calculate the loop count due to complex control flow", 5040 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5041 return FixedScalableVFPair::getNone(); 5042 } 5043 5044 reportVectorizationFailure( 5045 "Cannot optimize for size and vectorize at the same time.", 5046 "cannot optimize for size and vectorize at the same time. " 5047 "Enable vectorization of this loop with '#pragma clang loop " 5048 "vectorize(enable)' when compiling with -Os/-Oz", 5049 "NoTailLoopWithOptForSize", ORE, TheLoop); 5050 return FixedScalableVFPair::getNone(); 5051 } 5052 5053 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5054 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5055 ElementCount MaxSafeVF, bool FoldTailByMasking) { 5056 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5057 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5058 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5059 : TargetTransformInfo::RGK_FixedWidthVector); 5060 5061 // Convenience function to return the minimum of two ElementCounts. 5062 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5063 assert((LHS.isScalable() == RHS.isScalable()) && 5064 "Scalable flags must match"); 5065 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5066 }; 5067 5068 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5069 // Note that both WidestRegister and WidestType may not be a powers of 2. 
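  // For instance (hypothetical target): with 128-bit vector registers and a
  // widest element type of 32 bits this gives 4 lanes (or `vscale x 4` when a
  // scalable maximum is being computed), which is then further limited by
  // MaxSafeVF.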
5070 auto MaxVectorElementCount = ElementCount::get( 5071 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5072 ComputeScalableMaxVF); 5073 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5074 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5075 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5076 5077 if (!MaxVectorElementCount) { 5078 LLVM_DEBUG(dbgs() << "LV: The target has no " 5079 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5080 << " vector registers.\n"); 5081 return ElementCount::getFixed(1); 5082 } 5083 5084 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5085 if (ConstTripCount && 5086 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5087 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5088 // If loop trip count (TC) is known at compile time there is no point in 5089 // choosing VF greater than TC (as done in the loop below). Select maximum 5090 // power of two which doesn't exceed TC. 5091 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5092 // when the TC is less than or equal to the known number of lanes. 5093 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5094 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5095 "exceeding the constant trip count: " 5096 << ClampedConstTripCount << "\n"); 5097 return ElementCount::getFixed(ClampedConstTripCount); 5098 } 5099 5100 TargetTransformInfo::RegisterKind RegKind = 5101 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5102 : TargetTransformInfo::RGK_FixedWidthVector; 5103 ElementCount MaxVF = MaxVectorElementCount; 5104 if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && 5105 TTI.shouldMaximizeVectorBandwidth(RegKind))) { 5106 auto MaxVectorElementCountMaxBW = ElementCount::get( 5107 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5108 ComputeScalableMaxVF); 5109 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5110 5111 // Collect all viable vectorization factors larger than the default MaxVF 5112 // (i.e. MaxVectorElementCount). 5113 SmallVector<ElementCount, 8> VFs; 5114 for (ElementCount VS = MaxVectorElementCount * 2; 5115 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5116 VFs.push_back(VS); 5117 5118 // For each VF calculate its register usage. 5119 auto RUs = calculateRegisterUsage(VFs); 5120 5121 // Select the largest VF which doesn't require more registers than existing 5122 // ones. 5123 for (int i = RUs.size() - 1; i >= 0; --i) { 5124 bool Selected = true; 5125 for (auto &pair : RUs[i].MaxLocalUsers) { 5126 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5127 if (pair.second > TargetNumRegisters) 5128 Selected = false; 5129 } 5130 if (Selected) { 5131 MaxVF = VFs[i]; 5132 break; 5133 } 5134 } 5135 if (ElementCount MinVF = 5136 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5137 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5138 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5139 << ") with target's minimum: " << MinVF << '\n'); 5140 MaxVF = MinVF; 5141 } 5142 } 5143 5144 // Invalidate any widening decisions we might have made, in case the loop 5145 // requires predication (decided later), but we have already made some 5146 // load/store widening decisions.
5147 invalidateCostModelingDecisions(); 5148 } 5149 return MaxVF; 5150 } 5151 5152 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { 5153 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5154 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5155 auto Min = Attr.getVScaleRangeMin(); 5156 auto Max = Attr.getVScaleRangeMax(); 5157 if (Max && Min == Max) 5158 return Max; 5159 } 5160 5161 return TTI.getVScaleForTuning(); 5162 } 5163 5164 bool LoopVectorizationCostModel::isMoreProfitable( 5165 const VectorizationFactor &A, const VectorizationFactor &B) const { 5166 InstructionCost CostA = A.Cost; 5167 InstructionCost CostB = B.Cost; 5168 5169 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5170 5171 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5172 MaxTripCount) { 5173 // If we are folding the tail and the trip count is a known (possibly small) 5174 // constant, the trip count will be rounded up to an integer number of 5175 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5176 // which we compare directly. When not folding the tail, the total cost will 5177 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5178 // approximated with the per-lane cost below instead of using the tripcount 5179 // as here. 5180 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5181 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5182 return RTCostA < RTCostB; 5183 } 5184 5185 // Improve estimate for the vector width if it is scalable. 5186 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5187 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5188 if (Optional<unsigned> VScale = getVScaleForTuning()) { 5189 if (A.Width.isScalable()) 5190 EstimatedWidthA *= VScale.getValue(); 5191 if (B.Width.isScalable()) 5192 EstimatedWidthB *= VScale.getValue(); 5193 } 5194 5195 // Assume vscale may be larger than 1 (or the value being tuned for), 5196 // so that scalable vectorization is slightly favorable over fixed-width 5197 // vectorization. 5198 if (A.Width.isScalable() && !B.Width.isScalable()) 5199 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5200 5201 // To avoid the need for FP division: 5202 // (CostA / A.Width) < (CostB / B.Width) 5203 // <=> (CostA * B.Width) < (CostB * A.Width) 5204 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5205 } 5206 5207 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5208 const ElementCountSet &VFCandidates) { 5209 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5210 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5211 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5212 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5213 "Expected Scalar VF to be a candidate"); 5214 5215 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, 5216 ExpectedCost); 5217 VectorizationFactor ChosenFactor = ScalarCost; 5218 5219 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5220 if (ForceVectorization && VFCandidates.size() > 1) { 5221 // Ignore scalar width, because the user explicitly wants vectorization. 5222 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5223 // evaluation. 
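  // Rough illustration of the per-lane comparison used in the loop below
  // (hypothetical costs): a candidate costing 16 per vector iteration at VF=8
  // beats one costing 10 at VF=4, because 16 * 4 < 10 * 8; isMoreProfitable()
  // uses this cross-multiplication rather than dividing costs by the width.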
5224 ChosenFactor.Cost = InstructionCost::getMax(); 5225 } 5226 5227 SmallVector<InstructionVFPair> InvalidCosts; 5228 for (const auto &i : VFCandidates) { 5229 // The cost for scalar VF=1 is already calculated, so ignore it. 5230 if (i.isScalar()) 5231 continue; 5232 5233 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5234 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); 5235 5236 #ifndef NDEBUG 5237 unsigned AssumedMinimumVscale = 1; 5238 if (Optional<unsigned> VScale = getVScaleForTuning()) 5239 AssumedMinimumVscale = *VScale; 5240 unsigned Width = 5241 Candidate.Width.isScalable() 5242 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5243 : Candidate.Width.getFixedValue(); 5244 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5245 << " costs: " << (Candidate.Cost / Width)); 5246 if (i.isScalable()) 5247 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5248 << AssumedMinimumVscale << ")"); 5249 LLVM_DEBUG(dbgs() << ".\n"); 5250 #endif 5251 5252 if (!C.second && !ForceVectorization) { 5253 LLVM_DEBUG( 5254 dbgs() << "LV: Not considering vector loop of width " << i 5255 << " because it will not generate any vector instructions.\n"); 5256 continue; 5257 } 5258 5259 // If profitable add it to ProfitableVF list. 5260 if (isMoreProfitable(Candidate, ScalarCost)) 5261 ProfitableVFs.push_back(Candidate); 5262 5263 if (isMoreProfitable(Candidate, ChosenFactor)) 5264 ChosenFactor = Candidate; 5265 } 5266 5267 // Emit a report of VFs with invalid costs in the loop. 5268 if (!InvalidCosts.empty()) { 5269 // Group the remarks per instruction, keeping the instruction order from 5270 // InvalidCosts. 5271 std::map<Instruction *, unsigned> Numbering; 5272 unsigned I = 0; 5273 for (auto &Pair : InvalidCosts) 5274 if (!Numbering.count(Pair.first)) 5275 Numbering[Pair.first] = I++; 5276 5277 // Sort the list, first on instruction(number) then on VF. 5278 llvm::sort(InvalidCosts, 5279 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5280 if (Numbering[A.first] != Numbering[B.first]) 5281 return Numbering[A.first] < Numbering[B.first]; 5282 ElementCountComparator ECC; 5283 return ECC(A.second, B.second); 5284 }); 5285 5286 // For a list of ordered instruction-vf pairs: 5287 // [(load, vf1), (load, vf2), (store, vf1)] 5288 // Group the instructions together to emit separate remarks for: 5289 // load (vf1, vf2) 5290 // store (vf1) 5291 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5292 auto Subset = ArrayRef<InstructionVFPair>(); 5293 do { 5294 if (Subset.empty()) 5295 Subset = Tail.take_front(1); 5296 5297 Instruction *I = Subset.front().first; 5298 5299 // If the next instruction is different, or if there are no other pairs, 5300 // emit a remark for the collated subset. e.g. 5301 // [(load, vf1), (load, vf2))] 5302 // to emit: 5303 // remark: invalid costs for 'load' at VF=(vf, vf2) 5304 if (Subset == Tail || Tail[Subset.size()].first != I) { 5305 std::string OutString; 5306 raw_string_ostream OS(OutString); 5307 assert(!Subset.empty() && "Unexpected empty range"); 5308 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5309 for (auto &Pair : Subset) 5310 OS << (Pair.second == Subset.front().second ? 
"" : ", ") 5311 << Pair.second; 5312 OS << "):"; 5313 if (auto *CI = dyn_cast<CallInst>(I)) 5314 OS << " call to " << CI->getCalledFunction()->getName(); 5315 else 5316 OS << " " << I->getOpcodeName(); 5317 OS.flush(); 5318 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5319 Tail = Tail.drop_front(Subset.size()); 5320 Subset = {}; 5321 } else 5322 // Grow the subset by one element 5323 Subset = Tail.take_front(Subset.size() + 1); 5324 } while (!Tail.empty()); 5325 } 5326 5327 if (!EnableCondStoresVectorization && NumPredStores) { 5328 reportVectorizationFailure("There are conditional stores.", 5329 "store that is conditionally executed prevents vectorization", 5330 "ConditionalStore", ORE, TheLoop); 5331 ChosenFactor = ScalarCost; 5332 } 5333 5334 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5335 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 5336 << "LV: Vectorization seems to be not beneficial, " 5337 << "but was forced by a user.\n"); 5338 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5339 return ChosenFactor; 5340 } 5341 5342 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5343 const Loop &L, ElementCount VF) const { 5344 // Cross iteration phis such as reductions need special handling and are 5345 // currently unsupported. 5346 if (any_of(L.getHeader()->phis(), 5347 [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); })) 5348 return false; 5349 5350 // Phis with uses outside of the loop require special handling and are 5351 // currently unsupported. 5352 for (auto &Entry : Legal->getInductionVars()) { 5353 // Look for uses of the value of the induction at the last iteration. 5354 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5355 for (User *U : PostInc->users()) 5356 if (!L.contains(cast<Instruction>(U))) 5357 return false; 5358 // Look for uses of penultimate value of the induction. 5359 for (User *U : Entry.first->users()) 5360 if (!L.contains(cast<Instruction>(U))) 5361 return false; 5362 } 5363 5364 // Induction variables that are widened require special handling that is 5365 // currently not supported. 5366 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5367 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5368 this->isProfitableToScalarize(Entry.first, VF)); 5369 })) 5370 return false; 5371 5372 // Epilogue vectorization code has not been auditted to ensure it handles 5373 // non-latch exits properly. It may be fine, but it needs auditted and 5374 // tested. 5375 if (L.getExitingBlock() != L.getLoopLatch()) 5376 return false; 5377 5378 return true; 5379 } 5380 5381 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5382 const ElementCount VF) const { 5383 // FIXME: We need a much better cost-model to take different parameters such 5384 // as register pressure, code size increase and cost of extra branches into 5385 // account. For now we apply a very crude heuristic and only consider loops 5386 // with vectorization factors larger than a certain value. 5387 // We also consider epilogue vectorization unprofitable for targets that don't 5388 // consider interleaving beneficial (eg. MVE). 5389 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5390 return false; 5391 // FIXME: We should consider changing the threshold for scalable 5392 // vectors to take VScaleForTuning into account. 
5393 if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF) 5394 return true; 5395 return false; 5396 } 5397 5398 VectorizationFactor 5399 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5400 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5401 VectorizationFactor Result = VectorizationFactor::Disabled(); 5402 if (!EnableEpilogueVectorization) { 5403 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5404 return Result; 5405 } 5406 5407 if (!isScalarEpilogueAllowed()) { 5408 LLVM_DEBUG( 5409 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5410 "allowed.\n";); 5411 return Result; 5412 } 5413 5414 // Not really a cost consideration, but check for unsupported cases here to 5415 // simplify the logic. 5416 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5417 LLVM_DEBUG( 5418 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5419 "not a supported candidate.\n";); 5420 return Result; 5421 } 5422 5423 if (EpilogueVectorizationForceVF > 1) { 5424 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5425 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5426 if (LVP.hasPlanWithVF(ForcedEC)) 5427 return {ForcedEC, 0, 0}; 5428 else { 5429 LLVM_DEBUG( 5430 dbgs() 5431 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5432 return Result; 5433 } 5434 } 5435 5436 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5437 TheLoop->getHeader()->getParent()->hasMinSize()) { 5438 LLVM_DEBUG( 5439 dbgs() 5440 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5441 return Result; 5442 } 5443 5444 if (!isEpilogueVectorizationProfitable(MainLoopVF)) { 5445 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5446 "this loop\n"); 5447 return Result; 5448 } 5449 5450 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 5451 // the main loop handles 8 lanes per iteration. We could still benefit from 5452 // vectorizing the epilogue loop with VF=4. 5453 ElementCount EstimatedRuntimeVF = MainLoopVF; 5454 if (MainLoopVF.isScalable()) { 5455 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5456 if (Optional<unsigned> VScale = getVScaleForTuning()) 5457 EstimatedRuntimeVF *= *VScale; 5458 } 5459 5460 for (auto &NextVF : ProfitableVFs) 5461 if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5462 ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || 5463 ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && 5464 (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && 5465 LVP.hasPlanWithVF(NextVF.Width)) 5466 Result = NextVF; 5467 5468 if (Result != VectorizationFactor::Disabled()) 5469 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5470 << Result.Width << "\n";); 5471 return Result; 5472 } 5473 5474 std::pair<unsigned, unsigned> 5475 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5476 unsigned MinWidth = -1U; 5477 unsigned MaxWidth = 8; 5478 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5479 // For in-loop reductions, no element types are added to ElementTypesInLoop 5480 // if there are no loads/stores in the loop. In this case, check through the 5481 // reduction variables to determine the maximum width. 5482 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5483 // Reset MaxWidth so that we can find the smallest type used by recurrences 5484 // in the loop. 
5485 MaxWidth = -1U; 5486 for (auto &PhiDescriptorPair : Legal->getReductionVars()) { 5487 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5488 // When finding the min width used by the recurrence we need to account 5489 // for casts on the input operands of the recurrence. 5490 MaxWidth = std::min<unsigned>( 5491 MaxWidth, std::min<unsigned>( 5492 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5493 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5494 } 5495 } else { 5496 for (Type *T : ElementTypesInLoop) { 5497 MinWidth = std::min<unsigned>( 5498 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5499 MaxWidth = std::max<unsigned>( 5500 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5501 } 5502 } 5503 return {MinWidth, MaxWidth}; 5504 } 5505 5506 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5507 ElementTypesInLoop.clear(); 5508 // For each block. 5509 for (BasicBlock *BB : TheLoop->blocks()) { 5510 // For each instruction in the loop. 5511 for (Instruction &I : BB->instructionsWithoutDebug()) { 5512 Type *T = I.getType(); 5513 5514 // Skip ignored values. 5515 if (ValuesToIgnore.count(&I)) 5516 continue; 5517 5518 // Only examine Loads, Stores and PHINodes. 5519 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5520 continue; 5521 5522 // Examine PHI nodes that are reduction variables. Update the type to 5523 // account for the recurrence type. 5524 if (auto *PN = dyn_cast<PHINode>(&I)) { 5525 if (!Legal->isReductionVariable(PN)) 5526 continue; 5527 const RecurrenceDescriptor &RdxDesc = 5528 Legal->getReductionVars().find(PN)->second; 5529 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5530 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5531 RdxDesc.getRecurrenceType(), 5532 TargetTransformInfo::ReductionFlags())) 5533 continue; 5534 T = RdxDesc.getRecurrenceType(); 5535 } 5536 5537 // Examine the stored values. 5538 if (auto *ST = dyn_cast<StoreInst>(&I)) 5539 T = ST->getValueOperand()->getType(); 5540 5541 assert(T->isSized() && 5542 "Expected the load/store/recurrence type to be sized"); 5543 5544 ElementTypesInLoop.insert(T); 5545 } 5546 } 5547 } 5548 5549 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5550 unsigned LoopCost) { 5551 // -- The interleave heuristics -- 5552 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5553 // There are many micro-architectural considerations that we can't predict 5554 // at this level. For example, frontend pressure (on decode or fetch) due to 5555 // code size, or the number and capabilities of the execution ports. 5556 // 5557 // We use the following heuristics to select the interleave count: 5558 // 1. If the code has reductions, then we interleave to break the cross 5559 // iteration dependency. 5560 // 2. If the loop is really small, then we interleave to reduce the loop 5561 // overhead. 5562 // 3. We don't interleave if we think that we will spill registers to memory 5563 // due to the increased register pressure. 5564 5565 if (!isScalarEpilogueAllowed()) 5566 return 1; 5567 5568 // We used the distance for the interleave count. 5569 if (Legal->getMaxSafeDepDistBytes() != -1U) 5570 return 1; 5571 5572 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5573 const bool HasReductions = !Legal->getReductionVars().empty(); 5574 // Do not interleave loops with a relatively small known or estimated trip 5575 // count. 
But we will interleave when InterleaveSmallLoopScalarReduction is 5576 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5577 // because with the above conditions interleaving can expose ILP and break 5578 // cross iteration dependences for reductions. 5579 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5580 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5581 return 1; 5582 5583 // If we did not calculate the cost for VF (because the user selected the VF) 5584 // then we calculate the cost of VF here. 5585 if (LoopCost == 0) { 5586 InstructionCost C = expectedCost(VF).first; 5587 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 5588 LoopCost = *C.getValue(); 5589 5590 // Loop body is free and there is no need for interleaving. 5591 if (LoopCost == 0) 5592 return 1; 5593 } 5594 5595 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5596 // We divide by these constants so assume that we have at least one 5597 // instruction that uses at least one register. 5598 for (auto& pair : R.MaxLocalUsers) { 5599 pair.second = std::max(pair.second, 1U); 5600 } 5601 5602 // We calculate the interleave count using the following formula. 5603 // Subtract the number of loop invariants from the number of available 5604 // registers. These registers are used by all of the interleaved instances. 5605 // Next, divide the remaining registers by the number of registers that is 5606 // required by the loop, in order to estimate how many parallel instances 5607 // fit without causing spills. All of this is rounded down if necessary to be 5608 // a power of two. We want power of two interleave count to simplify any 5609 // addressing operations or alignment considerations. 5610 // We also want power of two interleave counts to ensure that the induction 5611 // variable of the vector loop wraps to zero, when tail is folded by masking; 5612 // this currently happens when OptForSize, in which case IC is set to 1 above. 5613 unsigned IC = UINT_MAX; 5614 5615 for (auto& pair : R.MaxLocalUsers) { 5616 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5617 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5618 << " registers of " 5619 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5620 if (VF.isScalar()) { 5621 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5622 TargetNumRegisters = ForceTargetNumScalarRegs; 5623 } else { 5624 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5625 TargetNumRegisters = ForceTargetNumVectorRegs; 5626 } 5627 unsigned MaxLocalUsers = pair.second; 5628 unsigned LoopInvariantRegs = 0; 5629 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5630 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5631 5632 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5633 // Don't count the induction variable as interleaved. 5634 if (EnableIndVarRegisterHeur) { 5635 TmpIC = 5636 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5637 std::max(1U, (MaxLocalUsers - 1))); 5638 } 5639 5640 IC = std::min(IC, TmpIC); 5641 } 5642 5643 // Clamp the interleave ranges to reasonable counts. 5644 unsigned MaxInterleaveCount = 5645 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 5646 5647 // Check if the user has overridden the max. 
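  // Illustrative example (assumed register counts, not from the original
  // source): with 32 vector registers, 2 of them tied up by loop-invariant
  // values and at most 6 simultaneously live in-loop values in that class,
  // the loop above yields IC = PowerOf2Floor((32 - 2) / 6) = 4 (the
  // induction-variable heuristic, PowerOf2Floor((32 - 2 - 1) / 5), gives the
  // same value). The forced factors and the trip-count clamp below can only
  // reduce this further.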
5648 if (VF.isScalar()) { 5649 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5650 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5651 } else { 5652 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5653 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5654 } 5655 5656 // If trip count is known or estimated compile time constant, limit the 5657 // interleave count to be less than the trip count divided by VF, provided it 5658 // is at least 1. 5659 // 5660 // For scalable vectors we can't know if interleaving is beneficial. It may 5661 // not be beneficial for small loops if none of the lanes in the second vector 5662 // iterations is enabled. However, for larger loops, there is likely to be a 5663 // similar benefit as for fixed-width vectors. For now, we choose to leave 5664 // the InterleaveCount as if vscale is '1', although if some information about 5665 // the vector is known (e.g. min vector size), we can make a better decision. 5666 if (BestKnownTC) { 5667 MaxInterleaveCount = 5668 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5669 // Make sure MaxInterleaveCount is greater than 0. 5670 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 5671 } 5672 5673 assert(MaxInterleaveCount > 0 && 5674 "Maximum interleave count must be greater than 0"); 5675 5676 // Clamp the calculated IC to be between the 1 and the max interleave count 5677 // that the target and trip count allows. 5678 if (IC > MaxInterleaveCount) 5679 IC = MaxInterleaveCount; 5680 else 5681 // Make sure IC is greater than 0. 5682 IC = std::max(1u, IC); 5683 5684 assert(IC > 0 && "Interleave count must be greater than 0."); 5685 5686 // Interleave if we vectorized this loop and there is a reduction that could 5687 // benefit from interleaving. 5688 if (VF.isVector() && HasReductions) { 5689 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5690 return IC; 5691 } 5692 5693 // For any scalar loop that either requires runtime checks or predication we 5694 // are better off leaving this to the unroller. Note that if we've already 5695 // vectorized the loop we will have done the runtime check and so interleaving 5696 // won't require further checks. 5697 bool ScalarInterleavingRequiresPredication = 5698 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5699 return Legal->blockNeedsPredication(BB); 5700 })); 5701 bool ScalarInterleavingRequiresRuntimePointerCheck = 5702 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5703 5704 // We want to interleave small loops in order to reduce the loop overhead and 5705 // potentially expose ILP opportunities. 5706 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5707 << "LV: IC is " << IC << '\n' 5708 << "LV: VF is " << VF << '\n'); 5709 const bool AggressivelyInterleaveReductions = 5710 TTI.enableAggressiveInterleaving(HasReductions); 5711 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5712 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5713 // We assume that the cost overhead is 1 and we use the cost model 5714 // to estimate the cost of the loop and interleave until the cost of the 5715 // loop overhead is about 5% of the cost of the loop. 5716 unsigned SmallIC = 5717 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5718 5719 // Interleave until store/load ports (estimated by max interleave count) are 5720 // saturated. 
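    // Illustrative example (assumed counts): with IC = 8, 2 stores and 4
    // loads, the code below computes StoresIC = 8 / 2 = 4 and
    // LoadsIC = 8 / 4 = 2; if max(StoresIC, LoadsIC) exceeds SmallIC and
    // EnableLoadStoreRuntimeInterleave is set, that maximum (4 here) is used
    // as the interleave count.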
5721 unsigned NumStores = Legal->getNumStores(); 5722 unsigned NumLoads = Legal->getNumLoads(); 5723 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5724 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5725 5726 // There is little point in interleaving for reductions containing selects 5727 // and compares when VF=1 since it may just create more overhead than it's 5728 // worth for loops with small trip counts. This is because we still have to 5729 // do the final reduction after the loop. 5730 bool HasSelectCmpReductions = 5731 HasReductions && 5732 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5733 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5734 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 5735 RdxDesc.getRecurrenceKind()); 5736 }); 5737 if (HasSelectCmpReductions) { 5738 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 5739 return 1; 5740 } 5741 5742 // If we have a scalar reduction (vector reductions are already dealt with 5743 // by this point), we can increase the critical path length if the loop 5744 // we're interleaving is inside another loop. For tree-wise reductions 5745 // set the limit to 2, and for ordered reductions it's best to disable 5746 // interleaving entirely. 5747 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5748 bool HasOrderedReductions = 5749 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5750 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5751 return RdxDesc.isOrdered(); 5752 }); 5753 if (HasOrderedReductions) { 5754 LLVM_DEBUG( 5755 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 5756 return 1; 5757 } 5758 5759 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5760 SmallIC = std::min(SmallIC, F); 5761 StoresIC = std::min(StoresIC, F); 5762 LoadsIC = std::min(LoadsIC, F); 5763 } 5764 5765 if (EnableLoadStoreRuntimeInterleave && 5766 std::max(StoresIC, LoadsIC) > SmallIC) { 5767 LLVM_DEBUG( 5768 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5769 return std::max(StoresIC, LoadsIC); 5770 } 5771 5772 // If there are scalar reductions and TTI has enabled aggressive 5773 // interleaving for reductions, we will interleave to expose ILP. 5774 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 5775 AggressivelyInterleaveReductions) { 5776 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5777 // Interleave no less than SmallIC but not as aggressive as the normal IC 5778 // to satisfy the rare situation when resources are too limited. 5779 return std::max(IC / 2, SmallIC); 5780 } else { 5781 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5782 return SmallIC; 5783 } 5784 } 5785 5786 // Interleave if this is a large loop (small loops are already dealt with by 5787 // this point) that could benefit from interleaving. 5788 if (AggressivelyInterleaveReductions) { 5789 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5790 return IC; 5791 } 5792 5793 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5794 return 1; 5795 } 5796 5797 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5798 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 5799 // This function calculates the register usage by measuring the highest number 5800 // of values that are alive at a single location. Obviously, this is a very 5801 // rough estimation. We scan the loop in a topological order in order and 5802 // assign a number to each instruction. 
We use RPO to ensure that defs are 5803 // met before their users. We assume that each instruction that has in-loop 5804 // users starts an interval. We record every time that an in-loop value is 5805 // used, so we have a list of the first and last occurrences of each 5806 // instruction. Next, we transpose this data structure into a multi map that 5807 // holds the list of intervals that *end* at a specific location. This multi 5808 // map allows us to perform a linear search. We scan the instructions linearly 5809 // and record each time that a new interval starts, by placing it in a set. 5810 // If we find this value in the multi-map then we remove it from the set. 5811 // The max register usage is the maximum size of the set. 5812 // We also search for instructions that are defined outside the loop, but are 5813 // used inside the loop. We need this number separately from the max-interval 5814 // usage number because when we unroll, loop-invariant values do not take 5815 // more register. 5816 LoopBlocksDFS DFS(TheLoop); 5817 DFS.perform(LI); 5818 5819 RegisterUsage RU; 5820 5821 // Each 'key' in the map opens a new interval. The values 5822 // of the map are the index of the 'last seen' usage of the 5823 // instruction that is the key. 5824 using IntervalMap = DenseMap<Instruction *, unsigned>; 5825 5826 // Maps instruction to its index. 5827 SmallVector<Instruction *, 64> IdxToInstr; 5828 // Marks the end of each interval. 5829 IntervalMap EndPoint; 5830 // Saves the list of instruction indices that are used in the loop. 5831 SmallPtrSet<Instruction *, 8> Ends; 5832 // Saves the list of values that are used in the loop but are 5833 // defined outside the loop, such as arguments and constants. 5834 SmallPtrSet<Value *, 8> LoopInvariants; 5835 5836 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5837 for (Instruction &I : BB->instructionsWithoutDebug()) { 5838 IdxToInstr.push_back(&I); 5839 5840 // Save the end location of each USE. 5841 for (Value *U : I.operands()) { 5842 auto *Instr = dyn_cast<Instruction>(U); 5843 5844 // Ignore non-instruction values such as arguments, constants, etc. 5845 if (!Instr) 5846 continue; 5847 5848 // If this instruction is outside the loop then record it and continue. 5849 if (!TheLoop->contains(Instr)) { 5850 LoopInvariants.insert(Instr); 5851 continue; 5852 } 5853 5854 // Overwrite previous end points. 5855 EndPoint[Instr] = IdxToInstr.size(); 5856 Ends.insert(Instr); 5857 } 5858 } 5859 } 5860 5861 // Saves the list of intervals that end with the index in 'key'. 5862 using InstrList = SmallVector<Instruction *, 2>; 5863 DenseMap<unsigned, InstrList> TransposeEnds; 5864 5865 // Transpose the EndPoints to a list of values that end at each index. 5866 for (auto &Interval : EndPoint) 5867 TransposeEnds[Interval.second].push_back(Interval.first); 5868 5869 SmallPtrSet<Instruction *, 8> OpenIntervals; 5870 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5871 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5872 5873 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5874 5875 auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned { 5876 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 5877 return 0; 5878 return TTI.getRegUsageForType(VectorType::get(Ty, VF)); 5879 }; 5880 5881 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5882 Instruction *I = IdxToInstr[i]; 5883 5884 // Remove all of the instructions that end at this location. 
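      // Illustrative trace (assumed three-instruction body:
      //   %a = load ...; %b = add %a, 1; %c = store %b):
      // the interval for %a ends one position past its last user (%b), so
      // when the scan below reaches the store, %a is erased from
      // OpenIntervals and only %b is still counted as live. The maximum size
      // OpenIntervals reaches during the scan is the register-pressure
      // estimate for each candidate VF.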
5885 InstrList &List = TransposeEnds[i]; 5886 for (Instruction *ToRemove : List) 5887 OpenIntervals.erase(ToRemove); 5888 5889 // Ignore instructions that are never used within the loop. 5890 if (!Ends.count(I)) 5891 continue; 5892 5893 // Skip ignored values. 5894 if (ValuesToIgnore.count(I)) 5895 continue; 5896 5897 // For each VF find the maximum usage of registers. 5898 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5899 // Count the number of live intervals. 5900 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5901 5902 if (VFs[j].isScalar()) { 5903 for (auto Inst : OpenIntervals) { 5904 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5905 if (RegUsage.find(ClassID) == RegUsage.end()) 5906 RegUsage[ClassID] = 1; 5907 else 5908 RegUsage[ClassID] += 1; 5909 } 5910 } else { 5911 collectUniformsAndScalars(VFs[j]); 5912 for (auto Inst : OpenIntervals) { 5913 // Skip ignored values for VF > 1. 5914 if (VecValuesToIgnore.count(Inst)) 5915 continue; 5916 if (isScalarAfterVectorization(Inst, VFs[j])) { 5917 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5918 if (RegUsage.find(ClassID) == RegUsage.end()) 5919 RegUsage[ClassID] = 1; 5920 else 5921 RegUsage[ClassID] += 1; 5922 } else { 5923 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5924 if (RegUsage.find(ClassID) == RegUsage.end()) 5925 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5926 else 5927 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5928 } 5929 } 5930 } 5931 5932 for (auto& pair : RegUsage) { 5933 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5934 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5935 else 5936 MaxUsages[j][pair.first] = pair.second; 5937 } 5938 } 5939 5940 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5941 << OpenIntervals.size() << '\n'); 5942 5943 // Add the current instruction to the list of open intervals. 5944 OpenIntervals.insert(I); 5945 } 5946 5947 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5948 SmallMapVector<unsigned, unsigned, 4> Invariant; 5949 5950 for (auto Inst : LoopInvariants) { 5951 unsigned Usage = 5952 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5953 unsigned ClassID = 5954 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 5955 if (Invariant.find(ClassID) == Invariant.end()) 5956 Invariant[ClassID] = Usage; 5957 else 5958 Invariant[ClassID] += Usage; 5959 } 5960 5961 LLVM_DEBUG({ 5962 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5963 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5964 << " item\n"; 5965 for (const auto &pair : MaxUsages[i]) { 5966 dbgs() << "LV(REG): RegisterClass: " 5967 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5968 << " registers\n"; 5969 } 5970 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5971 << " item\n"; 5972 for (const auto &pair : Invariant) { 5973 dbgs() << "LV(REG): RegisterClass: " 5974 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5975 << " registers\n"; 5976 } 5977 }); 5978 5979 RU.LoopInvariantRegs = Invariant; 5980 RU.MaxLocalUsers = MaxUsages[i]; 5981 RUs[i] = RU; 5982 } 5983 5984 return RUs; 5985 } 5986 5987 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 5988 ElementCount VF) { 5989 // TODO: Cost model for emulated masked load/store is completely 5990 // broken. 
This hack guides the cost model to use an artificially
5991 // high enough value to practically disable vectorization with such
5992 // operations, except where the previously deployed legality hack allowed
5993 // using very low cost values. This is to avoid regressions coming simply
5994 // from moving the "masked load/store" check from legality to the cost model.
5995 // Masked Load/Gather emulation was previously never allowed.
5996 // A limited amount of Masked Store/Scatter emulation was allowed.
5997 assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
5998 return isa<LoadInst>(I) ||
5999 (isa<StoreInst>(I) &&
6000 NumPredStores > NumberOfStoresToPredicate);
6001 }
6002
6003 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6004 // If we aren't vectorizing the loop, or if we've already collected the
6005 // instructions to scalarize, there's nothing to do. Collection may already
6006 // have occurred if we have a user-selected VF and are now computing the
6007 // expected cost for interleaving.
6008 if (VF.isScalar() || VF.isZero() ||
6009 InstsToScalarize.find(VF) != InstsToScalarize.end())
6010 return;
6011
6012 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6013 // not profitable to scalarize any instructions, the presence of VF in the
6014 // map will indicate that we've analyzed it already.
6015 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6016
6017 // Find all the instructions that are scalar with predication in the loop and
6018 // determine if it would be better to not if-convert the blocks they are in.
6019 // If so, we also record the instructions to scalarize.
6020 for (BasicBlock *BB : TheLoop->blocks()) {
6021 if (!blockNeedsPredicationForAnyReason(BB))
6022 continue;
6023 for (Instruction &I : *BB)
6024 if (isScalarWithPredication(&I, VF)) {
6025 ScalarCostsTy ScalarCosts;
6026 // Do not apply discount if scalable, because that would lead to
6027 // invalid scalarization costs.
6028 // Do not apply discount logic if hacked cost is needed
6029 // for emulated masked memrefs.
6030 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6031 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6032 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6033 // Remember that BB will remain after vectorization.
6034 PredicatedBBsAfterVectorization.insert(BB);
6035 }
6036 }
6037 }
6038
6039 int LoopVectorizationCostModel::computePredInstDiscount(
6040 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6041 assert(!isUniformAfterVectorization(PredInst, VF) &&
6042 "Instruction marked uniform-after-vectorization will be predicated");
6043
6044 // Initialize the discount to zero, meaning that the scalar version and the
6045 // vector version cost the same.
6046 InstructionCost Discount = 0;
6047
6048 // Holds instructions to analyze. The instructions we visit are mapped in
6049 // ScalarCosts. Those instructions are the ones that would be scalarized if
6050 // we find that the scalar version costs less.
6051 SmallVector<Instruction *, 8> Worklist;
6052
6053 // Returns true if the given instruction can be scalarized.
6054 auto canBeScalarized = [&](Instruction *I) -> bool {
6055 // We only attempt to scalarize instructions forming a single-use chain
6056 // from the original predicated block that would otherwise be vectorized.
6057 // Although not strictly necessary, we give up on instructions we know will 6058 // already be scalar to avoid traversing chains that are unlikely to be 6059 // beneficial. 6060 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6061 isScalarAfterVectorization(I, VF)) 6062 return false; 6063 6064 // If the instruction is scalar with predication, it will be analyzed 6065 // separately. We ignore it within the context of PredInst. 6066 if (isScalarWithPredication(I, VF)) 6067 return false; 6068 6069 // If any of the instruction's operands are uniform after vectorization, 6070 // the instruction cannot be scalarized. This prevents, for example, a 6071 // masked load from being scalarized. 6072 // 6073 // We assume we will only emit a value for lane zero of an instruction 6074 // marked uniform after vectorization, rather than VF identical values. 6075 // Thus, if we scalarize an instruction that uses a uniform, we would 6076 // create uses of values corresponding to the lanes we aren't emitting code 6077 // for. This behavior can be changed by allowing getScalarValue to clone 6078 // the lane zero values for uniforms rather than asserting. 6079 for (Use &U : I->operands()) 6080 if (auto *J = dyn_cast<Instruction>(U.get())) 6081 if (isUniformAfterVectorization(J, VF)) 6082 return false; 6083 6084 // Otherwise, we can scalarize the instruction. 6085 return true; 6086 }; 6087 6088 // Compute the expected cost discount from scalarizing the entire expression 6089 // feeding the predicated instruction. We currently only consider expressions 6090 // that are single-use instruction chains. 6091 Worklist.push_back(PredInst); 6092 while (!Worklist.empty()) { 6093 Instruction *I = Worklist.pop_back_val(); 6094 6095 // If we've already analyzed the instruction, there's nothing to do. 6096 if (ScalarCosts.find(I) != ScalarCosts.end()) 6097 continue; 6098 6099 // Compute the cost of the vector instruction. Note that this cost already 6100 // includes the scalarization overhead of the predicated instruction. 6101 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6102 6103 // Compute the cost of the scalarized instruction. This cost is the cost of 6104 // the instruction as if it wasn't if-converted and instead remained in the 6105 // predicated block. We will scale this cost by block probability after 6106 // computing the scalarization overhead. 6107 InstructionCost ScalarCost = 6108 VF.getFixedValue() * 6109 getInstructionCost(I, ElementCount::getFixed(1)).first; 6110 6111 // Compute the scalarization overhead of needed insertelement instructions 6112 // and phi nodes. 6113 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6114 ScalarCost += TTI.getScalarizationOverhead( 6115 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6116 APInt::getAllOnes(VF.getFixedValue()), true, false); 6117 ScalarCost += 6118 VF.getFixedValue() * 6119 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6120 } 6121 6122 // Compute the scalarization overhead of needed extractelement 6123 // instructions. For each of the instruction's operands, if the operand can 6124 // be scalarized, add it to the worklist; otherwise, account for the 6125 // overhead. 
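    // Illustrative example (assumed chain): if PredInst is a predicated udiv
    // whose single-use operand is an add defined in the same block, the add
    // is pushed onto the worklist and costed as scalar; an operand that has
    // other users or lives in a different block cannot be scalarized, so if
    // it needsExtract it is charged one extractelement per lane below.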
6126 for (Use &U : I->operands()) 6127 if (auto *J = dyn_cast<Instruction>(U.get())) { 6128 assert(VectorType::isValidElementType(J->getType()) && 6129 "Instruction has non-scalar type"); 6130 if (canBeScalarized(J)) 6131 Worklist.push_back(J); 6132 else if (needsExtract(J, VF)) { 6133 ScalarCost += TTI.getScalarizationOverhead( 6134 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6135 APInt::getAllOnes(VF.getFixedValue()), false, true); 6136 } 6137 } 6138 6139 // Scale the total scalar cost by block probability. 6140 ScalarCost /= getReciprocalPredBlockProb(); 6141 6142 // Compute the discount. A non-negative discount means the vector version 6143 // of the instruction costs more, and scalarizing would be beneficial. 6144 Discount += VectorCost - ScalarCost; 6145 ScalarCosts[I] = ScalarCost; 6146 } 6147 6148 return *Discount.getValue(); 6149 } 6150 6151 LoopVectorizationCostModel::VectorizationCostTy 6152 LoopVectorizationCostModel::expectedCost( 6153 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6154 VectorizationCostTy Cost; 6155 6156 // For each block. 6157 for (BasicBlock *BB : TheLoop->blocks()) { 6158 VectorizationCostTy BlockCost; 6159 6160 // For each instruction in the old loop. 6161 for (Instruction &I : BB->instructionsWithoutDebug()) { 6162 // Skip ignored values. 6163 if (ValuesToIgnore.count(&I) || 6164 (VF.isVector() && VecValuesToIgnore.count(&I))) 6165 continue; 6166 6167 VectorizationCostTy C = getInstructionCost(&I, VF); 6168 6169 // Check if we should override the cost. 6170 if (C.first.isValid() && 6171 ForceTargetInstructionCost.getNumOccurrences() > 0) 6172 C.first = InstructionCost(ForceTargetInstructionCost); 6173 6174 // Keep a list of instructions with invalid costs. 6175 if (Invalid && !C.first.isValid()) 6176 Invalid->emplace_back(&I, VF); 6177 6178 BlockCost.first += C.first; 6179 BlockCost.second |= C.second; 6180 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6181 << " for VF " << VF << " For instruction: " << I 6182 << '\n'); 6183 } 6184 6185 // If we are vectorizing a predicated block, it will have been 6186 // if-converted. This means that the block's instructions (aside from 6187 // stores and instructions that may divide by zero) will now be 6188 // unconditionally executed. For the scalar case, we may not always execute 6189 // the predicated block, if it is an if-else block. Thus, scale the block's 6190 // cost by the probability of executing it. blockNeedsPredication from 6191 // Legal is used so as to not include all blocks in tail folded loops. 6192 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6193 BlockCost.first /= getReciprocalPredBlockProb(); 6194 6195 Cost.first += BlockCost.first; 6196 Cost.second |= BlockCost.second; 6197 } 6198 6199 return Cost; 6200 } 6201 6202 /// Gets Address Access SCEV after verifying that the access pattern 6203 /// is loop invariant except the induction variable dependence. 6204 /// 6205 /// This SCEV can be sent to the Target in order to estimate the address 6206 /// calculation cost. 6207 static const SCEV *getAddressAccessSCEV( 6208 Value *Ptr, 6209 LoopVectorizationLegality *Legal, 6210 PredicatedScalarEvolution &PSE, 6211 const Loop *TheLoop) { 6212 6213 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6214 if (!Gep) 6215 return nullptr; 6216 6217 // We are looking for a gep with all loop invariant indices except for one 6218 // which should be an induction variable. 
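  // Illustrative IR (assumed shape) for the pattern accepted below:
  //   %gep = getelementptr inbounds [512 x float], [512 x float]* %A,
  //                        i64 %inv, i64 %iv
  // where %inv is loop-invariant and %iv is an induction variable. Any
  // additional index that is neither loop-invariant nor an induction
  // variable makes this helper return nullptr.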
6219 auto SE = PSE.getSE(); 6220 unsigned NumOperands = Gep->getNumOperands(); 6221 for (unsigned i = 1; i < NumOperands; ++i) { 6222 Value *Opd = Gep->getOperand(i); 6223 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6224 !Legal->isInductionVariable(Opd)) 6225 return nullptr; 6226 } 6227 6228 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6229 return PSE.getSCEV(Ptr); 6230 } 6231 6232 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6233 return Legal->hasStride(I->getOperand(0)) || 6234 Legal->hasStride(I->getOperand(1)); 6235 } 6236 6237 InstructionCost 6238 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6239 ElementCount VF) { 6240 assert(VF.isVector() && 6241 "Scalarization cost of instruction implies vectorization."); 6242 if (VF.isScalable()) 6243 return InstructionCost::getInvalid(); 6244 6245 Type *ValTy = getLoadStoreType(I); 6246 auto SE = PSE.getSE(); 6247 6248 unsigned AS = getLoadStoreAddressSpace(I); 6249 Value *Ptr = getLoadStorePointerOperand(I); 6250 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6251 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6252 // that it is being called from this specific place. 6253 6254 // Figure out whether the access is strided and get the stride value 6255 // if it's known in compile time 6256 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6257 6258 // Get the cost of the scalar memory instruction and address computation. 6259 InstructionCost Cost = 6260 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6261 6262 // Don't pass *I here, since it is scalar but will actually be part of a 6263 // vectorized loop where the user of it is a vectorized instruction. 6264 const Align Alignment = getLoadStoreAlignment(I); 6265 Cost += VF.getKnownMinValue() * 6266 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6267 AS, TTI::TCK_RecipThroughput); 6268 6269 // Get the overhead of the extractelement and insertelement instructions 6270 // we might create due to scalarization. 6271 Cost += getScalarizationOverhead(I, VF); 6272 6273 // If we have a predicated load/store, it will need extra i1 extracts and 6274 // conditional branches, but may not be executed for each vector lane. Scale 6275 // the cost by the probability of executing the predicated block. 6276 if (isPredicatedInst(I, VF)) { 6277 Cost /= getReciprocalPredBlockProb(); 6278 6279 // Add the cost of an i1 extract and a branch 6280 auto *Vec_i1Ty = 6281 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6282 Cost += TTI.getScalarizationOverhead( 6283 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6284 /*Insert=*/false, /*Extract=*/true); 6285 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6286 6287 if (useEmulatedMaskMemRefHack(I, VF)) 6288 // Artificially setting to a high enough value to practically disable 6289 // vectorization with such operations. 
6290 Cost = 3000000; 6291 } 6292 6293 return Cost; 6294 } 6295 6296 InstructionCost 6297 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6298 ElementCount VF) { 6299 Type *ValTy = getLoadStoreType(I); 6300 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6301 Value *Ptr = getLoadStorePointerOperand(I); 6302 unsigned AS = getLoadStoreAddressSpace(I); 6303 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6304 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6305 6306 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6307 "Stride should be 1 or -1 for consecutive memory access"); 6308 const Align Alignment = getLoadStoreAlignment(I); 6309 InstructionCost Cost = 0; 6310 if (Legal->isMaskRequired(I)) 6311 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6312 CostKind); 6313 else 6314 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6315 CostKind, I); 6316 6317 bool Reverse = ConsecutiveStride < 0; 6318 if (Reverse) 6319 Cost += 6320 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6321 return Cost; 6322 } 6323 6324 InstructionCost 6325 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6326 ElementCount VF) { 6327 assert(Legal->isUniformMemOp(*I)); 6328 6329 Type *ValTy = getLoadStoreType(I); 6330 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6331 const Align Alignment = getLoadStoreAlignment(I); 6332 unsigned AS = getLoadStoreAddressSpace(I); 6333 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6334 if (isa<LoadInst>(I)) { 6335 return TTI.getAddressComputationCost(ValTy) + 6336 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6337 CostKind) + 6338 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6339 } 6340 StoreInst *SI = cast<StoreInst>(I); 6341 6342 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6343 return TTI.getAddressComputationCost(ValTy) + 6344 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6345 CostKind) + 6346 (isLoopInvariantStoreValue 6347 ? 0 6348 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6349 VF.getKnownMinValue() - 1)); 6350 } 6351 6352 InstructionCost 6353 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6354 ElementCount VF) { 6355 Type *ValTy = getLoadStoreType(I); 6356 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6357 const Align Alignment = getLoadStoreAlignment(I); 6358 const Value *Ptr = getLoadStorePointerOperand(I); 6359 6360 return TTI.getAddressComputationCost(VectorTy) + 6361 TTI.getGatherScatterOpCost( 6362 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6363 TargetTransformInfo::TCK_RecipThroughput, I); 6364 } 6365 6366 InstructionCost 6367 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6368 ElementCount VF) { 6369 // TODO: Once we have support for interleaving with scalable vectors 6370 // we can calculate the cost properly here. 
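  // Illustrative example (assumed group): a factor-2 group covering A[2*i]
  // and A[2*i+1] with VF = 4 produces an 8-element WideVecTy below and
  // Indices = {0, 1}; a store group with a missing member would additionally
  // need a mask for the gaps.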
6371 if (VF.isScalable())
6372 return InstructionCost::getInvalid();
6373
6374 Type *ValTy = getLoadStoreType(I);
6375 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6376 unsigned AS = getLoadStoreAddressSpace(I);
6377
6378 auto Group = getInterleavedAccessGroup(I);
6379 assert(Group && "Fail to get an interleaved access group.");
6380
6381 unsigned InterleaveFactor = Group->getFactor();
6382 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6383
6384 // Holds the indices of existing members in the interleaved group.
6385 SmallVector<unsigned, 4> Indices;
6386 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6387 if (Group->getMember(IF))
6388 Indices.push_back(IF);
6389
6390 // Calculate the cost of the whole interleaved group.
6391 bool UseMaskForGaps =
6392 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6393 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6394 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6395 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6396 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6397
6398 if (Group->isReverse()) {
6399 // TODO: Add support for reversed masked interleaved access.
6400 assert(!Legal->isMaskRequired(I) &&
6401 "Reverse masked interleaved access not supported.");
6402 Cost +=
6403 Group->getNumMembers() *
6404 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6405 }
6406 return Cost;
6407 }
6408
6409 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
6410 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6411 using namespace llvm::PatternMatch;
6412 // Early exit for no inloop reductions
6413 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6414 return None;
6415 auto *VectorTy = cast<VectorType>(Ty);
6416
6417 // We are looking for one of the following patterns, finding the minimal acceptable cost:
6418 // reduce(mul(ext(A), ext(B))) or
6419 // reduce(mul(A, B)) or
6420 // reduce(ext(A)) or
6421 // reduce(A).
6422 // The basic idea is that we walk down the tree to do that, finding the root
6423 // reduction instruction in InLoopReductionImmediateChains. From there we find
6424 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6425 // of the components. If the reduction cost is lower, then we return it for the
6426 // reduction instruction and 0 for the other instructions in the pattern. If
6427 // it is not, we return an invalid cost specifying that the original cost
6428 // method should be used.
6429 Instruction *RetI = I;
6430 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6431 if (!RetI->hasOneUser())
6432 return None;
6433 RetI = RetI->user_back();
6434 }
6435 if (match(RetI, m_Mul(m_Value(), m_Value())) &&
6436 RetI->user_back()->getOpcode() == Instruction::Add) {
6437 if (!RetI->hasOneUser())
6438 return None;
6439 RetI = RetI->user_back();
6440 }
6441
6442 // Test if the found instruction is a reduction, and if not, return an invalid
6443 // cost specifying that the parent should use the original cost modelling.
6444 if (!InLoopReductionImmediateChains.count(RetI))
6445 return None;
6446
6447 // Find the reduction this chain is a part of and calculate the basic cost of
6448 // the reduction on its own.
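  // Illustrative scalar IR (assumed) for the chain walked below:
  //   %sum = phi i32 [ 0, %ph ], [ %sum.next, %loop ]
  //   %a.ext = sext i8 %a to i32
  //   %b.ext = sext i8 %b to i32
  //   %mul = mul i32 %a.ext, %b.ext
  //   %sum.next = add i32 %sum, %mul
  // Starting from RetI (%sum.next), InLoopReductionImmediateChains is
  // followed back to the reduction phi (%sum), whose RecurrenceDescriptor
  // provides the opcode and fast-math flags for the base reduction cost.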
6449 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6450 Instruction *ReductionPhi = LastChain; 6451 while (!isa<PHINode>(ReductionPhi)) 6452 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6453 6454 const RecurrenceDescriptor &RdxDesc = 6455 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6456 6457 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6458 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6459 6460 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6461 // normal fmul instruction to the cost of the fadd reduction. 6462 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6463 BaseCost += 6464 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6465 6466 // If we're using ordered reductions then we can just return the base cost 6467 // here, since getArithmeticReductionCost calculates the full ordered 6468 // reduction cost when FP reassociation is not allowed. 6469 if (useOrderedReductions(RdxDesc)) 6470 return BaseCost; 6471 6472 // Get the operand that was not the reduction chain and match it to one of the 6473 // patterns, returning the better cost if it is found. 6474 Instruction *RedOp = RetI->getOperand(1) == LastChain 6475 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6476 : dyn_cast<Instruction>(RetI->getOperand(1)); 6477 6478 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6479 6480 Instruction *Op0, *Op1; 6481 if (RedOp && 6482 match(RedOp, 6483 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6484 match(Op0, m_ZExtOrSExt(m_Value())) && 6485 Op0->getOpcode() == Op1->getOpcode() && 6486 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6487 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6488 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6489 6490 // Matched reduce(ext(mul(ext(A), ext(B))) 6491 // Note that the extend opcodes need to all match, or if A==B they will have 6492 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6493 // which is equally fine. 6494 bool IsUnsigned = isa<ZExtInst>(Op0); 6495 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6496 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6497 6498 InstructionCost ExtCost = 6499 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6500 TTI::CastContextHint::None, CostKind, Op0); 6501 InstructionCost MulCost = 6502 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6503 InstructionCost Ext2Cost = 6504 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6505 TTI::CastContextHint::None, CostKind, RedOp); 6506 6507 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6508 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6509 CostKind); 6510 6511 if (RedCost.isValid() && 6512 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6513 return I == RetI ? 
RedCost : 0; 6514 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6515 !TheLoop->isLoopInvariant(RedOp)) { 6516 // Matched reduce(ext(A)) 6517 bool IsUnsigned = isa<ZExtInst>(RedOp); 6518 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6519 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6520 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6521 CostKind); 6522 6523 InstructionCost ExtCost = 6524 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6525 TTI::CastContextHint::None, CostKind, RedOp); 6526 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6527 return I == RetI ? RedCost : 0; 6528 } else if (RedOp && 6529 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6530 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6531 Op0->getOpcode() == Op1->getOpcode() && 6532 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6533 bool IsUnsigned = isa<ZExtInst>(Op0); 6534 Type *Op0Ty = Op0->getOperand(0)->getType(); 6535 Type *Op1Ty = Op1->getOperand(0)->getType(); 6536 Type *LargestOpTy = 6537 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6538 : Op0Ty; 6539 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6540 6541 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 6542 // different sizes. We take the largest type as the ext to reduce, and add 6543 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6544 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6545 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6546 TTI::CastContextHint::None, CostKind, Op0); 6547 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6548 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6549 TTI::CastContextHint::None, CostKind, Op1); 6550 InstructionCost MulCost = 6551 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6552 6553 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6554 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6555 CostKind); 6556 InstructionCost ExtraExtCost = 0; 6557 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6558 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6559 ExtraExtCost = TTI.getCastInstrCost( 6560 ExtraExtOp->getOpcode(), ExtType, 6561 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6562 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6563 } 6564 6565 if (RedCost.isValid() && 6566 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6567 return I == RetI ? RedCost : 0; 6568 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6569 // Matched reduce(mul()) 6570 InstructionCost MulCost = 6571 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6572 6573 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6574 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 6575 CostKind); 6576 6577 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6578 return I == RetI ? RedCost : 0; 6579 } 6580 } 6581 6582 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 6583 } 6584 6585 InstructionCost 6586 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6587 ElementCount VF) { 6588 // Calculate scalar cost only. Vectorization cost should be ready at this 6589 // moment. 
6590 if (VF.isScalar()) { 6591 Type *ValTy = getLoadStoreType(I); 6592 const Align Alignment = getLoadStoreAlignment(I); 6593 unsigned AS = getLoadStoreAddressSpace(I); 6594 6595 return TTI.getAddressComputationCost(ValTy) + 6596 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6597 TTI::TCK_RecipThroughput, I); 6598 } 6599 return getWideningCost(I, VF); 6600 } 6601 6602 LoopVectorizationCostModel::VectorizationCostTy 6603 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6604 ElementCount VF) { 6605 // If we know that this instruction will remain uniform, check the cost of 6606 // the scalar version. 6607 if (isUniformAfterVectorization(I, VF)) 6608 VF = ElementCount::getFixed(1); 6609 6610 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6611 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6612 6613 // Forced scalars do not have any scalarization overhead. 6614 auto ForcedScalar = ForcedScalars.find(VF); 6615 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6616 auto InstSet = ForcedScalar->second; 6617 if (InstSet.count(I)) 6618 return VectorizationCostTy( 6619 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6620 VF.getKnownMinValue()), 6621 false); 6622 } 6623 6624 Type *VectorTy; 6625 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6626 6627 bool TypeNotScalarized = false; 6628 if (VF.isVector() && VectorTy->isVectorTy()) { 6629 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) { 6630 if (VF.isScalable()) 6631 // <vscale x 1 x iN> is assumed to be profitable over iN because 6632 // scalable registers are a distinct register class from scalar ones. 6633 // If we ever find a target which wants to lower scalable vectors 6634 // back to scalars, we'll need to update this code to explicitly 6635 // ask TTI about the register class uses for each part. 6636 TypeNotScalarized = NumParts <= VF.getKnownMinValue(); 6637 else 6638 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 6639 } else 6640 C = InstructionCost::getInvalid(); 6641 } 6642 return VectorizationCostTy(C, TypeNotScalarized); 6643 } 6644 6645 InstructionCost 6646 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6647 ElementCount VF) const { 6648 6649 // There is no mechanism yet to create a scalable scalarization loop, 6650 // so this is currently Invalid. 6651 if (VF.isScalable()) 6652 return InstructionCost::getInvalid(); 6653 6654 if (VF.isScalar()) 6655 return 0; 6656 6657 InstructionCost Cost = 0; 6658 Type *RetTy = ToVectorTy(I->getType(), VF); 6659 if (!RetTy->isVoidTy() && 6660 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6661 Cost += TTI.getScalarizationOverhead( 6662 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 6663 false); 6664 6665 // Some targets keep addresses scalar. 6666 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6667 return Cost; 6668 6669 // Some targets support efficient element stores. 6670 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6671 return Cost; 6672 6673 // Collect operands to consider. 6674 CallInst *CI = dyn_cast<CallInst>(I); 6675 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 6676 6677 // Skip operands that do not require extraction/scalarization and do not incur 6678 // any overhead. 
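  // Illustrative example (assumed operands): for a scalarized call with two
  // operands, a loop-invariant i32 %n and a widened i32 %x, only %x survives
  // filterExtractingOperands and contributes VF extractelement operations to
  // the overhead computed below; %n needs no extraction.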
6679 SmallVector<Type *> Tys; 6680 for (auto *V : filterExtractingOperands(Ops, VF)) 6681 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 6682 return Cost + TTI.getOperandsScalarizationOverhead( 6683 filterExtractingOperands(Ops, VF), Tys); 6684 } 6685 6686 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6687 if (VF.isScalar()) 6688 return; 6689 NumPredStores = 0; 6690 for (BasicBlock *BB : TheLoop->blocks()) { 6691 // For each instruction in the old loop. 6692 for (Instruction &I : *BB) { 6693 Value *Ptr = getLoadStorePointerOperand(&I); 6694 if (!Ptr) 6695 continue; 6696 6697 // TODO: We should generate better code and update the cost model for 6698 // predicated uniform stores. Today they are treated as any other 6699 // predicated store (see added test cases in 6700 // invariant-store-vectorization.ll). 6701 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 6702 NumPredStores++; 6703 6704 if (Legal->isUniformMemOp(I)) { 6705 // TODO: Avoid replicating loads and stores instead of 6706 // relying on instcombine to remove them. 6707 // Load: Scalar load + broadcast 6708 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6709 InstructionCost Cost; 6710 if (isa<StoreInst>(&I) && VF.isScalable() && 6711 isLegalGatherOrScatter(&I, VF)) { 6712 Cost = getGatherScatterCost(&I, VF); 6713 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 6714 } else { 6715 Cost = getUniformMemOpCost(&I, VF); 6716 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6717 } 6718 continue; 6719 } 6720 6721 // We assume that widening is the best solution when possible. 6722 if (memoryInstructionCanBeWidened(&I, VF)) { 6723 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6724 int ConsecutiveStride = Legal->isConsecutivePtr( 6725 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 6726 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6727 "Expected consecutive stride."); 6728 InstWidening Decision = 6729 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6730 setWideningDecision(&I, VF, Decision, Cost); 6731 continue; 6732 } 6733 6734 // Choose between Interleaving, Gather/Scatter or Scalarization. 6735 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 6736 unsigned NumAccesses = 1; 6737 if (isAccessInterleaved(&I)) { 6738 auto Group = getInterleavedAccessGroup(&I); 6739 assert(Group && "Fail to get an interleaved access group."); 6740 6741 // Make one decision for the whole group. 6742 if (getWideningDecision(&I, VF) != CM_Unknown) 6743 continue; 6744 6745 NumAccesses = Group->getNumMembers(); 6746 if (interleavedAccessCanBeWidened(&I, VF)) 6747 InterleaveCost = getInterleaveGroupCost(&I, VF); 6748 } 6749 6750 InstructionCost GatherScatterCost = 6751 isLegalGatherOrScatter(&I, VF) 6752 ? getGatherScatterCost(&I, VF) * NumAccesses 6753 : InstructionCost::getInvalid(); 6754 6755 InstructionCost ScalarizationCost = 6756 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6757 6758 // Choose better solution for the current VF, 6759 // write down this decision and use it during vectorization. 
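        // Illustrative example (assumed costs): with InterleaveCost = 12,
        // GatherScatterCost = 20 and ScalarizationCost = 28, the comparison
        // below selects CM_Interleave at cost 12. If the interleave cost
        // were invalid (e.g. for a scalable VF), gather/scatter would be
        // chosen at 20, with scalarization as the final fallback.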
6760 InstructionCost Cost; 6761 InstWidening Decision; 6762 if (InterleaveCost <= GatherScatterCost && 6763 InterleaveCost < ScalarizationCost) { 6764 Decision = CM_Interleave; 6765 Cost = InterleaveCost; 6766 } else if (GatherScatterCost < ScalarizationCost) { 6767 Decision = CM_GatherScatter; 6768 Cost = GatherScatterCost; 6769 } else { 6770 Decision = CM_Scalarize; 6771 Cost = ScalarizationCost; 6772 } 6773 // If the instructions belongs to an interleave group, the whole group 6774 // receives the same decision. The whole group receives the cost, but 6775 // the cost will actually be assigned to one instruction. 6776 if (auto Group = getInterleavedAccessGroup(&I)) 6777 setWideningDecision(Group, VF, Decision, Cost); 6778 else 6779 setWideningDecision(&I, VF, Decision, Cost); 6780 } 6781 } 6782 6783 // Make sure that any load of address and any other address computation 6784 // remains scalar unless there is gather/scatter support. This avoids 6785 // inevitable extracts into address registers, and also has the benefit of 6786 // activating LSR more, since that pass can't optimize vectorized 6787 // addresses. 6788 if (TTI.prefersVectorizedAddressing()) 6789 return; 6790 6791 // Start with all scalar pointer uses. 6792 SmallPtrSet<Instruction *, 8> AddrDefs; 6793 for (BasicBlock *BB : TheLoop->blocks()) 6794 for (Instruction &I : *BB) { 6795 Instruction *PtrDef = 6796 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6797 if (PtrDef && TheLoop->contains(PtrDef) && 6798 getWideningDecision(&I, VF) != CM_GatherScatter) 6799 AddrDefs.insert(PtrDef); 6800 } 6801 6802 // Add all instructions used to generate the addresses. 6803 SmallVector<Instruction *, 4> Worklist; 6804 append_range(Worklist, AddrDefs); 6805 while (!Worklist.empty()) { 6806 Instruction *I = Worklist.pop_back_val(); 6807 for (auto &Op : I->operands()) 6808 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6809 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6810 AddrDefs.insert(InstOp).second) 6811 Worklist.push_back(InstOp); 6812 } 6813 6814 for (auto *I : AddrDefs) { 6815 if (isa<LoadInst>(I)) { 6816 // Setting the desired widening decision should ideally be handled in 6817 // by cost functions, but since this involves the task of finding out 6818 // if the loaded register is involved in an address computation, it is 6819 // instead changed here when we know this is the case. 6820 InstWidening Decision = getWideningDecision(I, VF); 6821 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6822 // Scalarize a widened load of address. 6823 setWideningDecision( 6824 I, VF, CM_Scalarize, 6825 (VF.getKnownMinValue() * 6826 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6827 else if (auto Group = getInterleavedAccessGroup(I)) { 6828 // Scalarize an interleave group of address loads. 6829 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6830 if (Instruction *Member = Group->getMember(I)) 6831 setWideningDecision( 6832 Member, VF, CM_Scalarize, 6833 (VF.getKnownMinValue() * 6834 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6835 } 6836 } 6837 } else 6838 // Make sure I gets scalarized and a cost estimate without 6839 // scalarization overhead. 
6840 ForcedScalars[VF].insert(I);
6841 }
6842 }
6843
6844 InstructionCost
6845 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6846 Type *&VectorTy) {
6847 Type *RetTy = I->getType();
6848 if (canTruncateToMinimalBitwidth(I, VF))
6849 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6850 auto SE = PSE.getSE();
6851 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6852
6853 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6854 ElementCount VF) -> bool {
6855 if (VF.isScalar())
6856 return true;
6857
6858 auto Scalarized = InstsToScalarize.find(VF);
6859 assert(Scalarized != InstsToScalarize.end() &&
6860 "VF not yet analyzed for scalarization profitability");
6861 return !Scalarized->second.count(I) &&
6862 llvm::all_of(I->users(), [&](User *U) {
6863 auto *UI = cast<Instruction>(U);
6864 return !Scalarized->second.count(UI);
6865 });
6866 };
6867 (void) hasSingleCopyAfterVectorization;
6868
6869 if (isScalarAfterVectorization(I, VF)) {
6870 // With the exception of GEPs and PHIs, after scalarization there should
6871 // only be one copy of the instruction generated in the loop. This is
6872 // because the VF is either 1, or any instructions that need scalarizing
6873 // have already been dealt with by the time we get here. As a result,
6874 // we don't have to multiply the instruction cost by VF.
6875 assert(I->getOpcode() == Instruction::GetElementPtr ||
6876 I->getOpcode() == Instruction::PHI ||
6877 (I->getOpcode() == Instruction::BitCast &&
6878 I->getType()->isPointerTy()) ||
6879 hasSingleCopyAfterVectorization(I, VF));
6880 VectorTy = RetTy;
6881 } else
6882 VectorTy = ToVectorTy(RetTy, VF);
6883
6884 // TODO: We need to estimate the cost of intrinsic calls.
6885 switch (I->getOpcode()) {
6886 case Instruction::GetElementPtr:
6887 // We mark this instruction as zero-cost because the cost of GEPs in
6888 // vectorized code depends on whether the corresponding memory instruction
6889 // is scalarized or not. Therefore, we handle GEPs with the memory
6890 // instruction cost.
6891 return 0;
6892 case Instruction::Br: {
6893 // In cases of scalarized and predicated instructions, there will be VF
6894 // predicated blocks in the vectorized loop. Each branch around these
6895 // blocks also requires an extract of its vector compare i1 element.
6896 bool ScalarPredicatedBB = false;
6897 BranchInst *BI = cast<BranchInst>(I);
6898 if (VF.isVector() && BI->isConditional() &&
6899 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6900 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6901 ScalarPredicatedBB = true;
6902
6903 if (ScalarPredicatedBB) {
6904 // Not possible to scalarize a scalable vector with predicated instructions.
6905 if (VF.isScalable())
6906 return InstructionCost::getInvalid();
6907 // Return cost for branches around scalarized and predicated blocks.
6908 auto *Vec_i1Ty =
6909 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6910 return (
6911 TTI.getScalarizationOverhead(
6912 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
6913 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6914 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6915 // The back-edge branch will remain, as will all scalar branches.
6916 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6917 else
6918 // This branch will be eliminated by if-conversion.
6919 return 0; 6920 // Note: We currently assume zero cost for an unconditional branch inside 6921 // a predicated block since it will become a fall-through, although we 6922 // may decide in the future to call TTI for all branches. 6923 } 6924 case Instruction::PHI: { 6925 auto *Phi = cast<PHINode>(I); 6926 6927 // First-order recurrences are replaced by vector shuffles inside the loop. 6928 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6929 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 6930 return TTI.getShuffleCost( 6931 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 6932 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 6933 6934 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6935 // converted into select instructions. We require N - 1 selects per phi 6936 // node, where N is the number of incoming values. 6937 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 6938 return (Phi->getNumIncomingValues() - 1) * 6939 TTI.getCmpSelInstrCost( 6940 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6941 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6942 CmpInst::BAD_ICMP_PREDICATE, CostKind); 6943 6944 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6945 } 6946 case Instruction::UDiv: 6947 case Instruction::SDiv: 6948 case Instruction::URem: 6949 case Instruction::SRem: 6950 // If we have a predicated instruction, it may not be executed for each 6951 // vector lane. Get the scalarization cost and scale this amount by the 6952 // probability of executing the predicated block. If the instruction is not 6953 // predicated, we fall through to the next case. 6954 if (VF.isVector() && isScalarWithPredication(I, VF)) { 6955 InstructionCost Cost = 0; 6956 6957 // These instructions have a non-void type, so account for the phi nodes 6958 // that we will create. This cost is likely to be zero. The phi node 6959 // cost, if any, should be scaled by the block probability because it 6960 // models a copy at the end of each predicated block. 6961 Cost += VF.getKnownMinValue() * 6962 TTI.getCFInstrCost(Instruction::PHI, CostKind); 6963 6964 // The cost of the non-predicated instruction. 6965 Cost += VF.getKnownMinValue() * 6966 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 6967 6968 // The cost of insertelement and extractelement instructions needed for 6969 // scalarization. 6970 Cost += getScalarizationOverhead(I, VF); 6971 6972 // Scale the cost by the probability of executing the predicated blocks. 6973 // This assumes the predicated block for each vector lane is equally 6974 // likely. 6975 return Cost / getReciprocalPredBlockProb(); 6976 } 6977 LLVM_FALLTHROUGH; 6978 case Instruction::Add: 6979 case Instruction::FAdd: 6980 case Instruction::Sub: 6981 case Instruction::FSub: 6982 case Instruction::Mul: 6983 case Instruction::FMul: 6984 case Instruction::FDiv: 6985 case Instruction::FRem: 6986 case Instruction::Shl: 6987 case Instruction::LShr: 6988 case Instruction::AShr: 6989 case Instruction::And: 6990 case Instruction::Or: 6991 case Instruction::Xor: { 6992 // Since we will replace the stride by 1 the multiplication should go away. 
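// For example, for an access such as A[i * Stride] where the loop has been
// versioned on the assumption Stride == 1, the multiply feeding the GEP
// simplifies away once the stride is replaced, so charging it here would be
// overly pessimistic (illustrative example).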
6993 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6994 return 0; 6995 6996 // Detect reduction patterns 6997 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 6998 return *RedCost; 6999 7000 // Certain instructions can be cheaper to vectorize if they have a constant 7001 // second vector operand. One example of this are shifts on x86. 7002 Value *Op2 = I->getOperand(1); 7003 TargetTransformInfo::OperandValueProperties Op2VP; 7004 TargetTransformInfo::OperandValueKind Op2VK = 7005 TTI.getOperandInfo(Op2, Op2VP); 7006 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7007 Op2VK = TargetTransformInfo::OK_UniformValue; 7008 7009 SmallVector<const Value *, 4> Operands(I->operand_values()); 7010 return TTI.getArithmeticInstrCost( 7011 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7012 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7013 } 7014 case Instruction::FNeg: { 7015 return TTI.getArithmeticInstrCost( 7016 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7017 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7018 TargetTransformInfo::OP_None, I->getOperand(0), I); 7019 } 7020 case Instruction::Select: { 7021 SelectInst *SI = cast<SelectInst>(I); 7022 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7023 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7024 7025 const Value *Op0, *Op1; 7026 using namespace llvm::PatternMatch; 7027 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7028 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7029 // select x, y, false --> x & y 7030 // select x, true, y --> x | y 7031 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7032 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7033 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7034 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7035 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7036 Op1->getType()->getScalarSizeInBits() == 1); 7037 7038 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7039 return TTI.getArithmeticInstrCost( 7040 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7041 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7042 } 7043 7044 Type *CondTy = SI->getCondition()->getType(); 7045 if (!ScalarCond) 7046 CondTy = VectorType::get(CondTy, VF); 7047 7048 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7049 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7050 Pred = Cmp->getPredicate(); 7051 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7052 CostKind, I); 7053 } 7054 case Instruction::ICmp: 7055 case Instruction::FCmp: { 7056 Type *ValTy = I->getOperand(0)->getType(); 7057 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7058 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7059 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7060 VectorTy = ToVectorTy(ValTy, VF); 7061 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7062 cast<CmpInst>(I)->getPredicate(), CostKind, 7063 I); 7064 } 7065 case Instruction::Store: 7066 case Instruction::Load: { 7067 ElementCount Width = VF; 7068 if (Width.isVector()) { 7069 InstWidening Decision = getWideningDecision(I, Width); 7070 assert(Decision != CM_Unknown && 7071 "CM decision should be taken at this point"); 7072 if (Decision == CM_Scalarize) { 7073 if (VF.isScalable() && isa<StoreInst>(I)) 7074 // We can't scalarize a scalable vector store (even a uniform one 7075 // currently), return an invalid cost so as to prevent vectorization. 7076 return InstructionCost::getInvalid(); 7077 Width = ElementCount::getFixed(1); 7078 } 7079 } 7080 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7081 return getMemoryInstructionCost(I, VF); 7082 } 7083 case Instruction::BitCast: 7084 if (I->getType()->isPointerTy()) 7085 return 0; 7086 LLVM_FALLTHROUGH; 7087 case Instruction::ZExt: 7088 case Instruction::SExt: 7089 case Instruction::FPToUI: 7090 case Instruction::FPToSI: 7091 case Instruction::FPExt: 7092 case Instruction::PtrToInt: 7093 case Instruction::IntToPtr: 7094 case Instruction::SIToFP: 7095 case Instruction::UIToFP: 7096 case Instruction::Trunc: 7097 case Instruction::FPTrunc: { 7098 // Computes the CastContextHint from a Load/Store instruction. 7099 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7100 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7101 "Expected a load or a store!"); 7102 7103 if (VF.isScalar() || !TheLoop->contains(I)) 7104 return TTI::CastContextHint::Normal; 7105 7106 switch (getWideningDecision(I, VF)) { 7107 case LoopVectorizationCostModel::CM_GatherScatter: 7108 return TTI::CastContextHint::GatherScatter; 7109 case LoopVectorizationCostModel::CM_Interleave: 7110 return TTI::CastContextHint::Interleave; 7111 case LoopVectorizationCostModel::CM_Scalarize: 7112 case LoopVectorizationCostModel::CM_Widen: 7113 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7114 : TTI::CastContextHint::Normal; 7115 case LoopVectorizationCostModel::CM_Widen_Reverse: 7116 return TTI::CastContextHint::Reversed; 7117 case LoopVectorizationCostModel::CM_Unknown: 7118 llvm_unreachable("Instr did not go through cost modelling?"); 7119 } 7120 7121 llvm_unreachable("Unhandled case!"); 7122 }; 7123 7124 unsigned Opcode = I->getOpcode(); 7125 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7126 // For Trunc, the context is the only user, which must be a StoreInst. 
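// E.g. for "trunc i32 %v to i16" whose only user is "store i16 %t, i16* %p",
// the cast is costed with the context of that store (Normal, Masked,
// Reversed, ...), since many targets can fold the truncate into a narrowing
// store (illustrative).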
7127 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7128 if (I->hasOneUse()) 7129 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7130 CCH = ComputeCCH(Store); 7131 } 7132 // For Z/Sext, the context is the operand, which must be a LoadInst. 7133 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7134 Opcode == Instruction::FPExt) { 7135 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7136 CCH = ComputeCCH(Load); 7137 } 7138 7139 // We optimize the truncation of induction variables having constant 7140 // integer steps. The cost of these truncations is the same as the scalar 7141 // operation. 7142 if (isOptimizableIVTruncate(I, VF)) { 7143 auto *Trunc = cast<TruncInst>(I); 7144 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7145 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7146 } 7147 7148 // Detect reduction patterns 7149 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7150 return *RedCost; 7151 7152 Type *SrcScalarTy = I->getOperand(0)->getType(); 7153 Type *SrcVecTy = 7154 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7155 if (canTruncateToMinimalBitwidth(I, VF)) { 7156 // This cast is going to be shrunk. This may remove the cast or it might 7157 // turn it into slightly different cast. For example, if MinBW == 16, 7158 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7159 // 7160 // Calculate the modified src and dest types. 7161 Type *MinVecTy = VectorTy; 7162 if (Opcode == Instruction::Trunc) { 7163 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7164 VectorTy = 7165 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7166 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7167 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7168 VectorTy = 7169 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7170 } 7171 } 7172 7173 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7174 } 7175 case Instruction::Call: { 7176 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7177 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7178 return *RedCost; 7179 bool NeedToScalarize; 7180 CallInst *CI = cast<CallInst>(I); 7181 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7182 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7183 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7184 return std::min(CallCost, IntrinsicCost); 7185 } 7186 return CallCost; 7187 } 7188 case Instruction::ExtractValue: 7189 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7190 case Instruction::Alloca: 7191 // We cannot easily widen alloca to a scalable alloca, as 7192 // the result would need to be a vector of pointers. 7193 if (VF.isScalable()) 7194 return InstructionCost::getInvalid(); 7195 LLVM_FALLTHROUGH; 7196 default: 7197 // This opcode is unknown. Assume that it is the same as 'mul'. 7198 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7199 } // end of switch. 
7200 } 7201 7202 char LoopVectorize::ID = 0; 7203 7204 static const char lv_name[] = "Loop Vectorization"; 7205 7206 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7207 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7208 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7209 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7210 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7211 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7212 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7213 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7214 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7215 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7216 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7217 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7218 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7219 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7220 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7221 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7222 7223 namespace llvm { 7224 7225 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7226 7227 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7228 bool VectorizeOnlyWhenForced) { 7229 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7230 } 7231 7232 } // end namespace llvm 7233 7234 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7235 // Check if the pointer operand of a load or store instruction is 7236 // consecutive. 7237 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7238 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7239 return false; 7240 } 7241 7242 void LoopVectorizationCostModel::collectValuesToIgnore() { 7243 // Ignore ephemeral values. 7244 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7245 7246 // Find all stores to invariant variables. Since they are going to sink 7247 // outside the loop we do not need calculate cost for them. 7248 for (BasicBlock *BB : TheLoop->blocks()) 7249 for (Instruction &I : *BB) { 7250 StoreInst *SI; 7251 if ((SI = dyn_cast<StoreInst>(&I)) && 7252 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 7253 ValuesToIgnore.insert(&I); 7254 } 7255 7256 // Ignore type-promoting instructions we identified during reduction 7257 // detection. 7258 for (auto &Reduction : Legal->getReductionVars()) { 7259 const RecurrenceDescriptor &RedDes = Reduction.second; 7260 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7261 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7262 } 7263 // Ignore type-casting instructions we identified during induction 7264 // detection. 7265 for (auto &Induction : Legal->getInductionVars()) { 7266 const InductionDescriptor &IndDes = Induction.second; 7267 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7268 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7269 } 7270 } 7271 7272 void LoopVectorizationCostModel::collectInLoopReductions() { 7273 for (auto &Reduction : Legal->getReductionVars()) { 7274 PHINode *Phi = Reduction.first; 7275 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7276 7277 // We don't collect reductions that are type promoted (yet). 7278 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7279 continue; 7280 7281 // If the target would prefer this reduction to happen "in-loop", then we 7282 // want to record it as such. 
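// Roughly, an "in-loop" reduction keeps the accumulator scalar and reduces
// each vector of partial values inside the loop (e.g. with a
// vector.reduce.add per iteration), instead of carrying a wide vector phi
// that is reduced once after the loop; ordered FP reductions must be done
// this way, other cases are a target preference.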
7283 unsigned Opcode = RdxDesc.getOpcode(); 7284 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7285 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7286 TargetTransformInfo::ReductionFlags())) 7287 continue; 7288 7289 // Check that we can correctly put the reductions into the loop, by 7290 // finding the chain of operations that leads from the phi to the loop 7291 // exit value. 7292 SmallVector<Instruction *, 4> ReductionOperations = 7293 RdxDesc.getReductionOpChain(Phi, TheLoop); 7294 bool InLoop = !ReductionOperations.empty(); 7295 if (InLoop) { 7296 InLoopReductionChains[Phi] = ReductionOperations; 7297 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7298 Instruction *LastChain = Phi; 7299 for (auto *I : ReductionOperations) { 7300 InLoopReductionImmediateChains[I] = LastChain; 7301 LastChain = I; 7302 } 7303 } 7304 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7305 << " reduction for phi: " << *Phi << "\n"); 7306 } 7307 } 7308 7309 // TODO: we could return a pair of values that specify the max VF and 7310 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7311 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7312 // doesn't have a cost model that can choose which plan to execute if 7313 // more than one is generated. 7314 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7315 LoopVectorizationCostModel &CM) { 7316 unsigned WidestType; 7317 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7318 return WidestVectorRegBits / WidestType; 7319 } 7320 7321 VectorizationFactor 7322 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7323 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7324 ElementCount VF = UserVF; 7325 // Outer loop handling: They may require CFG and instruction level 7326 // transformations before even evaluating whether vectorization is profitable. 7327 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7328 // the vectorization pipeline. 7329 if (!OrigLoop->isInnermost()) { 7330 // If the user doesn't provide a vectorization factor, determine a 7331 // reasonable one. 7332 if (UserVF.isZero()) { 7333 VF = ElementCount::getFixed(determineVPlanVF( 7334 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7335 .getFixedSize(), 7336 CM)); 7337 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7338 7339 // Make sure we have a VF > 1 for stress testing. 7340 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7341 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7342 << "overriding computed VF.\n"); 7343 VF = ElementCount::getFixed(4); 7344 } 7345 } 7346 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7347 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7348 "VF needs to be a power of two"); 7349 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7350 << "VF " << VF << " to build VPlans.\n"); 7351 buildVPlans(VF, VF); 7352 7353 // For VPlan build stress testing, we bail out after VPlan construction. 7354 if (VPlanBuildStressTest) 7355 return VectorizationFactor::Disabled(); 7356 7357 return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; 7358 } 7359 7360 LLVM_DEBUG( 7361 dbgs() << "LV: Not vectorizing. 
Inner loops aren't supported in the " 7362 "VPlan-native path.\n"); 7363 return VectorizationFactor::Disabled(); 7364 } 7365 7366 bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const { 7367 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7368 return (NumRuntimePointerChecks > 7369 VectorizerParams::RuntimeMemoryCheckThreshold && 7370 !Hints.allowReordering()) || 7371 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7372 } 7373 7374 Optional<VectorizationFactor> 7375 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7376 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7377 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7378 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7379 return None; 7380 7381 // Invalidate interleave groups if all blocks of loop will be predicated. 7382 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7383 !useMaskedInterleavedAccesses(*TTI)) { 7384 LLVM_DEBUG( 7385 dbgs() 7386 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7387 "which requires masked-interleaved support.\n"); 7388 if (CM.InterleaveInfo.invalidateGroups()) 7389 // Invalidating interleave groups also requires invalidating all decisions 7390 // based on them, which includes widening decisions and uniform and scalar 7391 // values. 7392 CM.invalidateCostModelingDecisions(); 7393 } 7394 7395 ElementCount MaxUserVF = 7396 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7397 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7398 if (!UserVF.isZero() && UserVFIsLegal) { 7399 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7400 "VF needs to be a power of two"); 7401 // Collect the instructions (and their associated costs) that will be more 7402 // profitable to scalarize. 7403 if (CM.selectUserVectorizationFactor(UserVF)) { 7404 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7405 CM.collectInLoopReductions(); 7406 buildVPlansWithVPRecipes(UserVF, UserVF); 7407 LLVM_DEBUG(printPlans(dbgs())); 7408 return {{UserVF, 0, 0}}; 7409 } else 7410 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7411 "InvalidCost", ORE, OrigLoop); 7412 } 7413 7414 // Populate the set of Vectorization Factor Candidates. 7415 ElementCountSet VFCandidates; 7416 for (auto VF = ElementCount::getFixed(1); 7417 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7418 VFCandidates.insert(VF); 7419 for (auto VF = ElementCount::getScalable(1); 7420 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7421 VFCandidates.insert(VF); 7422 7423 for (const auto &VF : VFCandidates) { 7424 // Collect Uniform and Scalar instructions after vectorization with VF. 7425 CM.collectUniformsAndScalars(VF); 7426 7427 // Collect the instructions (and their associated costs) that will be more 7428 // profitable to scalarize. 7429 if (VF.isVector()) 7430 CM.collectInstsToScalarize(VF); 7431 } 7432 7433 CM.collectInLoopReductions(); 7434 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7435 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7436 7437 LLVM_DEBUG(printPlans(dbgs())); 7438 if (!MaxFactors.hasVector()) 7439 return VectorizationFactor::Disabled(); 7440 7441 // Select the optimal vectorization factor. 
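// Roughly, selectVectorizationFactor compares the expected cost of each
// candidate normalized per scalar iteration (cost divided by VF) and picks
// the cheapest, falling back to VF = 1 when no vector factor beats the
// scalar loop (sketch of the selection logic).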
7442 return CM.selectVectorizationFactor(VFCandidates); 7443 } 7444 7445 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7446 assert(count_if(VPlans, 7447 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7448 1 && 7449 "Best VF has not a single VPlan."); 7450 7451 for (const VPlanPtr &Plan : VPlans) { 7452 if (Plan->hasVF(VF)) 7453 return *Plan.get(); 7454 } 7455 llvm_unreachable("No plan found!"); 7456 } 7457 7458 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7459 SmallVector<Metadata *, 4> MDs; 7460 // Reserve first location for self reference to the LoopID metadata node. 7461 MDs.push_back(nullptr); 7462 bool IsUnrollMetadata = false; 7463 MDNode *LoopID = L->getLoopID(); 7464 if (LoopID) { 7465 // First find existing loop unrolling disable metadata. 7466 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7467 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7468 if (MD) { 7469 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7470 IsUnrollMetadata = 7471 S && S->getString().startswith("llvm.loop.unroll.disable"); 7472 } 7473 MDs.push_back(LoopID->getOperand(i)); 7474 } 7475 } 7476 7477 if (!IsUnrollMetadata) { 7478 // Add runtime unroll disable metadata. 7479 LLVMContext &Context = L->getHeader()->getContext(); 7480 SmallVector<Metadata *, 1> DisableOperands; 7481 DisableOperands.push_back( 7482 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7483 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7484 MDs.push_back(DisableNode); 7485 MDNode *NewLoopID = MDNode::get(Context, MDs); 7486 // Set operand 0 to refer to the loop id itself. 7487 NewLoopID->replaceOperandWith(0, NewLoopID); 7488 L->setLoopID(NewLoopID); 7489 } 7490 } 7491 7492 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7493 VPlan &BestVPlan, 7494 InnerLoopVectorizer &ILV, 7495 DominatorTree *DT, 7496 bool IsEpilogueVectorization) { 7497 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7498 << '\n'); 7499 7500 // Perform the actual loop transformation. 7501 7502 // 1. Set up the skeleton for vectorization, including vector pre-header and 7503 // middle block. The vector loop is created during VPlan execution. 7504 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7505 Value *CanonicalIVStartValue; 7506 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7507 ILV.createVectorizedLoopSkeleton(); 7508 7509 // Only use noalias metadata when using memory checks guaranteeing no overlap 7510 // across all iterations. 7511 const LoopAccessInfo *LAI = ILV.Legal->getLAI(); 7512 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && 7513 !LAI->getRuntimePointerChecking()->getDiffChecks()) { 7514 7515 // We currently don't use LoopVersioning for the actual loop cloning but we 7516 // still use it to add the noalias metadata. 7517 // TODO: Find a better way to re-use LoopVersioning functionality to add 7518 // metadata. 7519 State.LVer = std::make_unique<LoopVersioning>( 7520 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, 7521 PSE.getSE()); 7522 State.LVer->prepareNoAliasMetadata(); 7523 } 7524 7525 ILV.collectPoisonGeneratingRecipes(State); 7526 7527 ILV.printDebugTracesAtStart(); 7528 7529 //===------------------------------------------------===// 7530 // 7531 // Notice: any optimization or new instruction that go 7532 // into the code below should also be implemented in 7533 // the cost-model. 
7534 // 7535 //===------------------------------------------------===// 7536 7537 // 2. Copy and widen instructions from the old loop into the new loop. 7538 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), 7539 ILV.getOrCreateVectorTripCount(nullptr), 7540 CanonicalIVStartValue, State, 7541 IsEpilogueVectorization); 7542 7543 BestVPlan.execute(&State); 7544 7545 // Keep all loop hints from the original loop on the vector loop (we'll 7546 // replace the vectorizer-specific hints below). 7547 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7548 7549 Optional<MDNode *> VectorizedLoopID = 7550 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7551 LLVMLoopVectorizeFollowupVectorized}); 7552 7553 VPBasicBlock *HeaderVPBB = 7554 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); 7555 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); 7556 if (VectorizedLoopID) 7557 L->setLoopID(VectorizedLoopID.getValue()); 7558 else { 7559 // Keep all loop hints from the original loop on the vector loop (we'll 7560 // replace the vectorizer-specific hints below). 7561 if (MDNode *LID = OrigLoop->getLoopID()) 7562 L->setLoopID(LID); 7563 7564 LoopVectorizeHints Hints(L, true, *ORE); 7565 Hints.setAlreadyVectorized(); 7566 } 7567 // Disable runtime unrolling when vectorizing the epilogue loop. 7568 if (CanonicalIVStartValue) 7569 AddRuntimeUnrollDisableMetaData(L); 7570 7571 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7572 // predication, updating analyses. 7573 ILV.fixVectorizedLoop(State, BestVPlan); 7574 7575 ILV.printDebugTracesAtEnd(); 7576 } 7577 7578 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7579 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7580 for (const auto &Plan : VPlans) 7581 if (PrintVPlansInDotFormat) 7582 Plan->printDOT(O); 7583 else 7584 Plan->print(O); 7585 } 7586 #endif 7587 7588 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7589 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7590 7591 // We create new control-flow for the vectorized loop, so the original exit 7592 // conditions will be dead after vectorization if it's only used by the 7593 // terminator 7594 SmallVector<BasicBlock*> ExitingBlocks; 7595 OrigLoop->getExitingBlocks(ExitingBlocks); 7596 for (auto *BB : ExitingBlocks) { 7597 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 7598 if (!Cmp || !Cmp->hasOneUse()) 7599 continue; 7600 7601 // TODO: we should introduce a getUniqueExitingBlocks on Loop 7602 if (!DeadInstructions.insert(Cmp).second) 7603 continue; 7604 7605 // The operands of the icmp is often a dead trunc, used by IndUpdate. 7606 // TODO: can recurse through operands in general 7607 for (Value *Op : Cmp->operands()) { 7608 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7609 DeadInstructions.insert(cast<Instruction>(Op)); 7610 } 7611 } 7612 7613 // We create new "steps" for induction variable updates to which the original 7614 // induction variables map. An original update instruction will be dead if 7615 // all its users except the induction variable are dead. 7616 auto *Latch = OrigLoop->getLoopLatch(); 7617 for (auto &Induction : Legal->getInductionVars()) { 7618 PHINode *Ind = Induction.first; 7619 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7620 7621 // If the tail is to be folded by masking, the primary induction variable, 7622 // if exists, isn't dead: it will be used for masking. Don't kill it. 
7623 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7624 continue; 7625 7626 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7627 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7628 })) 7629 DeadInstructions.insert(IndUpdate); 7630 } 7631 } 7632 7633 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7634 7635 //===--------------------------------------------------------------------===// 7636 // EpilogueVectorizerMainLoop 7637 //===--------------------------------------------------------------------===// 7638 7639 /// This function is partially responsible for generating the control flow 7640 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7641 std::pair<BasicBlock *, Value *> 7642 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7643 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7644 7645 // Workaround! Compute the trip count of the original loop and cache it 7646 // before we start modifying the CFG. This code has a systemic problem 7647 // wherein it tries to run analysis over partially constructed IR; this is 7648 // wrong, and not simply for SCEV. The trip count of the original loop 7649 // simply happens to be prone to hitting this in practice. In theory, we 7650 // can hit the same issue for any SCEV, or ValueTracking query done during 7651 // mutation. See PR49900. 7652 getOrCreateTripCount(OrigLoop->getLoopPreheader()); 7653 createVectorLoopSkeleton(""); 7654 7655 // Generate the code to check the minimum iteration count of the vector 7656 // epilogue (see below). 7657 EPI.EpilogueIterationCountCheck = 7658 emitIterationCountCheck(LoopScalarPreHeader, true); 7659 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7660 7661 // Generate the code to check any assumptions that we've made for SCEV 7662 // expressions. 7663 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7664 7665 // Generate the code that checks at runtime if arrays overlap. We put the 7666 // checks into a separate block to make the more common case of few elements 7667 // faster. 7668 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7669 7670 // Generate the iteration count check for the main loop, *after* the check 7671 // for the epilogue loop, so that the path-length is shorter for the case 7672 // that goes directly through the vector epilogue. The longer-path length for 7673 // the main loop is compensated for, by the gain from vectorizing the larger 7674 // trip count. Note: the branch will get updated later on when we vectorize 7675 // the epilogue. 7676 EPI.MainLoopIterationCountCheck = 7677 emitIterationCountCheck(LoopScalarPreHeader, false); 7678 7679 // Generate the induction variable. 7680 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7681 7682 // Skip induction resume value creation here because they will be created in 7683 // the second pass. If we created them here, they wouldn't be used anyway, 7684 // because the vplan in the second pass still contains the inductions from the 7685 // original loop. 
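// (Sketch) The second element of the returned pair is the start value for
// the canonical induction of the vector loop about to be executed; the main
// loop starts at zero, so nullptr is returned here, whereas the epilogue
// skeleton below returns a resume phi seeded with the main loop's vector
// trip count (or zero if the main vector loop was skipped).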
7686 7687 return {completeLoopSkeleton(OrigLoopID), nullptr}; 7688 } 7689 7690 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7691 LLVM_DEBUG({ 7692 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7693 << "Main Loop VF:" << EPI.MainLoopVF 7694 << ", Main Loop UF:" << EPI.MainLoopUF 7695 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7696 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7697 }); 7698 } 7699 7700 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7701 DEBUG_WITH_TYPE(VerboseDebug, { 7702 dbgs() << "intermediate fn:\n" 7703 << *OrigLoop->getHeader()->getParent() << "\n"; 7704 }); 7705 } 7706 7707 BasicBlock * 7708 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7709 bool ForEpilogue) { 7710 assert(Bypass && "Expected valid bypass basic block."); 7711 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7712 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7713 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 7714 // Reuse existing vector loop preheader for TC checks. 7715 // Note that new preheader block is generated for vector loop. 7716 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7717 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7718 7719 // Generate code to check if the loop's trip count is less than VF * UF of the 7720 // main vector loop. 7721 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 7722 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7723 7724 Value *CheckMinIters = Builder.CreateICmp( 7725 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7726 "min.iters.check"); 7727 7728 if (!ForEpilogue) 7729 TCCheckBlock->setName("vector.main.loop.iter.check"); 7730 7731 // Create new preheader for vector loop. 7732 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7733 DT, LI, nullptr, "vector.ph"); 7734 7735 if (ForEpilogue) { 7736 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7737 DT->getNode(Bypass)->getIDom()) && 7738 "TC check is expected to dominate Bypass"); 7739 7740 // Update dominator for Bypass & LoopExit. 7741 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7742 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7743 // For loops with multiple exits, there's no edge from the middle block 7744 // to exit blocks (as the epilogue must run) and thus no need to update 7745 // the immediate dominator of the exit blocks. 7746 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7747 7748 LoopBypassBlocks.push_back(TCCheckBlock); 7749 7750 // Save the trip count so we don't have to regenerate it in the 7751 // vec.epilog.iter.check. This is safe to do because the trip count 7752 // generated here dominates the vector epilog iter check. 7753 EPI.TripCount = Count; 7754 } 7755 7756 ReplaceInstWithInst( 7757 TCCheckBlock->getTerminator(), 7758 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7759 7760 return TCCheckBlock; 7761 } 7762 7763 //===--------------------------------------------------------------------===// 7764 // EpilogueVectorizerEpilogueLoop 7765 //===--------------------------------------------------------------------===// 7766 7767 /// This function is partially responsible for generating the control flow 7768 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
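/// Roughly, once both passes have run, the relevant blocks are laid out as:
///   iter.check -> vector.main.loop.iter.check -> vector.ph -> main vector
///   loop -> vec.epilog.iter.check -> vec.epilog.ph -> epilogue vector loop
///   -> scalar preheader -> scalar remainder loop,
/// with the check blocks bypassing to later parts of this chain (ultimately
/// the scalar preheader) when their tests fail. This is only a sketch; see
/// the documentation link above for the authoritative diagram.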
7769 std::pair<BasicBlock *, Value *> 7770 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7771 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7772 createVectorLoopSkeleton("vec.epilog."); 7773 7774 // Now, compare the remaining count and if there aren't enough iterations to 7775 // execute the vectorized epilogue skip to the scalar part. 7776 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7777 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7778 LoopVectorPreHeader = 7779 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7780 LI, nullptr, "vec.epilog.ph"); 7781 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, 7782 VecEpilogueIterationCountCheck); 7783 7784 // Adjust the control flow taking the state info from the main loop 7785 // vectorization into account. 7786 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7787 "expected this to be saved from the previous pass."); 7788 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7789 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7790 7791 DT->changeImmediateDominator(LoopVectorPreHeader, 7792 EPI.MainLoopIterationCountCheck); 7793 7794 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7795 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7796 7797 if (EPI.SCEVSafetyCheck) 7798 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7799 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7800 if (EPI.MemSafetyCheck) 7801 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7802 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7803 7804 DT->changeImmediateDominator( 7805 VecEpilogueIterationCountCheck, 7806 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7807 7808 DT->changeImmediateDominator(LoopScalarPreHeader, 7809 EPI.EpilogueIterationCountCheck); 7810 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7811 // If there is an epilogue which must run, there's no edge from the 7812 // middle block to exit blocks and thus no need to update the immediate 7813 // dominator of the exit blocks. 7814 DT->changeImmediateDominator(LoopExitBlock, 7815 EPI.EpilogueIterationCountCheck); 7816 7817 // Keep track of bypass blocks, as they feed start values to the induction 7818 // phis in the scalar loop preheader. 7819 if (EPI.SCEVSafetyCheck) 7820 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7821 if (EPI.MemSafetyCheck) 7822 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7823 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7824 7825 // The vec.epilog.iter.check block may contain Phi nodes from reductions which 7826 // merge control-flow from the latch block and the middle block. Update the 7827 // incoming values here and move the Phi into the preheader. 
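// E.g. a reduction resume phi that merges the value produced by the main
// vector loop with the original start value: it is moved into vec.epilog.ph
// and its incoming blocks are rewired to that block's new predecessors
// (illustrative).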
7828 SmallVector<PHINode *, 4> PhisInBlock; 7829 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) 7830 PhisInBlock.push_back(&Phi); 7831 7832 for (PHINode *Phi : PhisInBlock) { 7833 Phi->replaceIncomingBlockWith( 7834 VecEpilogueIterationCountCheck->getSinglePredecessor(), 7835 VecEpilogueIterationCountCheck); 7836 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); 7837 if (EPI.SCEVSafetyCheck) 7838 Phi->removeIncomingValue(EPI.SCEVSafetyCheck); 7839 if (EPI.MemSafetyCheck) 7840 Phi->removeIncomingValue(EPI.MemSafetyCheck); 7841 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); 7842 } 7843 7844 // Generate a resume induction for the vector epilogue and put it in the 7845 // vector epilogue preheader. 7846 Type *IdxTy = Legal->getWidestInductionType(); 7847 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 7848 LoopVectorPreHeader->getFirstNonPHI()); 7849 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 7850 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 7851 EPI.MainLoopIterationCountCheck); 7852 7853 // Generate induction resume values. These variables save the new starting 7854 // indexes for the scalar loop. They are used to test if there are any tail 7855 // iterations left once the vector loop has completed. 7856 // Note that when the vectorized epilogue is skipped due to the iteration count 7857 // check, the resume value for the induction variable comes from 7858 // the trip count of the main vector loop, hence passing the AdditionalBypass 7859 // argument. 7860 createInductionResumeValues({VecEpilogueIterationCountCheck, 7861 EPI.VectorTripCount} /* AdditionalBypass */); 7862 7863 return {completeLoopSkeleton(OrigLoopID), EPResumeVal}; 7864 } 7865 7866 BasicBlock * 7867 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 7868 BasicBlock *Bypass, BasicBlock *Insert) { 7869 7870 assert(EPI.TripCount && 7871 "Expected trip count to have been saved in the first pass."); 7872 assert( 7873 (!isa<Instruction>(EPI.TripCount) || 7874 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 7875 "saved trip count does not dominate insertion point."); 7876 Value *TC = EPI.TripCount; 7877 IRBuilder<> Builder(Insert->getTerminator()); 7878 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 7879 7880 // Generate code to check if the loop's trip count is less than VF * UF of the 7881 // vector epilogue loop. 7882 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
7883 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7884 7885 Value *CheckMinIters = 7886 Builder.CreateICmp(P, Count, 7887 createStepForVF(Builder, Count->getType(), 7888 EPI.EpilogueVF, EPI.EpilogueUF), 7889 "min.epilog.iters.check"); 7890 7891 ReplaceInstWithInst( 7892 Insert->getTerminator(), 7893 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7894 7895 LoopBypassBlocks.push_back(Insert); 7896 return Insert; 7897 } 7898 7899 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7900 LLVM_DEBUG({ 7901 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 7902 << "Epilogue Loop VF:" << EPI.EpilogueVF 7903 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7904 }); 7905 } 7906 7907 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 7908 DEBUG_WITH_TYPE(VerboseDebug, { 7909 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 7910 }); 7911 } 7912 7913 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7914 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7915 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 7916 bool PredicateAtRangeStart = Predicate(Range.Start); 7917 7918 for (ElementCount TmpVF = Range.Start * 2; 7919 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 7920 if (Predicate(TmpVF) != PredicateAtRangeStart) { 7921 Range.End = TmpVF; 7922 break; 7923 } 7924 7925 return PredicateAtRangeStart; 7926 } 7927 7928 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7929 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7930 /// of VF's starting at a given VF and extending it as much as possible. Each 7931 /// vectorization decision can potentially shorten this sub-range during 7932 /// buildVPlan(). 7933 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 7934 ElementCount MaxVF) { 7935 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 7936 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 7937 VFRange SubRange = {VF, MaxVFPlusOne}; 7938 VPlans.push_back(buildVPlan(SubRange)); 7939 VF = SubRange.End; 7940 } 7941 } 7942 7943 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 7944 VPlanPtr &Plan) { 7945 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7946 7947 // Look for cached value. 7948 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 7949 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 7950 if (ECEntryIt != EdgeMaskCache.end()) 7951 return ECEntryIt->second; 7952 7953 VPValue *SrcMask = createBlockInMask(Src, Plan); 7954 7955 // The terminator has to be a branch inst! 7956 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 7957 assert(BI && "Unexpected terminator found"); 7958 7959 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 7960 return EdgeMaskCache[Edge] = SrcMask; 7961 7962 // If source is an exiting block, we know the exit edge is dynamically dead 7963 // in the vector loop, and thus we don't need to restrict the mask. Avoid 7964 // adding uses of an otherwise potentially dead instruction. 
7965 if (OrigLoop->isLoopExiting(Src)) 7966 return EdgeMaskCache[Edge] = SrcMask; 7967 7968 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 7969 assert(EdgeMask && "No Edge Mask found for condition"); 7970 7971 if (BI->getSuccessor(0) != Dst) 7972 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 7973 7974 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 7975 // The condition is 'SrcMask && EdgeMask', which is equivalent to 7976 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 7977 // The select version does not introduce new UB if SrcMask is false and 7978 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 7979 VPValue *False = Plan->getOrAddVPValue( 7980 ConstantInt::getFalse(BI->getCondition()->getType())); 7981 EdgeMask = 7982 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 7983 } 7984 7985 return EdgeMaskCache[Edge] = EdgeMask; 7986 } 7987 7988 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 7989 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 7990 7991 // Look for cached value. 7992 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 7993 if (BCEntryIt != BlockMaskCache.end()) 7994 return BCEntryIt->second; 7995 7996 // All-one mask is modelled as no-mask following the convention for masked 7997 // load/store/gather/scatter. Initialize BlockMask to no-mask. 7998 VPValue *BlockMask = nullptr; 7999 8000 if (OrigLoop->getHeader() == BB) { 8001 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8002 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8003 8004 // Introduce the early-exit compare IV <= BTC to form header block mask. 8005 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8006 // constructing the desired canonical IV in the header block as its first 8007 // non-phi instructions. 8008 assert(CM.foldTailByMasking() && "must fold the tail"); 8009 VPBasicBlock *HeaderVPBB = 8010 Plan->getVectorLoopRegion()->getEntryBasicBlock(); 8011 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8012 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); 8013 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); 8014 8015 VPBuilder::InsertPointGuard Guard(Builder); 8016 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8017 if (CM.TTI.emitGetActiveLaneMask()) { 8018 VPValue *TC = Plan->getOrCreateTripCount(); 8019 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); 8020 } else { 8021 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8022 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8023 } 8024 return BlockMaskCache[BB] = BlockMask; 8025 } 8026 8027 // This is the block mask. We OR all incoming edges. 8028 for (auto *Predecessor : predecessors(BB)) { 8029 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8030 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8031 return BlockMaskCache[BB] = EdgeMask; 8032 8033 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8034 BlockMask = EdgeMask; 8035 continue; 8036 } 8037 8038 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8039 } 8040 8041 return BlockMaskCache[BB] = BlockMask; 8042 } 8043 8044 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8045 ArrayRef<VPValue *> Operands, 8046 VFRange &Range, 8047 VPlanPtr &Plan) { 8048 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8049 "Must be called with either a load or store"); 8050 8051 auto willWiden = [&](ElementCount VF) -> bool { 8052 LoopVectorizationCostModel::InstWidening Decision = 8053 CM.getWideningDecision(I, VF); 8054 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8055 "CM decision should be taken at this point."); 8056 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8057 return true; 8058 if (CM.isScalarAfterVectorization(I, VF) || 8059 CM.isProfitableToScalarize(I, VF)) 8060 return false; 8061 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8062 }; 8063 8064 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8065 return nullptr; 8066 8067 VPValue *Mask = nullptr; 8068 if (Legal->isMaskRequired(I)) 8069 Mask = createBlockInMask(I->getParent(), Plan); 8070 8071 // Determine if the pointer operand of the access is either consecutive or 8072 // reverse consecutive. 8073 LoopVectorizationCostModel::InstWidening Decision = 8074 CM.getWideningDecision(I, Range.Start); 8075 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8076 bool Consecutive = 8077 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8078 8079 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8080 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8081 Consecutive, Reverse); 8082 8083 StoreInst *Store = cast<StoreInst>(I); 8084 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8085 Mask, Consecutive, Reverse); 8086 } 8087 8088 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also 8089 /// insert a recipe to expand the step for the induction recipe. 8090 static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes( 8091 PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, 8092 const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM, 8093 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) { 8094 // Returns true if an instruction \p I should be scalarized instead of 8095 // vectorized for the chosen vectorization factor.
8096 auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { 8097 return CM.isScalarAfterVectorization(I, VF) || 8098 CM.isProfitableToScalarize(I, VF); 8099 }; 8100 8101 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( 8102 [&](ElementCount VF) { 8103 return ShouldScalarizeInstruction(PhiOrTrunc, VF); 8104 }, 8105 Range); 8106 assert(IndDesc.getStartValue() == 8107 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8108 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && 8109 "step must be loop invariant"); 8110 8111 VPValue *Step = 8112 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); 8113 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8114 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI, 8115 !NeedsScalarIVOnly); 8116 } 8117 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8118 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, 8119 !NeedsScalarIVOnly); 8120 } 8121 8122 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( 8123 PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) { 8124 8125 // Check if this is an integer or fp induction. If so, build the recipe that 8126 // produces its scalar and vector values. 8127 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8128 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan, 8129 *PSE.getSE(), *OrigLoop, Range); 8130 8131 // Check if this is pointer induction. If so, build the recipe for it. 8132 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) 8133 return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II, 8134 *PSE.getSE()); 8135 return nullptr; 8136 } 8137 8138 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8139 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) { 8140 // Optimize the special case where the source is a constant integer 8141 // induction variable. Notice that we can only optimize the 'trunc' case 8142 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8143 // (c) other casts depend on pointer size. 8144 8145 // Determine whether \p K is a truncation based on an induction variable that 8146 // can be optimized. 8147 auto isOptimizableIVTruncate = 8148 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8149 return [=](ElementCount VF) -> bool { 8150 return CM.isOptimizableIVTruncate(K, VF); 8151 }; 8152 }; 8153 8154 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8155 isOptimizableIVTruncate(I), Range)) { 8156 8157 auto *Phi = cast<PHINode>(I->getOperand(0)); 8158 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8159 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8160 return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan, 8161 *PSE.getSE(), *OrigLoop, Range); 8162 } 8163 return nullptr; 8164 } 8165 8166 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8167 ArrayRef<VPValue *> Operands, 8168 VPlanPtr &Plan) { 8169 // If all incoming values are equal, the incoming VPValue can be used directly 8170 // instead of creating a new VPBlendRecipe. 8171 VPValue *FirstIncoming = Operands[0]; 8172 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8173 return FirstIncoming == Inc; 8174 })) { 8175 return Operands[0]; 8176 } 8177 8178 unsigned NumIncoming = Phi->getNumIncomingValues(); 8179 // For in-loop reductions, we do not need to create an additional select. 
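// (Sketch) The masking/select for an in-loop reduction is handled when the
// reduction recipes themselves are formed, so here it is enough to forward
// the incoming value that is not produced by the in-loop reduction phi.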
8180 VPValue *InLoopVal = nullptr; 8181 for (unsigned In = 0; In < NumIncoming; In++) { 8182 PHINode *PhiOp = 8183 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue()); 8184 if (PhiOp && CM.isInLoopReduction(PhiOp)) { 8185 assert(!InLoopVal && "Found more than one in-loop reduction!"); 8186 InLoopVal = Operands[In]; 8187 } 8188 } 8189 8190 assert((!InLoopVal || NumIncoming == 2) && 8191 "Found an in-loop reduction for PHI with unexpected number of " 8192 "incoming values"); 8193 if (InLoopVal) 8194 return Operands[Operands[0] == InLoopVal ? 1 : 0]; 8195 8196 // We know that all PHIs in non-header blocks are converted into selects, so 8197 // we don't have to worry about the insertion order and we can just use the 8198 // builder. At this point we generate the predication tree. There may be 8199 // duplications since this is a simple recursive scan, but future 8200 // optimizations will clean it up. 8201 SmallVector<VPValue *, 2> OperandsWithMask; 8202 8203 for (unsigned In = 0; In < NumIncoming; In++) { 8204 VPValue *EdgeMask = 8205 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8206 assert((EdgeMask || NumIncoming == 1) && 8207 "Multiple predecessors with one having a full mask"); 8208 OperandsWithMask.push_back(Operands[In]); 8209 if (EdgeMask) 8210 OperandsWithMask.push_back(EdgeMask); 8211 } 8212 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8213 } 8214 8215 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8216 ArrayRef<VPValue *> Operands, 8217 VFRange &Range) const { 8218 8219 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8220 [this, CI](ElementCount VF) { 8221 return CM.isScalarWithPredication(CI, VF); 8222 }, 8223 Range); 8224 8225 if (IsPredicated) 8226 return nullptr; 8227 8228 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8229 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8230 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8231 ID == Intrinsic::pseudoprobe || 8232 ID == Intrinsic::experimental_noalias_scope_decl)) 8233 return nullptr; 8234 8235 auto willWiden = [&](ElementCount VF) -> bool { 8236 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8237 // The following case may be scalarized depending on the VF. 8238 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8239 // version of the instruction. 8240 // Is it beneficial to perform intrinsic call compared to lib call? 8241 bool NeedToScalarize = false; 8242 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8243 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; 8244 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8245 return UseVectorIntrinsic || !NeedToScalarize; 8246 }; 8247 8248 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8249 return nullptr; 8250 8251 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8252 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8253 } 8254 8255 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8256 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8257 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8258 // Instruction should be widened, unless it is scalar after vectorization, 8259 // scalarization is profitable or it is predicated. 
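// Note that getDecisionAndClampRange also shrinks Range so that every VF
// left in it agrees with the decision made for Range.Start; the caller then
// builds a single recipe that is valid for the whole clamped sub-range
// (rough description of the mechanism).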
8260 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8261 return CM.isScalarAfterVectorization(I, VF) || 8262 CM.isProfitableToScalarize(I, VF) || 8263 CM.isScalarWithPredication(I, VF); 8264 }; 8265 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8266 Range); 8267 } 8268 8269 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8270 ArrayRef<VPValue *> Operands) const { 8271 auto IsVectorizableOpcode = [](unsigned Opcode) { 8272 switch (Opcode) { 8273 case Instruction::Add: 8274 case Instruction::And: 8275 case Instruction::AShr: 8276 case Instruction::BitCast: 8277 case Instruction::FAdd: 8278 case Instruction::FCmp: 8279 case Instruction::FDiv: 8280 case Instruction::FMul: 8281 case Instruction::FNeg: 8282 case Instruction::FPExt: 8283 case Instruction::FPToSI: 8284 case Instruction::FPToUI: 8285 case Instruction::FPTrunc: 8286 case Instruction::FRem: 8287 case Instruction::FSub: 8288 case Instruction::ICmp: 8289 case Instruction::IntToPtr: 8290 case Instruction::LShr: 8291 case Instruction::Mul: 8292 case Instruction::Or: 8293 case Instruction::PtrToInt: 8294 case Instruction::SDiv: 8295 case Instruction::Select: 8296 case Instruction::SExt: 8297 case Instruction::Shl: 8298 case Instruction::SIToFP: 8299 case Instruction::SRem: 8300 case Instruction::Sub: 8301 case Instruction::Trunc: 8302 case Instruction::UDiv: 8303 case Instruction::UIToFP: 8304 case Instruction::URem: 8305 case Instruction::Xor: 8306 case Instruction::ZExt: 8307 case Instruction::Freeze: 8308 return true; 8309 } 8310 return false; 8311 }; 8312 8313 if (!IsVectorizableOpcode(I->getOpcode())) 8314 return nullptr; 8315 8316 // Success: widen this instruction. 8317 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8318 } 8319 8320 void VPRecipeBuilder::fixHeaderPhis() { 8321 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8322 for (VPHeaderPHIRecipe *R : PhisToFix) { 8323 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8324 VPRecipeBase *IncR = 8325 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8326 R->addOperand(IncR->getVPSingleValue()); 8327 } 8328 } 8329 8330 VPBasicBlock *VPRecipeBuilder::handleReplication( 8331 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8332 VPlanPtr &Plan) { 8333 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8334 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8335 Range); 8336 8337 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8338 [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, 8339 Range); 8340 8341 // Even if the instruction is not marked as uniform, there are certain 8342 // intrinsic calls that can be effectively treated as such, so we check for 8343 // them here. Conservatively, we only do this for scalable vectors, since 8344 // for fixed-width VFs we can always fall back on full scalarization. 8345 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8346 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8347 case Intrinsic::assume: 8348 case Intrinsic::lifetime_start: 8349 case Intrinsic::lifetime_end: 8350 // For scalable vectors if one of the operands is variant then we still 8351 // want to mark as uniform, which will generate one instruction for just 8352 // the first lane of the vector. We can't scalarize the call in the same 8353 // way as for fixed-width vectors because we don't know how many lanes 8354 // there are. 
8355       //
8356       // The reasons for doing it this way for scalable vectors are:
8357       //  1. For the assume intrinsic generating the instruction for the first
8358       //     lane is still better than not generating any at all. For
8359       //     example, the input may be a splat across all lanes.
8360       //  2. For the lifetime start/end intrinsics the pointer operand only
8361       //     does anything useful when the input comes from a stack object,
8362       //     which suggests it should always be uniform. For non-stack objects
8363       //     the effect is to poison the object, which still allows us to
8364       //     remove the call.
8365       IsUniform = true;
8366       break;
8367     default:
8368       break;
8369     }
8370   }
8371
8372   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8373                                        IsUniform, IsPredicated);
8374   setRecipe(I, Recipe);
8375   Plan->addVPValue(I, Recipe);
8376
8377   // Find if I uses a predicated instruction. If so, it will use its scalar
8378   // value. Avoid hoisting the insert-element which packs the scalar value into
8379   // a vector value, as that happens iff all users use the vector value.
8380   for (VPValue *Op : Recipe->operands()) {
8381     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8382     if (!PredR)
8383       continue;
8384     auto *RepR =
8385         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8386     assert(RepR->isPredicated() &&
8387            "expected Replicate recipe to be predicated");
8388     RepR->setAlsoPack(false);
8389   }
8390
8391   // Finalize the recipe for Instr, first if it is not predicated.
8392   if (!IsPredicated) {
8393     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8394     VPBB->appendRecipe(Recipe);
8395     return VPBB;
8396   }
8397   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8398
8399   VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8400   assert(SingleSucc && "VPBB must have a single successor when handling "
8401                        "predicated replication.");
8402   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8403   // Record predicated instructions for above packing optimizations.
8404   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8405   VPBlockUtils::insertBlockAfter(Region, VPBB);
8406   auto *RegSucc = new VPBasicBlock();
8407   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8408   VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8409   return RegSucc;
8410 }
8411
8412 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8413                                                       VPRecipeBase *PredRecipe,
8414                                                       VPlanPtr &Plan) {
8415   // Instructions marked for predication are replicated and placed under an
8416   // if-then construct to prevent side-effects.
8417
8418   // Generate recipes to compute the block mask for this region.
8419   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8420
8421   // Build the triangular if-then region.
8422   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8423   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8424   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8425   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8426   auto *PHIRecipe = Instr->getType()->isVoidTy()
8427                         ?
nullptr 8428 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8429 if (PHIRecipe) { 8430 Plan->removeVPValueFor(Instr); 8431 Plan->addVPValue(Instr, PHIRecipe); 8432 } 8433 auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8434 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8435 VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true); 8436 8437 // Note: first set Entry as region entry and then connect successors starting 8438 // from it in order, to propagate the "parent" of each VPBasicBlock. 8439 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry); 8440 VPBlockUtils::connectBlocks(Pred, Exiting); 8441 8442 return Region; 8443 } 8444 8445 VPRecipeOrVPValueTy 8446 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8447 ArrayRef<VPValue *> Operands, 8448 VFRange &Range, VPlanPtr &Plan) { 8449 // First, check for specific widening recipes that deal with inductions, Phi 8450 // nodes, calls and memory operations. 8451 VPRecipeBase *Recipe; 8452 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8453 if (Phi->getParent() != OrigLoop->getHeader()) 8454 return tryToBlend(Phi, Operands, Plan); 8455 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) 8456 return toVPRecipeResult(Recipe); 8457 8458 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8459 assert((Legal->isReductionVariable(Phi) || 8460 Legal->isFirstOrderRecurrence(Phi)) && 8461 "can only widen reductions and first-order recurrences here"); 8462 VPValue *StartV = Operands[0]; 8463 if (Legal->isReductionVariable(Phi)) { 8464 const RecurrenceDescriptor &RdxDesc = 8465 Legal->getReductionVars().find(Phi)->second; 8466 assert(RdxDesc.getRecurrenceStartValue() == 8467 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8468 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8469 CM.isInLoopReduction(Phi), 8470 CM.useOrderedReductions(RdxDesc)); 8471 } else { 8472 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8473 } 8474 8475 // Record the incoming value from the backedge, so we can add the incoming 8476 // value from the backedge after all recipes have been created. 8477 recordRecipeOf(cast<Instruction>( 8478 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8479 PhisToFix.push_back(PhiRecipe); 8480 return toVPRecipeResult(PhiRecipe); 8481 } 8482 8483 if (isa<TruncInst>(Instr) && 8484 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8485 Range, *Plan))) 8486 return toVPRecipeResult(Recipe); 8487 8488 // All widen recipes below deal only with VF > 1. 
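  // If the clamped range only contains the scalar VF, bail out here; the
  // instruction is then handled by the replication path (handleReplication),
  // which also covers VF == 1.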
8489   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8490           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8491     return nullptr;
8492
8493   if (auto *CI = dyn_cast<CallInst>(Instr))
8494     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8495
8496   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8497     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8498
8499   if (!shouldWiden(Instr, Range))
8500     return nullptr;
8501
8502   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8503     return toVPRecipeResult(new VPWidenGEPRecipe(
8504         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8505
8506   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8507     bool InvariantCond =
8508         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8509     return toVPRecipeResult(new VPWidenSelectRecipe(
8510         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8511   }
8512
8513   return toVPRecipeResult(tryToWiden(Instr, Operands));
8514 }
8515
8516 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8517                                                         ElementCount MaxVF) {
8518   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8519
8520   // Collect instructions from the original loop that will become trivially dead
8521   // in the vectorized loop. We don't need to vectorize these instructions. For
8522   // example, original induction update instructions can become dead because we
8523   // separately emit induction "steps" when generating code for the new loop.
8524   // Similarly, we create a new latch condition when setting up the structure
8525   // of the new loop, so the old one can become dead.
8526   SmallPtrSet<Instruction *, 4> DeadInstructions;
8527   collectTriviallyDeadInstructions(DeadInstructions);
8528
8529   // Add assume instructions we need to drop to DeadInstructions, to prevent
8530   // them from being added to the VPlan.
8531   // TODO: We only need to drop assumes in blocks that get flattened. If the
8532   // control flow is preserved, we should keep them.
8533   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8534   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8535
8536   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8537   // Dead instructions do not need sinking. Remove them from SinkAfter.
8538   for (Instruction *I : DeadInstructions)
8539     SinkAfter.erase(I);
8540
8541   // Cannot sink instructions after dead instructions (there won't be any
8542   // recipes for them). Instead, find the first non-dead previous instruction.
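  // E.g., if the recorded sink target is itself a dead induction update, the
  // loop below walks backwards through its block (getPrevNode) until it finds
  // a live instruction and retargets the sink-after entry to it.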
8543 for (auto &P : Legal->getSinkAfter()) { 8544 Instruction *SinkTarget = P.second; 8545 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 8546 (void)FirstInst; 8547 while (DeadInstructions.contains(SinkTarget)) { 8548 assert( 8549 SinkTarget != FirstInst && 8550 "Must find a live instruction (at least the one feeding the " 8551 "first-order recurrence PHI) before reaching beginning of the block"); 8552 SinkTarget = SinkTarget->getPrevNode(); 8553 assert(SinkTarget != P.first && 8554 "sink source equals target, no sinking required"); 8555 } 8556 P.second = SinkTarget; 8557 } 8558 8559 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8560 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8561 VFRange SubRange = {VF, MaxVFPlusOne}; 8562 VPlans.push_back( 8563 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8564 VF = SubRange.End; 8565 } 8566 } 8567 8568 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a 8569 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a 8570 // BranchOnCount VPInstruction to the latch. 8571 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, 8572 bool HasNUW) { 8573 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8574 auto *StartV = Plan.getOrAddVPValue(StartIdx); 8575 8576 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8577 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8578 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8579 Header->insert(CanonicalIVPHI, Header->begin()); 8580 8581 auto *CanonicalIVIncrement = 8582 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW 8583 : VPInstruction::CanonicalIVIncrement, 8584 {CanonicalIVPHI}, DL); 8585 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8586 8587 VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); 8588 EB->appendRecipe(CanonicalIVIncrement); 8589 8590 auto *BranchOnCount = 8591 new VPInstruction(VPInstruction::BranchOnCount, 8592 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8593 EB->appendRecipe(BranchOnCount); 8594 } 8595 8596 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the 8597 // original exit block. 8598 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, 8599 VPBasicBlock *MiddleVPBB, Loop *OrigLoop, 8600 VPlan &Plan) { 8601 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); 8602 BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); 8603 // Only handle single-exit loops with unique exit blocks for now. 8604 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) 8605 return; 8606 8607 // Introduce VPUsers modeling the exit values. 8608 for (PHINode &ExitPhi : ExitBB->phis()) { 8609 Value *IncomingValue = 8610 ExitPhi.getIncomingValueForBlock(ExitingBB); 8611 VPValue *V = Plan.getOrAddVPValue(IncomingValue, true); 8612 Plan.addLiveOut(&ExitPhi, V); 8613 } 8614 } 8615 8616 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8617 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8618 const MapVector<Instruction *, Instruction *> &SinkAfter) { 8619 8620 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8621 8622 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8623 8624 // --------------------------------------------------------------------------- 8625 // Pre-construction: record ingredients whose recipes we'll need to further 8626 // process after constructing the initial VPlan. 
8627 // --------------------------------------------------------------------------- 8628 8629 // Mark instructions we'll need to sink later and their targets as 8630 // ingredients whose recipe we'll need to record. 8631 for (auto &Entry : SinkAfter) { 8632 RecipeBuilder.recordRecipeOf(Entry.first); 8633 RecipeBuilder.recordRecipeOf(Entry.second); 8634 } 8635 for (auto &Reduction : CM.getInLoopReductionChains()) { 8636 PHINode *Phi = Reduction.first; 8637 RecurKind Kind = 8638 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); 8639 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8640 8641 RecipeBuilder.recordRecipeOf(Phi); 8642 for (auto &R : ReductionOperations) { 8643 RecipeBuilder.recordRecipeOf(R); 8644 // For min/max reductions, where we have a pair of icmp/select, we also 8645 // need to record the ICmp recipe, so it can be removed later. 8646 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 8647 "Only min/max recurrences allowed for inloop reductions"); 8648 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 8649 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8650 } 8651 } 8652 8653 // For each interleave group which is relevant for this (possibly trimmed) 8654 // Range, add it to the set of groups to be later applied to the VPlan and add 8655 // placeholders for its members' Recipes which we'll be replacing with a 8656 // single VPInterleaveRecipe. 8657 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8658 auto applyIG = [IG, this](ElementCount VF) -> bool { 8659 return (VF.isVector() && // Query is illegal for VF == 1 8660 CM.getWideningDecision(IG->getInsertPos(), VF) == 8661 LoopVectorizationCostModel::CM_Interleave); 8662 }; 8663 if (!getDecisionAndClampRange(applyIG, Range)) 8664 continue; 8665 InterleaveGroups.insert(IG); 8666 for (unsigned i = 0; i < IG->getFactor(); i++) 8667 if (Instruction *Member = IG->getMember(i)) 8668 RecipeBuilder.recordRecipeOf(Member); 8669 }; 8670 8671 // --------------------------------------------------------------------------- 8672 // Build initial VPlan: Scan the body of the loop in a topological order to 8673 // visit each basic block after having visited its predecessor basic blocks. 8674 // --------------------------------------------------------------------------- 8675 8676 // Create initial VPlan skeleton, starting with a block for the pre-header, 8677 // followed by a region for the vector loop, followed by the middle block. The 8678 // skeleton vector loop region contains a header and latch block. 8679 VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); 8680 auto Plan = std::make_unique<VPlan>(Preheader); 8681 8682 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); 8683 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 8684 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 8685 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); 8686 VPBlockUtils::insertBlockAfter(TopRegion, Preheader); 8687 VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); 8688 VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); 8689 8690 Instruction *DLInst = 8691 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 8692 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), 8693 DLInst ? 
DLInst->getDebugLoc() : DebugLoc(), 8694 !CM.foldTailByMasking()); 8695 8696 // Scan the body of the loop in a topological order to visit each basic block 8697 // after having visited its predecessor basic blocks. 8698 LoopBlocksDFS DFS(OrigLoop); 8699 DFS.perform(LI); 8700 8701 VPBasicBlock *VPBB = HeaderVPBB; 8702 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 8703 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8704 // Relevant instructions from basic block BB will be grouped into VPRecipe 8705 // ingredients and fill a new VPBasicBlock. 8706 unsigned VPBBsForBB = 0; 8707 if (VPBB != HeaderVPBB) 8708 VPBB->setName(BB->getName()); 8709 Builder.setInsertPoint(VPBB); 8710 8711 // Introduce each ingredient into VPlan. 8712 // TODO: Model and preserve debug intrinsics in VPlan. 8713 for (Instruction &I : BB->instructionsWithoutDebug()) { 8714 Instruction *Instr = &I; 8715 8716 // First filter out irrelevant instructions, to ensure no recipes are 8717 // built for them. 8718 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 8719 continue; 8720 8721 SmallVector<VPValue *, 4> Operands; 8722 auto *Phi = dyn_cast<PHINode>(Instr); 8723 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 8724 Operands.push_back(Plan->getOrAddVPValue( 8725 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 8726 } else { 8727 auto OpRange = Plan->mapToVPValues(Instr->operands()); 8728 Operands = {OpRange.begin(), OpRange.end()}; 8729 } 8730 8731 // Invariant stores inside loop will be deleted and a single store 8732 // with the final reduction value will be added to the exit block 8733 StoreInst *SI; 8734 if ((SI = dyn_cast<StoreInst>(&I)) && 8735 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 8736 continue; 8737 8738 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 8739 Instr, Operands, Range, Plan)) { 8740 // If Instr can be simplified to an existing VPValue, use it. 8741 if (RecipeOrValue.is<VPValue *>()) { 8742 auto *VPV = RecipeOrValue.get<VPValue *>(); 8743 Plan->addVPValue(Instr, VPV); 8744 // If the re-used value is a recipe, register the recipe for the 8745 // instruction, in case the recipe for Instr needs to be recorded. 8746 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 8747 RecipeBuilder.setRecipe(Instr, R); 8748 continue; 8749 } 8750 // Otherwise, add the new recipe. 8751 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 8752 for (auto *Def : Recipe->definedValues()) { 8753 auto *UV = Def->getUnderlyingValue(); 8754 Plan->addVPValue(UV, Def); 8755 } 8756 8757 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 8758 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 8759 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 8760 // of the header block. That can happen for truncates of induction 8761 // variables. Those recipes are moved to the phi section of the header 8762 // block after applying SinkAfter, which relies on the original 8763 // position of the trunc. 8764 assert(isa<TruncInst>(Instr)); 8765 InductionsToMove.push_back( 8766 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 8767 } 8768 RecipeBuilder.setRecipe(Instr, Recipe); 8769 VPBB->appendRecipe(Recipe); 8770 continue; 8771 } 8772 8773 // Otherwise, if all widening options failed, Instruction is to be 8774 // replicated. This may create a successor for VPBB. 
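      // If the replicated instruction needs predication, handleReplication
      // wraps its recipe in a replicate region (see createReplicateRegion) and
      // returns the block following it, roughly:
      //
      //   VPBB -> pred.<opcode>.entry    (BranchOnMask)
      //             pred.<opcode>.if     (the replicate recipe)
      //           pred.<opcode>.continue (VPPredInstPHI, for non-void results)
      //        -> new successor VPBasicBlock (returned here)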
8775 VPBasicBlock *NextVPBB = 8776 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 8777 if (NextVPBB != VPBB) { 8778 VPBB = NextVPBB; 8779 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 8780 : ""); 8781 } 8782 } 8783 8784 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 8785 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 8786 } 8787 8788 HeaderVPBB->setName("vector.body"); 8789 8790 // Fold the last, empty block into its predecessor. 8791 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 8792 assert(VPBB && "expected to fold last (empty) block"); 8793 // After here, VPBB should not be used. 8794 VPBB = nullptr; 8795 8796 addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); 8797 8798 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 8799 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 8800 "entry block must be set to a VPRegionBlock having a non-empty entry " 8801 "VPBasicBlock"); 8802 RecipeBuilder.fixHeaderPhis(); 8803 8804 // --------------------------------------------------------------------------- 8805 // Transform initial VPlan: Apply previously taken decisions, in order, to 8806 // bring the VPlan to its final state. 8807 // --------------------------------------------------------------------------- 8808 8809 // Apply Sink-After legal constraints. 8810 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 8811 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 8812 if (Region && Region->isReplicator()) { 8813 assert(Region->getNumSuccessors() == 1 && 8814 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 8815 assert(R->getParent()->size() == 1 && 8816 "A recipe in an original replicator region must be the only " 8817 "recipe in its block"); 8818 return Region; 8819 } 8820 return nullptr; 8821 }; 8822 for (auto &Entry : SinkAfter) { 8823 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 8824 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 8825 8826 auto *TargetRegion = GetReplicateRegion(Target); 8827 auto *SinkRegion = GetReplicateRegion(Sink); 8828 if (!SinkRegion) { 8829 // If the sink source is not a replicate region, sink the recipe directly. 8830 if (TargetRegion) { 8831 // The target is in a replication region, make sure to move Sink to 8832 // the block after it, not into the replication region itself. 8833 VPBasicBlock *NextBlock = 8834 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 8835 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8836 } else 8837 Sink->moveAfter(Target); 8838 continue; 8839 } 8840 8841 // The sink source is in a replicate region. Unhook the region from the CFG. 8842 auto *SinkPred = SinkRegion->getSinglePredecessor(); 8843 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 8844 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 8845 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 8846 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 8847 8848 if (TargetRegion) { 8849 // The target recipe is also in a replicate region, move the sink region 8850 // after the target region. 
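      // Roughly, the CFG changes from
      //   ... -> TargetRegion -> TargetSucc -> ...
      // to
      //   ... -> TargetRegion -> SinkRegion -> TargetSucc -> ...
      // (SinkRegion was already unhooked from its old position above.)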
8851 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 8852 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 8853 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 8854 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 8855 } else { 8856 // The sink source is in a replicate region, we need to move the whole 8857 // replicate region, which should only contain a single recipe in the 8858 // main block. 8859 auto *SplitBlock = 8860 Target->getParent()->splitAt(std::next(Target->getIterator())); 8861 8862 auto *SplitPred = SplitBlock->getSinglePredecessor(); 8863 8864 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 8865 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 8866 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 8867 } 8868 } 8869 8870 VPlanTransforms::removeRedundantCanonicalIVs(*Plan); 8871 VPlanTransforms::removeRedundantInductionCasts(*Plan); 8872 8873 // Now that sink-after is done, move induction recipes for optimized truncates 8874 // to the phi section of the header block. 8875 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 8876 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 8877 8878 // Adjust the recipes for any inloop reductions. 8879 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan, 8880 RecipeBuilder, Range.Start); 8881 8882 // Introduce a recipe to combine the incoming and previous values of a 8883 // first-order recurrence. 8884 for (VPRecipeBase &R : 8885 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 8886 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 8887 if (!RecurPhi) 8888 continue; 8889 8890 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 8891 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 8892 auto *Region = GetReplicateRegion(PrevRecipe); 8893 if (Region) 8894 InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor()); 8895 if (!InsertBlock) { 8896 InsertBlock = new VPBasicBlock(Region->getName() + ".succ"); 8897 VPBlockUtils::insertBlockAfter(InsertBlock, Region); 8898 } 8899 if (Region || PrevRecipe->isPhi()) 8900 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 8901 else 8902 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 8903 8904 auto *RecurSplice = cast<VPInstruction>( 8905 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 8906 {RecurPhi, RecurPhi->getBackedgeValue()})); 8907 8908 RecurPhi->replaceAllUsesWith(RecurSplice); 8909 // Set the first operand of RecurSplice to RecurPhi again, after replacing 8910 // all users. 8911 RecurSplice->setOperand(0, RecurPhi); 8912 } 8913 8914 // Interleave memory: for each Interleave Group we marked earlier as relevant 8915 // for this VPlan, replace the Recipes widening its memory instructions with a 8916 // single VPInterleaveRecipe at its insertion point. 
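  // The member recipes are erased below: results of load members are rewired
  // to the corresponding values defined by the new VPInterleaveRecipe, and the
  // stored values of store members become its operands.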
8917 for (auto IG : InterleaveGroups) { 8918 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8919 RecipeBuilder.getRecipe(IG->getInsertPos())); 8920 SmallVector<VPValue *, 4> StoredValues; 8921 for (unsigned i = 0; i < IG->getFactor(); ++i) 8922 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 8923 auto *StoreR = 8924 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 8925 StoredValues.push_back(StoreR->getStoredValue()); 8926 } 8927 8928 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8929 Recipe->getMask()); 8930 VPIG->insertBefore(Recipe); 8931 unsigned J = 0; 8932 for (unsigned i = 0; i < IG->getFactor(); ++i) 8933 if (Instruction *Member = IG->getMember(i)) { 8934 if (!Member->getType()->isVoidTy()) { 8935 VPValue *OriginalV = Plan->getVPValue(Member); 8936 Plan->removeVPValueFor(Member); 8937 Plan->addVPValue(Member, VPIG->getVPValue(J)); 8938 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8939 J++; 8940 } 8941 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8942 } 8943 } 8944 8945 std::string PlanName; 8946 raw_string_ostream RSO(PlanName); 8947 ElementCount VF = Range.Start; 8948 Plan->addVF(VF); 8949 RSO << "Initial VPlan for VF={" << VF; 8950 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 8951 Plan->addVF(VF); 8952 RSO << "," << VF; 8953 } 8954 RSO << "},UF>=1"; 8955 RSO.flush(); 8956 Plan->setName(PlanName); 8957 8958 // From this point onwards, VPlan-to-VPlan transformations may change the plan 8959 // in ways that accessing values using original IR values is incorrect. 8960 Plan->disableValue2VPValue(); 8961 8962 VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); 8963 VPlanTransforms::sinkScalarOperands(*Plan); 8964 VPlanTransforms::mergeReplicateRegions(*Plan); 8965 VPlanTransforms::removeDeadRecipes(*Plan); 8966 VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); 8967 8968 // Fold Exit block into its predecessor if possible. 8969 // TODO: Fold block earlier once all VPlan transforms properly maintain a 8970 // VPBasicBlock as exit. 8971 VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting()); 8972 8973 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 8974 return Plan; 8975 } 8976 8977 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8978 // Outer loop handling: They may require CFG and instruction level 8979 // transformations before even evaluating whether vectorization is profitable. 8980 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8981 // the vectorization pipeline. 8982 assert(!OrigLoop->isInnermost()); 8983 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 8984 8985 // Create new empty VPlan 8986 auto Plan = std::make_unique<VPlan>(); 8987 8988 // Build hierarchical CFG 8989 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 8990 HCFGBuilder.buildHierarchicalCFG(); 8991 8992 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 8993 VF *= 2) 8994 Plan->addVF(VF); 8995 8996 SmallPtrSet<Instruction *, 1> DeadInstructions; 8997 VPlanTransforms::VPInstructionsToVPRecipes( 8998 OrigLoop, Plan, 8999 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 9000 DeadInstructions, *PSE.getSE()); 9001 9002 // Remove the existing terminator of the exiting block of the top-most region. 9003 // A BranchOnCount will be added instead when adding the canonical IV recipes. 
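  // (addCanonicalIVRecipes appends an increment of the canonical IV by VF * UF
  // and a BranchOnCount against the vector trip count to the exiting block.)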
9004   auto *Term =
9005       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9006   Term->eraseFromParent();
9007
9008   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9009                         true);
9010   return Plan;
9011 }
9012
9013 // Adjust the recipes for reductions. For in-loop reductions the chain of
9014 // instructions leading from the loop exit instr to the phi needs to be converted
9015 // to reductions, with one operand being vector and the other being the scalar
9016 // reduction chain. For other reductions, a select is introduced between the phi
9017 // and live-out recipes when folding the tail.
9018 void LoopVectorizationPlanner::adjustRecipesForReductions(
9019     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9020     ElementCount MinVF) {
9021   for (auto &Reduction : CM.getInLoopReductionChains()) {
9022     PHINode *Phi = Reduction.first;
9023     const RecurrenceDescriptor &RdxDesc =
9024         Legal->getReductionVars().find(Phi)->second;
9025     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9026
9027     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9028       continue;
9029
9030     // ReductionOperations are ordered top-down from the phi's use to the
9031     // LoopExitValue. We keep track of the previous item (the Chain) to tell
9032     // which of the two operands will remain scalar and which will be reduced.
9033     // For min/max reductions the chain consists of the select instructions.
9034     Instruction *Chain = Phi;
9035     for (Instruction *R : ReductionOperations) {
9036       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9037       RecurKind Kind = RdxDesc.getRecurrenceKind();
9038
9039       VPValue *ChainOp = Plan->getVPValue(Chain);
9040       unsigned FirstOpId;
9041       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9042              "Only min/max recurrences allowed for inloop reductions");
9043       // Recognize a call to the llvm.fmuladd intrinsic.
9044       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9045       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9046              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9047       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9048         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9049                "Expected to replace a VPWidenSelectSC");
9050         FirstOpId = 1;
9051       } else {
9052         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9053                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9054                "Expected to replace a VPWidenSC");
9055         FirstOpId = 0;
9056       }
9057       unsigned VecOpId =
9058           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9059       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9060
9061       auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
9062                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9063                          : nullptr;
9064
9065       if (IsFMulAdd) {
9066         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9067         // need to create an fmul recipe to use as the vector operand for the
9068         // fadd reduction.
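        // E.g. (names illustrative) %r = call fast float
        //   @llvm.fmuladd.f32(float %a, float %b, float %acc)
        // becomes an FMul recipe computing %a * %b that feeds the fadd
        // reduction, with the accumulator carried by the chain operand; the
        // call's fast-math flags are copied onto the new FMul.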
9069 VPInstruction *FMulRecipe = new VPInstruction( 9070 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9071 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9072 WidenRecipe->getParent()->insert(FMulRecipe, 9073 WidenRecipe->getIterator()); 9074 VecOp = FMulRecipe; 9075 } 9076 VPReductionRecipe *RedRecipe = 9077 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9078 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9079 Plan->removeVPValueFor(R); 9080 Plan->addVPValue(R, RedRecipe); 9081 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9082 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9083 WidenRecipe->eraseFromParent(); 9084 9085 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9086 VPRecipeBase *CompareRecipe = 9087 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9088 assert(isa<VPWidenRecipe>(CompareRecipe) && 9089 "Expected to replace a VPWidenSC"); 9090 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9091 "Expected no remaining users"); 9092 CompareRecipe->eraseFromParent(); 9093 } 9094 Chain = R; 9095 } 9096 } 9097 9098 // If tail is folded by masking, introduce selects between the phi 9099 // and the live-out instruction of each reduction, at the beginning of the 9100 // dedicated latch block. 9101 if (CM.foldTailByMasking()) { 9102 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9103 for (VPRecipeBase &R : 9104 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9105 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9106 if (!PhiR || PhiR->isInLoop()) 9107 continue; 9108 VPValue *Cond = 9109 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9110 VPValue *Red = PhiR->getBackedgeValue(); 9111 assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB && 9112 "reduction recipe must be defined before latch"); 9113 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9114 } 9115 } 9116 } 9117 9118 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9119 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9120 VPSlotTracker &SlotTracker) const { 9121 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9122 IG->getInsertPos()->printAsOperand(O, false); 9123 O << ", "; 9124 getAddr()->printAsOperand(O, SlotTracker); 9125 VPValue *Mask = getMask(); 9126 if (Mask) { 9127 O << ", "; 9128 Mask->printAsOperand(O, SlotTracker); 9129 } 9130 9131 unsigned OpIdx = 0; 9132 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9133 if (!IG->getMember(i)) 9134 continue; 9135 if (getNumStoreOperands() > 0) { 9136 O << "\n" << Indent << " store "; 9137 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9138 O << " to index " << i; 9139 } else { 9140 O << "\n" << Indent << " "; 9141 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9142 O << " = load from index " << i; 9143 } 9144 ++OpIdx; 9145 } 9146 } 9147 #endif 9148 9149 void VPWidenCallRecipe::execute(VPTransformState &State) { 9150 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9151 *this, State); 9152 } 9153 9154 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9155 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9156 State.setDebugLocFromInst(&I); 9157 9158 // The condition can be loop invariant but still defined inside the 9159 // loop. This means that we can't just use the original 'cond' value. 9160 // We have to take the 'vectorized' value and pick the first lane. 
9161 // Instcombine will make this a no-op. 9162 auto *InvarCond = 9163 InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; 9164 9165 for (unsigned Part = 0; Part < State.UF; ++Part) { 9166 Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part); 9167 Value *Op0 = State.get(getOperand(1), Part); 9168 Value *Op1 = State.get(getOperand(2), Part); 9169 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 9170 State.set(this, Sel, Part); 9171 State.addMetadata(Sel, &I); 9172 } 9173 } 9174 9175 void VPWidenRecipe::execute(VPTransformState &State) { 9176 auto &I = *cast<Instruction>(getUnderlyingValue()); 9177 auto &Builder = State.Builder; 9178 switch (I.getOpcode()) { 9179 case Instruction::Call: 9180 case Instruction::Br: 9181 case Instruction::PHI: 9182 case Instruction::GetElementPtr: 9183 case Instruction::Select: 9184 llvm_unreachable("This instruction is handled by a different recipe."); 9185 case Instruction::UDiv: 9186 case Instruction::SDiv: 9187 case Instruction::SRem: 9188 case Instruction::URem: 9189 case Instruction::Add: 9190 case Instruction::FAdd: 9191 case Instruction::Sub: 9192 case Instruction::FSub: 9193 case Instruction::FNeg: 9194 case Instruction::Mul: 9195 case Instruction::FMul: 9196 case Instruction::FDiv: 9197 case Instruction::FRem: 9198 case Instruction::Shl: 9199 case Instruction::LShr: 9200 case Instruction::AShr: 9201 case Instruction::And: 9202 case Instruction::Or: 9203 case Instruction::Xor: { 9204 // Just widen unops and binops. 9205 State.setDebugLocFromInst(&I); 9206 9207 for (unsigned Part = 0; Part < State.UF; ++Part) { 9208 SmallVector<Value *, 2> Ops; 9209 for (VPValue *VPOp : operands()) 9210 Ops.push_back(State.get(VPOp, Part)); 9211 9212 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 9213 9214 if (auto *VecOp = dyn_cast<Instruction>(V)) { 9215 VecOp->copyIRFlags(&I); 9216 9217 // If the instruction is vectorized and was in a basic block that needed 9218 // predication, we can't propagate poison-generating flags (nuw/nsw, 9219 // exact, etc.). The control flow has been linearized and the 9220 // instruction is no longer guarded by the predicate, which could make 9221 // the flag properties to no longer hold. 9222 if (State.MayGeneratePoisonRecipes.contains(this)) 9223 VecOp->dropPoisonGeneratingFlags(); 9224 } 9225 9226 // Use this vector value for all users of the original instruction. 9227 State.set(this, V, Part); 9228 State.addMetadata(V, &I); 9229 } 9230 9231 break; 9232 } 9233 case Instruction::Freeze: { 9234 State.setDebugLocFromInst(&I); 9235 9236 for (unsigned Part = 0; Part < State.UF; ++Part) { 9237 Value *Op = State.get(getOperand(0), Part); 9238 9239 Value *Freeze = Builder.CreateFreeze(Op); 9240 State.set(this, Freeze, Part); 9241 } 9242 break; 9243 } 9244 case Instruction::ICmp: 9245 case Instruction::FCmp: { 9246 // Widen compares. Generate vector compares. 9247 bool FCmp = (I.getOpcode() == Instruction::FCmp); 9248 auto *Cmp = cast<CmpInst>(&I); 9249 State.setDebugLocFromInst(Cmp); 9250 for (unsigned Part = 0; Part < State.UF; ++Part) { 9251 Value *A = State.get(getOperand(0), Part); 9252 Value *B = State.get(getOperand(1), Part); 9253 Value *C = nullptr; 9254 if (FCmp) { 9255 // Propagate fast math flags. 
9256 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9257 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9258 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9259 } else { 9260 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9261 } 9262 State.set(this, C, Part); 9263 State.addMetadata(C, &I); 9264 } 9265 9266 break; 9267 } 9268 9269 case Instruction::ZExt: 9270 case Instruction::SExt: 9271 case Instruction::FPToUI: 9272 case Instruction::FPToSI: 9273 case Instruction::FPExt: 9274 case Instruction::PtrToInt: 9275 case Instruction::IntToPtr: 9276 case Instruction::SIToFP: 9277 case Instruction::UIToFP: 9278 case Instruction::Trunc: 9279 case Instruction::FPTrunc: 9280 case Instruction::BitCast: { 9281 auto *CI = cast<CastInst>(&I); 9282 State.setDebugLocFromInst(CI); 9283 9284 /// Vectorize casts. 9285 Type *DestTy = (State.VF.isScalar()) 9286 ? CI->getType() 9287 : VectorType::get(CI->getType(), State.VF); 9288 9289 for (unsigned Part = 0; Part < State.UF; ++Part) { 9290 Value *A = State.get(getOperand(0), Part); 9291 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9292 State.set(this, Cast, Part); 9293 State.addMetadata(Cast, &I); 9294 } 9295 break; 9296 } 9297 default: 9298 // This instruction is not vectorized by simple widening. 9299 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9300 llvm_unreachable("Unhandled instruction!"); 9301 } // end of switch. 9302 } 9303 9304 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9305 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9306 // Construct a vector GEP by widening the operands of the scalar GEP as 9307 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9308 // results in a vector of pointers when at least one operand of the GEP 9309 // is vector-typed. Thus, to keep the representation compact, we only use 9310 // vector-typed operands for loop-varying values. 9311 9312 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9313 // If we are vectorizing, but the GEP has only loop-invariant operands, 9314 // the GEP we build (by only using vector-typed operands for 9315 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9316 // produce a vector of pointers, we need to either arbitrarily pick an 9317 // operand to broadcast, or broadcast a clone of the original GEP. 9318 // Here, we broadcast a clone of the original. 9319 // 9320 // TODO: If at some point we decide to scalarize instructions having 9321 // loop-invariant operands, this special case will no longer be 9322 // required. We would add the scalarization decision to 9323 // collectLoopScalars() and teach getVectorValue() to broadcast 9324 // the lane-zero scalar value. 9325 auto *Clone = State.Builder.Insert(GEP->clone()); 9326 for (unsigned Part = 0; Part < State.UF; ++Part) { 9327 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9328 State.set(this, EntryPart, Part); 9329 State.addMetadata(EntryPart, GEP); 9330 } 9331 } else { 9332 // If the GEP has at least one loop-varying operand, we are sure to 9333 // produce a vector of pointers. But if we are only unrolling, we want 9334 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9335 // produce with the code below will be scalar (if VF == 1) or vector 9336 // (otherwise). Note that for the unroll-only case, we still maintain 9337 // values in the vector mapping with initVector, as we do for other 9338 // instructions. 
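    // For example, a GEP with a loop-invariant base pointer and a single
    // loop-varying index takes the base from lane 0 and the index from the
    // per-part vector value, producing one vector-of-pointers GEP per part.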
9339 for (unsigned Part = 0; Part < State.UF; ++Part) { 9340 // The pointer operand of the new GEP. If it's loop-invariant, we 9341 // won't broadcast it. 9342 auto *Ptr = IsPtrLoopInvariant 9343 ? State.get(getOperand(0), VPIteration(0, 0)) 9344 : State.get(getOperand(0), Part); 9345 9346 // Collect all the indices for the new GEP. If any index is 9347 // loop-invariant, we won't broadcast it. 9348 SmallVector<Value *, 4> Indices; 9349 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 9350 VPValue *Operand = getOperand(I); 9351 if (IsIndexLoopInvariant[I - 1]) 9352 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 9353 else 9354 Indices.push_back(State.get(Operand, Part)); 9355 } 9356 9357 // If the GEP instruction is vectorized and was in a basic block that 9358 // needed predication, we can't propagate the poison-generating 'inbounds' 9359 // flag. The control flow has been linearized and the GEP is no longer 9360 // guarded by the predicate, which could make the 'inbounds' properties to 9361 // no longer hold. 9362 bool IsInBounds = 9363 GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; 9364 9365 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 9366 // but it should be a vector, otherwise. 9367 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, 9368 Indices, "", IsInBounds); 9369 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 9370 "NewGEP is not a pointer vector"); 9371 State.set(this, NewGEP, Part); 9372 State.addMetadata(NewGEP, GEP); 9373 } 9374 } 9375 } 9376 9377 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9378 assert(!State.Instance && "Int or FP induction being replicated."); 9379 9380 Value *Start = getStartValue()->getLiveInIRValue(); 9381 const InductionDescriptor &ID = getInductionDescriptor(); 9382 TruncInst *Trunc = getTruncInst(); 9383 IRBuilderBase &Builder = State.Builder; 9384 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 9385 assert(State.VF.isVector() && "must have vector VF"); 9386 9387 // The value from the original loop to which we are mapping the new induction 9388 // variable. 9389 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 9390 9391 // Fast-math-flags propagate from the original induction instruction. 9392 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9393 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 9394 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 9395 9396 // Now do the actual transformations, and start with fetching the step value. 
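  // Roughly, the widened IV is formed as
  //   vec.ind (part 0) = splat(Start) + <0, 1, ..., VF-1> * Step
  //   part k           = vec.ind + k * splat(RuntimeVF * Step)
  // with a final "vec.ind.next" add feeding back into the vec.ind phi
  // (FP inductions use the induction opcode and fmul instead of add/mul).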
9397 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9398 9399 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 9400 "Expected either an induction phi-node or a truncate of it!"); 9401 9402 // Construct the initial value of the vector IV in the vector loop preheader 9403 auto CurrIP = Builder.saveIP(); 9404 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9405 Builder.SetInsertPoint(VectorPH->getTerminator()); 9406 if (isa<TruncInst>(EntryVal)) { 9407 assert(Start->getType()->isIntegerTy() && 9408 "Truncation requires an integer type"); 9409 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 9410 Step = Builder.CreateTrunc(Step, TruncType); 9411 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 9412 } 9413 9414 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 9415 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 9416 Value *SteppedStart = getStepVector( 9417 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); 9418 9419 // We create vector phi nodes for both integer and floating-point induction 9420 // variables. Here, we determine the kind of arithmetic we will perform. 9421 Instruction::BinaryOps AddOp; 9422 Instruction::BinaryOps MulOp; 9423 if (Step->getType()->isIntegerTy()) { 9424 AddOp = Instruction::Add; 9425 MulOp = Instruction::Mul; 9426 } else { 9427 AddOp = ID.getInductionOpcode(); 9428 MulOp = Instruction::FMul; 9429 } 9430 9431 // Multiply the vectorization factor by the step using integer or 9432 // floating-point arithmetic as appropriate. 9433 Type *StepType = Step->getType(); 9434 Value *RuntimeVF; 9435 if (Step->getType()->isFloatingPointTy()) 9436 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 9437 else 9438 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 9439 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 9440 9441 // Create a vector splat to use in the induction update. 9442 // 9443 // FIXME: If the step is non-constant, we create the vector splat with 9444 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 9445 // handle a constant vector splat. 9446 Value *SplatVF = isa<Constant>(Mul) 9447 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 9448 : Builder.CreateVectorSplat(State.VF, Mul); 9449 Builder.restoreIP(CurrIP); 9450 9451 // We may need to add the step a number of times, depending on the unroll 9452 // factor. The last of those goes into the PHI. 9453 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 9454 &*State.CFG.PrevBB->getFirstInsertionPt()); 9455 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 9456 Instruction *LastInduction = VecInd; 9457 for (unsigned Part = 0; Part < State.UF; ++Part) { 9458 State.set(this, LastInduction, Part); 9459 9460 if (isa<TruncInst>(EntryVal)) 9461 State.addMetadata(LastInduction, EntryVal); 9462 9463 LastInduction = cast<Instruction>( 9464 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 9465 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 9466 } 9467 9468 LastInduction->setName("vec.ind.next"); 9469 VecInd->addIncoming(SteppedStart, VectorPH); 9470 // Add induction update using an incorrect block temporarily. The phi node 9471 // will be fixed after VPlan execution. Note that at this point the latch 9472 // block cannot be used, as it does not exist yet. 9473 // TODO: Model increment value in VPlan, by turning the recipe into a 9474 // multi-def and a subclass of VPHeaderPHIRecipe. 
9475 VecInd->addIncoming(LastInduction, VectorPH); 9476 } 9477 9478 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 9479 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 9480 "Not a pointer induction according to InductionDescriptor!"); 9481 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 9482 "Unexpected type."); 9483 9484 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 9485 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 9486 9487 if (onlyScalarsGenerated(State.VF)) { 9488 // This is the normalized GEP that starts counting at zero. 9489 Value *PtrInd = State.Builder.CreateSExtOrTrunc( 9490 CanonicalIV, IndDesc.getStep()->getType()); 9491 // Determine the number of scalars we need to generate for each unroll 9492 // iteration. If the instruction is uniform, we only need to generate the 9493 // first lane. Otherwise, we generate all VF values. 9494 bool IsUniform = vputils::onlyFirstLaneUsed(this); 9495 assert((IsUniform || !State.VF.isScalable()) && 9496 "Cannot scalarize a scalable VF"); 9497 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 9498 9499 for (unsigned Part = 0; Part < State.UF; ++Part) { 9500 Value *PartStart = 9501 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); 9502 9503 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 9504 Value *Idx = State.Builder.CreateAdd( 9505 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 9506 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); 9507 9508 Value *Step = CreateStepValue(IndDesc.getStep(), SE, 9509 State.CFG.PrevBB->getTerminator()); 9510 Value *SclrGep = emitTransformedIndex( 9511 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); 9512 SclrGep->setName("next.gep"); 9513 State.set(this, SclrGep, VPIteration(Part, Lane)); 9514 } 9515 } 9516 return; 9517 } 9518 9519 assert(isa<SCEVConstant>(IndDesc.getStep()) && 9520 "Induction step not a SCEV constant!"); 9521 Type *PhiType = IndDesc.getStep()->getType(); 9522 9523 // Build a pointer phi 9524 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 9525 Type *ScStValueType = ScalarStartValue->getType(); 9526 PHINode *NewPointerPhi = 9527 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 9528 9529 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9530 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 9531 9532 // A pointer induction, performed by using a gep 9533 const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout(); 9534 Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); 9535 9536 const SCEV *ScalarStep = IndDesc.getStep(); 9537 SCEVExpander Exp(SE, DL, "induction"); 9538 Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 9539 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 9540 Value *NumUnrolledElems = 9541 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 9542 Value *InductionGEP = GetElementPtrInst::Create( 9543 IndDesc.getElementType(), NewPointerPhi, 9544 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 9545 InductionLoc); 9546 // Add induction update using an incorrect block temporarily. The phi node 9547 // will be fixed after VPlan execution. Note that at this point the latch 9548 // block cannot be used, as it does not exist yet. 9549 // TODO: Model increment value in VPlan, by turning the recipe into a 9550 // multi-def and a subclass of VPHeaderPHIRecipe. 
9551 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 9552 9553 // Create UF many actual address geps that use the pointer 9554 // phi as base and a vectorized version of the step value 9555 // (<step*0, ..., step*N>) as offset. 9556 for (unsigned Part = 0; Part < State.UF; ++Part) { 9557 Type *VecPhiType = VectorType::get(PhiType, State.VF); 9558 Value *StartOffsetScalar = 9559 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 9560 Value *StartOffset = 9561 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 9562 // Create a vector of consecutive numbers from zero to VF. 9563 StartOffset = State.Builder.CreateAdd( 9564 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 9565 9566 Value *GEP = State.Builder.CreateGEP( 9567 IndDesc.getElementType(), NewPointerPhi, 9568 State.Builder.CreateMul( 9569 StartOffset, 9570 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), 9571 "vector.gep")); 9572 State.set(this, GEP, Part); 9573 } 9574 } 9575 9576 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 9577 assert(!State.Instance && "VPScalarIVStepsRecipe being replicated."); 9578 9579 // Fast-math-flags propagate from the original induction instruction. 9580 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9581 if (IndDesc.getInductionBinOp() && 9582 isa<FPMathOperator>(IndDesc.getInductionBinOp())) 9583 State.Builder.setFastMathFlags( 9584 IndDesc.getInductionBinOp()->getFastMathFlags()); 9585 9586 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9587 auto CreateScalarIV = [&](Value *&Step) -> Value * { 9588 Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0)); 9589 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); 9590 if (!isCanonical() || CanonicalIV->getType() != Ty) { 9591 ScalarIV = 9592 Ty->isIntegerTy() 9593 ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty) 9594 : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty); 9595 ScalarIV = emitTransformedIndex(State.Builder, ScalarIV, 9596 getStartValue()->getLiveInIRValue(), Step, 9597 IndDesc); 9598 ScalarIV->setName("offset.idx"); 9599 } 9600 if (TruncToTy) { 9601 assert(Step->getType()->isIntegerTy() && 9602 "Truncation requires an integer step"); 9603 ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy); 9604 Step = State.Builder.CreateTrunc(Step, TruncToTy); 9605 } 9606 return ScalarIV; 9607 }; 9608 9609 Value *ScalarIV = CreateScalarIV(Step); 9610 if (State.VF.isVector()) { 9611 buildScalarSteps(ScalarIV, Step, IndDesc, this, State); 9612 return; 9613 } 9614 9615 for (unsigned Part = 0; Part < State.UF; ++Part) { 9616 assert(!State.VF.isScalable() && "scalable vectors not yet supported."); 9617 Value *EntryPart; 9618 if (Step->getType()->isFloatingPointTy()) { 9619 Value *StartIdx = 9620 getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part); 9621 // Floating-point operations inherit FMF via the builder's flags. 
9622 Value *MulOp = State.Builder.CreateFMul(StartIdx, Step); 9623 EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(), 9624 ScalarIV, MulOp); 9625 } else { 9626 Value *StartIdx = 9627 getRuntimeVF(State.Builder, Step->getType(), State.VF * Part); 9628 EntryPart = State.Builder.CreateAdd( 9629 ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction"); 9630 } 9631 State.set(this, EntryPart, Part); 9632 } 9633 } 9634 9635 void VPBlendRecipe::execute(VPTransformState &State) { 9636 State.setDebugLocFromInst(Phi); 9637 // We know that all PHIs in non-header blocks are converted into 9638 // selects, so we don't have to worry about the insertion order and we 9639 // can just use the builder. 9640 // At this point we generate the predication tree. There may be 9641 // duplications since this is a simple recursive scan, but future 9642 // optimizations will clean it up. 9643 9644 unsigned NumIncoming = getNumIncomingValues(); 9645 9646 // Generate a sequence of selects of the form: 9647 // SELECT(Mask3, In3, 9648 // SELECT(Mask2, In2, 9649 // SELECT(Mask1, In1, 9650 // In0))) 9651 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9652 // are essentially undef are taken from In0. 9653 InnerLoopVectorizer::VectorParts Entry(State.UF); 9654 for (unsigned In = 0; In < NumIncoming; ++In) { 9655 for (unsigned Part = 0; Part < State.UF; ++Part) { 9656 // We might have single edge PHIs (blocks) - use an identity 9657 // 'select' for the first PHI operand. 9658 Value *In0 = State.get(getIncomingValue(In), Part); 9659 if (In == 0) 9660 Entry[Part] = In0; // Initialize with the first incoming value. 9661 else { 9662 // Select between the current value and the previous incoming edge 9663 // based on the incoming mask. 9664 Value *Cond = State.get(getMask(In), Part); 9665 Entry[Part] = 9666 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9667 } 9668 } 9669 } 9670 for (unsigned Part = 0; Part < State.UF; ++Part) 9671 State.set(this, Entry[Part], Part); 9672 } 9673 9674 void VPInterleaveRecipe::execute(VPTransformState &State) { 9675 assert(!State.Instance && "Interleave group being replicated."); 9676 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9677 getStoredValues(), getMask()); 9678 } 9679 9680 void VPReductionRecipe::execute(VPTransformState &State) { 9681 assert(!State.Instance && "Reduction being replicated."); 9682 Value *PrevInChain = State.get(getChainOp(), 0); 9683 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9684 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9685 // Propagate the fast-math flags carried by the underlying instruction. 
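// The guard below is RAII-scoped: the builder's previous fast-math flags are
// restored automatically when this function returns.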
9686 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9687 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9688 for (unsigned Part = 0; Part < State.UF; ++Part) {
9689 Value *NewVecOp = State.get(getVecOp(), Part);
9690 if (VPValue *Cond = getCondOp()) {
9691 Value *NewCond = State.get(Cond, Part);
9692 VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9693 Value *Iden = RdxDesc->getRecurrenceIdentity(
9694 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9695 Value *IdenVec =
9696 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9697 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9698 NewVecOp = Select;
9699 }
9700 Value *NewRed;
9701 Value *NextInChain;
9702 if (IsOrdered) {
9703 if (State.VF.isVector())
9704 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9705 PrevInChain);
9706 else
9707 NewRed = State.Builder.CreateBinOp(
9708 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9709 NewVecOp);
9710 PrevInChain = NewRed;
9711 } else {
9712 PrevInChain = State.get(getChainOp(), Part);
9713 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9714 }
9715 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9716 NextInChain =
9717 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9718 NewRed, PrevInChain);
9719 } else if (IsOrdered)
9720 NextInChain = NewRed;
9721 else
9722 NextInChain = State.Builder.CreateBinOp(
9723 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9724 PrevInChain);
9725 State.set(this, NextInChain, Part);
9726 }
9727 }
9728
9729 void VPReplicateRecipe::execute(VPTransformState &State) {
9730 if (State.Instance) { // Generate a single instance.
9731 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9732 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9733 IsPredicated, State);
9734 // Insert scalar instance packing it into a vector.
9735 if (AlsoPack && State.VF.isVector()) {
9736 // If we're constructing lane 0, initialize to start from poison.
9737 if (State.Instance->Lane.isFirstLane()) {
9738 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9739 Value *Poison = PoisonValue::get(
9740 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9741 State.set(this, Poison, State.Instance->Part);
9742 }
9743 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9744 }
9745 return;
9746 }
9747
9748 // Generate scalar instances for all VF lanes of all UF parts, unless the
9749 // instruction is uniform, in which case generate only the first lane for each
9750 // of the UF parts.
9751 unsigned EndLane = IsUniform ?
1 : State.VF.getKnownMinValue(); 9752 assert((!State.VF.isScalable() || IsUniform) && 9753 "Can't scalarize a scalable vector"); 9754 for (unsigned Part = 0; Part < State.UF; ++Part) 9755 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9756 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, 9757 VPIteration(Part, Lane), IsPredicated, 9758 State); 9759 } 9760 9761 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 9762 assert(State.Instance && "Branch on Mask works only on single instance."); 9763 9764 unsigned Part = State.Instance->Part; 9765 unsigned Lane = State.Instance->Lane.getKnownLane(); 9766 9767 Value *ConditionBit = nullptr; 9768 VPValue *BlockInMask = getMask(); 9769 if (BlockInMask) { 9770 ConditionBit = State.get(BlockInMask, Part); 9771 if (ConditionBit->getType()->isVectorTy()) 9772 ConditionBit = State.Builder.CreateExtractElement( 9773 ConditionBit, State.Builder.getInt32(Lane)); 9774 } else // Block in mask is all-one. 9775 ConditionBit = State.Builder.getTrue(); 9776 9777 // Replace the temporary unreachable terminator with a new conditional branch, 9778 // whose two destinations will be set later when they are created. 9779 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 9780 assert(isa<UnreachableInst>(CurrentTerminator) && 9781 "Expected to replace unreachable terminator with conditional branch."); 9782 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 9783 CondBr->setSuccessor(0, nullptr); 9784 ReplaceInstWithInst(CurrentTerminator, CondBr); 9785 } 9786 9787 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 9788 assert(State.Instance && "Predicated instruction PHI works per instance."); 9789 Instruction *ScalarPredInst = 9790 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 9791 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 9792 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 9793 assert(PredicatingBB && "Predicated block has no single predecessor."); 9794 assert(isa<VPReplicateRecipe>(getOperand(0)) && 9795 "operand must be VPReplicateRecipe"); 9796 9797 // By current pack/unpack logic we need to generate only a single phi node: if 9798 // a vector value for the predicated instruction exists at this point it means 9799 // the instruction has vector users only, and a phi for the vector value is 9800 // needed. In this case the recipe of the predicated instruction is marked to 9801 // also do that packing, thereby "hoisting" the insert-element sequence. 9802 // Otherwise, a phi node for the scalar value is needed. 9803 unsigned Part = State.Instance->Part; 9804 if (State.hasVectorValue(getOperand(0), Part)) { 9805 Value *VectorValue = State.get(getOperand(0), Part); 9806 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9807 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9808 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9809 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9810 if (State.hasVectorValue(this, Part)) 9811 State.reset(this, VPhi, Part); 9812 else 9813 State.set(this, VPhi, Part); 9814 // NOTE: Currently we need to update the value of the operand, so the next 9815 // predicated iteration inserts its generated value in the correct vector. 
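// For reference, the vector merge created above has roughly this shape
// (hedged; names are illustrative):
//   %phi = phi <VF x Ty> [ %vec.before.insert, %predicating.bb ],
//                        [ %vec.with.new.lane, %predicated.bb ]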
9816 State.reset(getOperand(0), VPhi, Part); 9817 } else { 9818 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9819 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9820 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9821 PredicatingBB); 9822 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9823 if (State.hasScalarValue(this, *State.Instance)) 9824 State.reset(this, Phi, *State.Instance); 9825 else 9826 State.set(this, Phi, *State.Instance); 9827 // NOTE: Currently we need to update the value of the operand, so the next 9828 // predicated iteration inserts its generated value in the correct vector. 9829 State.reset(getOperand(0), Phi, *State.Instance); 9830 } 9831 } 9832 9833 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9834 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9835 9836 // Attempt to issue a wide load. 9837 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9838 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9839 9840 assert((LI || SI) && "Invalid Load/Store instruction"); 9841 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9842 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9843 9844 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9845 9846 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9847 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9848 bool CreateGatherScatter = !Consecutive; 9849 9850 auto &Builder = State.Builder; 9851 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9852 bool isMaskRequired = getMask(); 9853 if (isMaskRequired) 9854 for (unsigned Part = 0; Part < State.UF; ++Part) 9855 BlockInMaskParts[Part] = State.get(getMask(), Part); 9856 9857 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9858 // Calculate the pointer for the specific unroll-part. 9859 GetElementPtrInst *PartPtr = nullptr; 9860 9861 bool InBounds = false; 9862 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9863 InBounds = gep->isInBounds(); 9864 if (Reverse) { 9865 // If the address is consecutive but reversed, then the 9866 // wide store needs to start at the last vector element. 9867 // RunTimeVF = VScale * VF.getKnownMinValue() 9868 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9869 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 9870 // NumElt = -Part * RunTimeVF 9871 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 9872 // LastLane = 1 - RunTimeVF 9873 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 9874 PartPtr = 9875 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 9876 PartPtr->setIsInBounds(InBounds); 9877 PartPtr = cast<GetElementPtrInst>( 9878 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 9879 PartPtr->setIsInBounds(InBounds); 9880 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
9881 BlockInMaskParts[Part] = 9882 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9883 } else { 9884 Value *Increment = 9885 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 9886 PartPtr = cast<GetElementPtrInst>( 9887 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 9888 PartPtr->setIsInBounds(InBounds); 9889 } 9890 9891 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 9892 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 9893 }; 9894 9895 // Handle Stores: 9896 if (SI) { 9897 State.setDebugLocFromInst(SI); 9898 9899 for (unsigned Part = 0; Part < State.UF; ++Part) { 9900 Instruction *NewSI = nullptr; 9901 Value *StoredVal = State.get(StoredValue, Part); 9902 if (CreateGatherScatter) { 9903 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9904 Value *VectorGep = State.get(getAddr(), Part); 9905 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9906 MaskPart); 9907 } else { 9908 if (Reverse) { 9909 // If we store to reverse consecutive memory locations, then we need 9910 // to reverse the order of elements in the stored value. 9911 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9912 // We don't want to update the value in the map as it might be used in 9913 // another expression. So don't call resetVectorValue(StoredVal). 9914 } 9915 auto *VecPtr = 9916 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9917 if (isMaskRequired) 9918 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9919 BlockInMaskParts[Part]); 9920 else 9921 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9922 } 9923 State.addMetadata(NewSI, SI); 9924 } 9925 return; 9926 } 9927 9928 // Handle loads. 9929 assert(LI && "Must have a load instruction"); 9930 State.setDebugLocFromInst(LI); 9931 for (unsigned Part = 0; Part < State.UF; ++Part) { 9932 Value *NewLI; 9933 if (CreateGatherScatter) { 9934 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9935 Value *VectorGep = State.get(getAddr(), Part); 9936 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 9937 nullptr, "wide.masked.gather"); 9938 State.addMetadata(NewLI, LI); 9939 } else { 9940 auto *VecPtr = 9941 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9942 if (isMaskRequired) 9943 NewLI = Builder.CreateMaskedLoad( 9944 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 9945 PoisonValue::get(DataTy), "wide.masked.load"); 9946 else 9947 NewLI = 9948 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 9949 9950 // Add metadata to the load, but setVectorValue to the reverse shuffle. 9951 State.addMetadata(NewLI, LI); 9952 if (Reverse) 9953 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 9954 } 9955 9956 State.set(getVPSingleValue(), NewLI, Part); 9957 } 9958 } 9959 9960 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9961 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9962 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9963 // for predication. 
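// The four checks are applied in that order; the first one that applies
// decides, and if none do, a scalar epilogue remains allowed.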
9964 static ScalarEpilogueLowering getScalarEpilogueLowering( 9965 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9966 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9967 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9968 LoopVectorizationLegality &LVL) { 9969 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9970 // don't look at hints or options, and don't request a scalar epilogue. 9971 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9972 // LoopAccessInfo (due to code dependency and not being able to reliably get 9973 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9974 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9975 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9976 // back to the old way and vectorize with versioning when forced. See D81345.) 9977 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9978 PGSOQueryType::IRPass) && 9979 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9980 return CM_ScalarEpilogueNotAllowedOptSize; 9981 9982 // 2) If set, obey the directives 9983 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9984 switch (PreferPredicateOverEpilogue) { 9985 case PreferPredicateTy::ScalarEpilogue: 9986 return CM_ScalarEpilogueAllowed; 9987 case PreferPredicateTy::PredicateElseScalarEpilogue: 9988 return CM_ScalarEpilogueNotNeededUsePredicate; 9989 case PreferPredicateTy::PredicateOrDontVectorize: 9990 return CM_ScalarEpilogueNotAllowedUsePredicate; 9991 }; 9992 } 9993 9994 // 3) If set, obey the hints 9995 switch (Hints.getPredicate()) { 9996 case LoopVectorizeHints::FK_Enabled: 9997 return CM_ScalarEpilogueNotNeededUsePredicate; 9998 case LoopVectorizeHints::FK_Disabled: 9999 return CM_ScalarEpilogueAllowed; 10000 }; 10001 10002 // 4) if the TTI hook indicates this is profitable, request predication. 10003 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10004 LVL.getLAI())) 10005 return CM_ScalarEpilogueNotNeededUsePredicate; 10006 10007 return CM_ScalarEpilogueAllowed; 10008 } 10009 10010 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10011 // If Values have been set for this Def return the one relevant for \p Part. 10012 if (hasVectorValue(Def, Part)) 10013 return Data.PerPartOutput[Def][Part]; 10014 10015 if (!hasScalarValue(Def, {Part, 0})) { 10016 Value *IRV = Def->getLiveInIRValue(); 10017 Value *B = ILV->getBroadcastInstrs(IRV); 10018 set(Def, B, Part); 10019 return B; 10020 } 10021 10022 Value *ScalarValue = get(Def, {Part, 0}); 10023 // If we aren't vectorizing, we can just copy the scalar map values over 10024 // to the vector map. 10025 if (VF.isScalar()) { 10026 set(Def, ScalarValue, Part); 10027 return ScalarValue; 10028 } 10029 10030 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10031 bool IsUniform = RepR && RepR->isUniform(); 10032 10033 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 10034 // Check if there is a scalar value for the selected lane. 10035 if (!hasScalarValue(Def, {Part, LastLane})) { 10036 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 
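// Only lane 0 exists for such recipes, so treat the value as uniform and
// broadcast that single lane below.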
10037 assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) || 10038 isa<VPScalarIVStepsRecipe>(Def->getDef())) && 10039 "unexpected recipe found to be invariant"); 10040 IsUniform = true; 10041 LastLane = 0; 10042 } 10043 10044 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10045 // Set the insert point after the last scalarized instruction or after the 10046 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10047 // will directly follow the scalar definitions. 10048 auto OldIP = Builder.saveIP(); 10049 auto NewIP = 10050 isa<PHINode>(LastInst) 10051 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10052 : std::next(BasicBlock::iterator(LastInst)); 10053 Builder.SetInsertPoint(&*NewIP); 10054 10055 // However, if we are vectorizing, we need to construct the vector values. 10056 // If the value is known to be uniform after vectorization, we can just 10057 // broadcast the scalar value corresponding to lane zero for each unroll 10058 // iteration. Otherwise, we construct the vector values using 10059 // insertelement instructions. Since the resulting vectors are stored in 10060 // State, we will only generate the insertelements once. 10061 Value *VectorValue = nullptr; 10062 if (IsUniform) { 10063 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10064 set(Def, VectorValue, Part); 10065 } else { 10066 // Initialize packing with insertelements to start from undef. 10067 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10068 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10069 set(Def, Undef, Part); 10070 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10071 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10072 VectorValue = get(Def, Part); 10073 } 10074 Builder.restoreIP(OldIP); 10075 return VectorValue; 10076 } 10077 10078 // Process the loop in the VPlan-native vectorization path. This path builds 10079 // VPlan upfront in the vectorization pipeline, which allows to apply 10080 // VPlan-to-VPlan transformations from the very beginning without modifying the 10081 // input LLVM IR. 10082 static bool processLoopInVPlanNativePath( 10083 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10084 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10085 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10086 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10087 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10088 LoopVectorizationRequirements &Requirements) { 10089 10090 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10091 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10092 return false; 10093 } 10094 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10095 Function *F = L->getHeader()->getParent(); 10096 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10097 10098 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10099 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10100 10101 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10102 &Hints, IAI); 10103 // Use the planner for outer loop vectorization. 10104 // TODO: CM is not used at this point inside the planner. Turn CM into an 10105 // optional argument if we don't need it in the future. 10106 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10107 Requirements, ORE); 10108 10109 // Get user vectorization factor. 
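// Note: a width of zero means the user did not request a specific factor; the
// planner then chooses one itself.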
10110 ElementCount UserVF = Hints.getWidth(); 10111 10112 CM.collectElementTypesForWidening(); 10113 10114 // Plan how to best vectorize, return the best VF and its cost. 10115 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10116 10117 // If we are stress testing VPlan builds, do not attempt to generate vector 10118 // code. Masked vector code generation support will follow soon. 10119 // Also, do not attempt to vectorize if no vector code will be produced. 10120 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) 10121 return false; 10122 10123 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10124 10125 { 10126 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10127 F->getParent()->getDataLayout()); 10128 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10129 &CM, BFI, PSI, Checks); 10130 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10131 << L->getHeader()->getParent()->getName() << "\"\n"); 10132 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); 10133 } 10134 10135 // Mark the loop as already vectorized to avoid vectorizing again. 10136 Hints.setAlreadyVectorized(); 10137 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10138 return true; 10139 } 10140 10141 // Emit a remark if there are stores to floats that required a floating point 10142 // extension. If the vectorized loop was generated with floating point there 10143 // will be a performance penalty from the conversion overhead and the change in 10144 // the vector width. 10145 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10146 SmallVector<Instruction *, 4> Worklist; 10147 for (BasicBlock *BB : L->getBlocks()) { 10148 for (Instruction &Inst : *BB) { 10149 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10150 if (S->getValueOperand()->getType()->isFloatTy()) 10151 Worklist.push_back(S); 10152 } 10153 } 10154 } 10155 10156 // Traverse the floating point stores upwards searching, for floating point 10157 // conversions. 10158 SmallPtrSet<const Instruction *, 4> Visited; 10159 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10160 while (!Worklist.empty()) { 10161 auto *I = Worklist.pop_back_val(); 10162 if (!L->contains(I)) 10163 continue; 10164 if (!Visited.insert(I).second) 10165 continue; 10166 10167 // Emit a remark if the floating point store required a floating 10168 // point conversion. 10169 // TODO: More work could be done to identify the root cause such as a 10170 // constant or a function return type and point the user to it. 10171 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10172 ORE->emit([&]() { 10173 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10174 I->getDebugLoc(), L->getHeader()) 10175 << "floating point conversion changes vector width. " 10176 << "Mixed floating point precision requires an up/down " 10177 << "cast that will negatively impact performance."; 10178 }); 10179 10180 for (Use &Op : I->operands()) 10181 if (auto *OpI = dyn_cast<Instruction>(Op)) 10182 Worklist.push_back(OpI); 10183 } 10184 } 10185 10186 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10187 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10188 !EnableLoopInterleaving), 10189 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10190 !EnableLoopVectorization) {} 10191 10192 bool LoopVectorizePass::processLoop(Loop *L) { 10193 assert((EnableVPlanNativePath || L->isInnermost()) && 10194 "VPlan-native path is not enabled. 
Only process inner loops."); 10195 10196 #ifndef NDEBUG 10197 const std::string DebugLocStr = getDebugLocString(L); 10198 #endif /* NDEBUG */ 10199 10200 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 10201 << L->getHeader()->getParent()->getName() << "' from " 10202 << DebugLocStr << "\n"); 10203 10204 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10205 10206 LLVM_DEBUG( 10207 dbgs() << "LV: Loop hints:" 10208 << " force=" 10209 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10210 ? "disabled" 10211 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10212 ? "enabled" 10213 : "?")) 10214 << " width=" << Hints.getWidth() 10215 << " interleave=" << Hints.getInterleave() << "\n"); 10216 10217 // Function containing loop 10218 Function *F = L->getHeader()->getParent(); 10219 10220 // Looking at the diagnostic output is the only way to determine if a loop 10221 // was vectorized (other than looking at the IR or machine code), so it 10222 // is important to generate an optimization remark for each loop. Most of 10223 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10224 // generated as OptimizationRemark and OptimizationRemarkMissed are 10225 // less verbose reporting vectorized loops and unvectorized loops that may 10226 // benefit from vectorization, respectively. 10227 10228 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10229 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10230 return false; 10231 } 10232 10233 PredicatedScalarEvolution PSE(*SE, *L); 10234 10235 // Check if it is legal to vectorize the loop. 10236 LoopVectorizationRequirements Requirements; 10237 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10238 &Requirements, &Hints, DB, AC, BFI, PSI); 10239 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10240 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10241 Hints.emitRemarkWithHints(); 10242 return false; 10243 } 10244 10245 // Check the function attributes and profiles to find out if this function 10246 // should be optimized for size. 10247 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10248 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10249 10250 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10251 // here. They may require CFG and instruction level transformations before 10252 // even evaluating whether vectorization is profitable. Since we cannot modify 10253 // the incoming IR, we need to build VPlan upfront in the vectorization 10254 // pipeline. 10255 if (!L->isInnermost()) 10256 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10257 ORE, BFI, PSI, Hints, Requirements); 10258 10259 assert(L->isInnermost() && "Inner loop expected."); 10260 10261 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10262 // count by optimizing for size, to minimize overheads. 10263 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10264 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10265 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. 
" 10266 << "This loop is worth vectorizing only if no scalar " 10267 << "iteration overheads are incurred."); 10268 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10269 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10270 else { 10271 LLVM_DEBUG(dbgs() << "\n"); 10272 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10273 } 10274 } 10275 10276 // Check the function attributes to see if implicit floats are allowed. 10277 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10278 // an integer loop and the vector instructions selected are purely integer 10279 // vector instructions? 10280 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10281 reportVectorizationFailure( 10282 "Can't vectorize when the NoImplicitFloat attribute is used", 10283 "loop not vectorized due to NoImplicitFloat attribute", 10284 "NoImplicitFloat", ORE, L); 10285 Hints.emitRemarkWithHints(); 10286 return false; 10287 } 10288 10289 // Check if the target supports potentially unsafe FP vectorization. 10290 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10291 // for the target we're vectorizing for, to make sure none of the 10292 // additional fp-math flags can help. 10293 if (Hints.isPotentiallyUnsafe() && 10294 TTI->isFPVectorizationPotentiallyUnsafe()) { 10295 reportVectorizationFailure( 10296 "Potentially unsafe FP op prevents vectorization", 10297 "loop not vectorized due to unsafe FP support.", 10298 "UnsafeFP", ORE, L); 10299 Hints.emitRemarkWithHints(); 10300 return false; 10301 } 10302 10303 bool AllowOrderedReductions; 10304 // If the flag is set, use that instead and override the TTI behaviour. 10305 if (ForceOrderedReductions.getNumOccurrences() > 0) 10306 AllowOrderedReductions = ForceOrderedReductions; 10307 else 10308 AllowOrderedReductions = TTI->enableOrderedReductions(); 10309 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10310 ORE->emit([&]() { 10311 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10312 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10313 ExactFPMathInst->getDebugLoc(), 10314 ExactFPMathInst->getParent()) 10315 << "loop not vectorized: cannot prove it is safe to reorder " 10316 "floating-point operations"; 10317 }); 10318 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10319 "reorder floating-point operations\n"); 10320 Hints.emitRemarkWithHints(); 10321 return false; 10322 } 10323 10324 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10325 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10326 10327 // If an override option has been passed in for interleaved accesses, use it. 10328 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10329 UseInterleaved = EnableInterleavedMemAccesses; 10330 10331 // Analyze interleaved memory accesses. 10332 if (UseInterleaved) { 10333 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10334 } 10335 10336 // Use the cost model. 10337 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10338 F, &Hints, IAI); 10339 CM.collectValuesToIgnore(); 10340 CM.collectElementTypesForWidening(); 10341 10342 // Use the planner for vectorization. 10343 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 10344 Requirements, ORE); 10345 10346 // Get user vectorization factor and interleave count. 
10347 ElementCount UserVF = Hints.getWidth(); 10348 unsigned UserIC = Hints.getInterleave(); 10349 10350 // Plan how to best vectorize, return the best VF and its cost. 10351 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10352 10353 VectorizationFactor VF = VectorizationFactor::Disabled(); 10354 unsigned IC = 1; 10355 10356 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10357 F->getParent()->getDataLayout()); 10358 if (MaybeVF) { 10359 if (LVP.requiresTooManyRuntimeChecks()) { 10360 ORE->emit([&]() { 10361 return OptimizationRemarkAnalysisAliasing( 10362 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), 10363 L->getHeader()) 10364 << "loop not vectorized: cannot prove it is safe to reorder " 10365 "memory operations"; 10366 }); 10367 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 10368 Hints.emitRemarkWithHints(); 10369 return false; 10370 } 10371 VF = *MaybeVF; 10372 // Select the interleave count. 10373 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); 10374 10375 unsigned SelectedIC = std::max(IC, UserIC); 10376 // Optimistically generate runtime checks if they are needed. Drop them if 10377 // they turn out to not be profitable. 10378 if (VF.Width.isVector() || SelectedIC > 1) 10379 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); 10380 } 10381 10382 // Identify the diagnostic messages that should be produced. 10383 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10384 bool VectorizeLoop = true, InterleaveLoop = true; 10385 if (VF.Width.isScalar()) { 10386 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10387 VecDiagMsg = std::make_pair( 10388 "VectorizationNotBeneficial", 10389 "the cost-model indicates that vectorization is not beneficial"); 10390 VectorizeLoop = false; 10391 } 10392 10393 if (!MaybeVF && UserIC > 1) { 10394 // Tell the user interleaving was avoided up-front, despite being explicitly 10395 // requested. 10396 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 10397 "interleaving should be avoided up front\n"); 10398 IntDiagMsg = std::make_pair( 10399 "InterleavingAvoided", 10400 "Ignoring UserIC, because interleaving was avoided up front"); 10401 InterleaveLoop = false; 10402 } else if (IC == 1 && UserIC <= 1) { 10403 // Tell the user interleaving is not beneficial. 10404 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 10405 IntDiagMsg = std::make_pair( 10406 "InterleavingNotBeneficial", 10407 "the cost-model indicates that interleaving is not beneficial"); 10408 InterleaveLoop = false; 10409 if (UserIC == 1) { 10410 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; 10411 IntDiagMsg.second += 10412 " and is explicitly disabled or interleave count is set to 1"; 10413 } 10414 } else if (IC > 1 && UserIC == 1) { 10415 // Tell the user interleaving is beneficial, but it explicitly disabled. 10416 LLVM_DEBUG( 10417 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); 10418 IntDiagMsg = std::make_pair( 10419 "InterleavingBeneficialButDisabled", 10420 "the cost-model indicates that interleaving is beneficial " 10421 "but is explicitly disabled or interleave count is set to 1"); 10422 InterleaveLoop = false; 10423 } 10424 10425 // Override IC if user provided an interleave count. 10426 IC = UserIC > 0 ? UserIC : IC; 10427 10428 // Emit diagnostic messages, if any. 
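// Four cases follow: neither vectorize nor interleave (bail out), interleave
// only, vectorize only, or both.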
10429 const char *VAPassName = Hints.vectorizeAnalysisPassName(); 10430 if (!VectorizeLoop && !InterleaveLoop) { 10431 // Do not vectorize or interleaving the loop. 10432 ORE->emit([&]() { 10433 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10434 L->getStartLoc(), L->getHeader()) 10435 << VecDiagMsg.second; 10436 }); 10437 ORE->emit([&]() { 10438 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10439 L->getStartLoc(), L->getHeader()) 10440 << IntDiagMsg.second; 10441 }); 10442 return false; 10443 } else if (!VectorizeLoop && InterleaveLoop) { 10444 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10445 ORE->emit([&]() { 10446 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10447 L->getStartLoc(), L->getHeader()) 10448 << VecDiagMsg.second; 10449 }); 10450 } else if (VectorizeLoop && !InterleaveLoop) { 10451 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10452 << ") in " << DebugLocStr << '\n'); 10453 ORE->emit([&]() { 10454 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10455 L->getStartLoc(), L->getHeader()) 10456 << IntDiagMsg.second; 10457 }); 10458 } else if (VectorizeLoop && InterleaveLoop) { 10459 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10460 << ") in " << DebugLocStr << '\n'); 10461 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10462 } 10463 10464 bool DisableRuntimeUnroll = false; 10465 MDNode *OrigLoopID = L->getLoopID(); 10466 { 10467 using namespace ore; 10468 if (!VectorizeLoop) { 10469 assert(IC > 1 && "interleave count should not be 1 or 0"); 10470 // If we decided that it is not legal to vectorize the loop, then 10471 // interleave it. 10472 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10473 &CM, BFI, PSI, Checks); 10474 10475 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10476 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); 10477 10478 ORE->emit([&]() { 10479 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10480 L->getHeader()) 10481 << "interleaved loop (interleaved count: " 10482 << NV("InterleaveCount", IC) << ")"; 10483 }); 10484 } else { 10485 // If we decided that it is *legal* to vectorize the loop, then do it. 10486 10487 // Consider vectorizing the epilogue too if it's profitable. 10488 VectorizationFactor EpilogueVF = 10489 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10490 if (EpilogueVF.Width.isVector()) { 10491 10492 // The first pass vectorizes the main loop and creates a scalar epilogue 10493 // to be vectorized by executing the plan (potentially with a different 10494 // factor) again shortly afterwards. 10495 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10496 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10497 EPI, &LVL, &CM, BFI, PSI, Checks); 10498 10499 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10500 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10501 DT, true); 10502 ++LoopsVectorized; 10503 10504 // Second pass vectorizes the epilogue and adjusts the control flow 10505 // edges from the first pass. 
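// The same EPI descriptor is reused: the epilogue VF/UF are moved into the
// main-loop slots before executing the epilogue plan.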
10506 EPI.MainLoopVF = EPI.EpilogueVF; 10507 EPI.MainLoopUF = EPI.EpilogueUF; 10508 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10509 ORE, EPI, &LVL, &CM, BFI, PSI, 10510 Checks); 10511 10512 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10513 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); 10514 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); 10515 Header->setName("vec.epilog.vector.body"); 10516 10517 // Ensure that the start values for any VPReductionPHIRecipes are 10518 // updated before vectorising the epilogue loop. 10519 for (VPRecipeBase &R : Header->phis()) { 10520 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10521 if (auto *Resume = MainILV.getReductionResumeValue( 10522 ReductionPhi->getRecurrenceDescriptor())) { 10523 VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume); 10524 ReductionPhi->setOperand(0, StartVal); 10525 } 10526 } 10527 } 10528 10529 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10530 DT, true); 10531 ++LoopsEpilogueVectorized; 10532 10533 if (!MainILV.areSafetyChecksAdded()) 10534 DisableRuntimeUnroll = true; 10535 } else { 10536 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10537 &LVL, &CM, BFI, PSI, Checks); 10538 10539 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10540 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); 10541 ++LoopsVectorized; 10542 10543 // Add metadata to disable runtime unrolling a scalar loop when there 10544 // are no runtime checks about strides and memory. A scalar loop that is 10545 // rarely used is not worth unrolling. 10546 if (!LB.areSafetyChecksAdded()) 10547 DisableRuntimeUnroll = true; 10548 } 10549 // Report the vectorization decision. 10550 ORE->emit([&]() { 10551 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10552 L->getHeader()) 10553 << "vectorized loop (vectorization width: " 10554 << NV("VectorizationFactor", VF.Width) 10555 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10556 }); 10557 } 10558 10559 if (ORE->allowExtraAnalysis(LV_NAME)) 10560 checkMixedPrecision(L, ORE); 10561 } 10562 10563 Optional<MDNode *> RemainderLoopID = 10564 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10565 LLVMLoopVectorizeFollowupEpilogue}); 10566 if (RemainderLoopID) { 10567 L->setLoopID(RemainderLoopID.getValue()); 10568 } else { 10569 if (DisableRuntimeUnroll) 10570 AddRuntimeUnrollDisableMetaData(L); 10571 10572 // Mark the loop as already vectorized to avoid vectorizing again. 10573 Hints.setAlreadyVectorized(); 10574 } 10575 10576 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10577 return true; 10578 } 10579 10580 LoopVectorizeResult LoopVectorizePass::runImpl( 10581 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10582 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10583 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10584 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10585 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10586 SE = &SE_; 10587 LI = &LI_; 10588 TTI = &TTI_; 10589 DT = &DT_; 10590 BFI = &BFI_; 10591 TLI = TLI_; 10592 AA = &AA_; 10593 AC = &AC_; 10594 GetLAA = &GetLAA_; 10595 DB = &DB_; 10596 ORE = &ORE_; 10597 PSI = PSI_; 10598 10599 // Don't attempt if 10600 // 1. the target claims to have no vector registers, and 10601 // 2. interleaving won't help ILP. 
10602 // 10603 // The second condition is necessary because, even if the target has no 10604 // vector registers, loop vectorization may still enable scalar 10605 // interleaving. 10606 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10607 TTI->getMaxInterleaveFactor(1) < 2) 10608 return LoopVectorizeResult(false, false); 10609 10610 bool Changed = false, CFGChanged = false; 10611 10612 // The vectorizer requires loops to be in simplified form. 10613 // Since simplification may add new inner loops, it has to run before the 10614 // legality and profitability checks. This means running the loop vectorizer 10615 // will simplify all loops, regardless of whether anything end up being 10616 // vectorized. 10617 for (auto &L : *LI) 10618 Changed |= CFGChanged |= 10619 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10620 10621 // Build up a worklist of inner-loops to vectorize. This is necessary as 10622 // the act of vectorizing or partially unrolling a loop creates new loops 10623 // and can invalidate iterators across the loops. 10624 SmallVector<Loop *, 8> Worklist; 10625 10626 for (Loop *L : *LI) 10627 collectSupportedLoops(*L, LI, ORE, Worklist); 10628 10629 LoopsAnalyzed += Worklist.size(); 10630 10631 // Now walk the identified inner loops. 10632 while (!Worklist.empty()) { 10633 Loop *L = Worklist.pop_back_val(); 10634 10635 // For the inner loops we actually process, form LCSSA to simplify the 10636 // transform. 10637 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10638 10639 Changed |= CFGChanged |= processLoop(L); 10640 } 10641 10642 // Process each loop nest in the function. 10643 return LoopVectorizeResult(Changed, CFGChanged); 10644 } 10645 10646 PreservedAnalyses LoopVectorizePass::run(Function &F, 10647 FunctionAnalysisManager &AM) { 10648 auto &LI = AM.getResult<LoopAnalysis>(F); 10649 // There are no loops in the function. Return before computing other expensive 10650 // analyses. 10651 if (LI.empty()) 10652 return PreservedAnalyses::all(); 10653 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 10654 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 10655 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 10656 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 10657 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 10658 auto &AA = AM.getResult<AAManager>(F); 10659 auto &AC = AM.getResult<AssumptionAnalysis>(F); 10660 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 10661 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10662 10663 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 10664 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 10665 [&](Loop &L) -> const LoopAccessInfo & { 10666 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 10667 TLI, TTI, nullptr, nullptr, nullptr}; 10668 return LAM.getResult<LoopAccessAnalysis>(L, AR); 10669 }; 10670 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 10671 ProfileSummaryInfo *PSI = 10672 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 10673 LoopVectorizeResult Result = 10674 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 10675 if (!Result.MadeAnyChange) 10676 return PreservedAnalyses::all(); 10677 PreservedAnalyses PA; 10678 10679 // We currently do not preserve loopinfo/dominator analyses with outer loop 10680 // vectorization. Until this is addressed, mark these analyses as preserved 10681 // only for non-VPlan-native path. 
10682 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10683 if (!EnableVPlanNativePath) {
10684 PA.preserve<LoopAnalysis>();
10685 PA.preserve<DominatorTreeAnalysis>();
10686 }
10687
10688 if (Result.MadeCFGChange) {
10689 // Making CFG changes likely means a loop got vectorized. Indicate that
10690 // extra simplification passes should be run.
10691 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10692 // be run if runtime checks have been added.
10693 AM.getResult<ShouldRunExtraVectorPasses>(F);
10694 PA.preserve<ShouldRunExtraVectorPasses>();
10695 } else {
10696 PA.preserveSet<CFGAnalyses>();
10697 }
10698 return PA;
10699 }
10700
10701 void LoopVectorizePass::printPipeline(
10702 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10703 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10704 OS, MapClassName2PassName);
10705
10706 OS << "<";
10707 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10708 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10709 OS << ">";
10710 }
10711