//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one (a sketch is given at the end of
// this header comment).
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is an ongoing development effort to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC - two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
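//
// For illustration only (a conceptual sketch in C-like pseudocode, not code
// from this pass): with a vectorization factor (VF) of 4, a scalar loop such
// as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is rewritten so that the induction variable advances by 4 and each 'wide'
// iteration processes 4 consecutive elements at once, with leftover
// iterations handled by the scalar epilogue loop (or by a predicated/folded
// tail):
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4)
//     a[i..i+3] = b[i..i+3] + c[i..i+3];  // one wide iteration
//   for (; i < n; ++i)                    // scalar epilogue (remainder)
//     a[i] = b[i] + c[i];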
53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanHCFGBuilder.h" 61 #include "VPlanTransforms.h" 62 #include "llvm/ADT/APInt.h" 63 #include "llvm/ADT/ArrayRef.h" 64 #include "llvm/ADT/DenseMap.h" 65 #include "llvm/ADT/DenseMapInfo.h" 66 #include "llvm/ADT/Hashing.h" 67 #include "llvm/ADT/MapVector.h" 68 #include "llvm/ADT/None.h" 69 #include "llvm/ADT/Optional.h" 70 #include "llvm/ADT/STLExtras.h" 71 #include "llvm/ADT/SmallPtrSet.h" 72 #include "llvm/ADT/SmallSet.h" 73 #include "llvm/ADT/SmallVector.h" 74 #include "llvm/ADT/Statistic.h" 75 #include "llvm/ADT/StringRef.h" 76 #include "llvm/ADT/Twine.h" 77 #include "llvm/ADT/iterator_range.h" 78 #include "llvm/Analysis/AssumptionCache.h" 79 #include "llvm/Analysis/BasicAliasAnalysis.h" 80 #include "llvm/Analysis/BlockFrequencyInfo.h" 81 #include "llvm/Analysis/CFG.h" 82 #include "llvm/Analysis/CodeMetrics.h" 83 #include "llvm/Analysis/DemandedBits.h" 84 #include "llvm/Analysis/GlobalsModRef.h" 85 #include "llvm/Analysis/LoopAccessAnalysis.h" 86 #include "llvm/Analysis/LoopAnalysisManager.h" 87 #include "llvm/Analysis/LoopInfo.h" 88 #include "llvm/Analysis/LoopIterator.h" 89 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 90 #include "llvm/Analysis/ProfileSummaryInfo.h" 91 #include "llvm/Analysis/ScalarEvolution.h" 92 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 93 #include "llvm/Analysis/TargetLibraryInfo.h" 94 #include "llvm/Analysis/TargetTransformInfo.h" 95 #include "llvm/Analysis/VectorUtils.h" 96 #include "llvm/IR/Attributes.h" 97 #include "llvm/IR/BasicBlock.h" 98 #include "llvm/IR/CFG.h" 99 #include "llvm/IR/Constant.h" 100 #include "llvm/IR/Constants.h" 101 #include "llvm/IR/DataLayout.h" 102 #include "llvm/IR/DebugInfoMetadata.h" 103 #include "llvm/IR/DebugLoc.h" 104 #include "llvm/IR/DerivedTypes.h" 105 #include "llvm/IR/DiagnosticInfo.h" 106 #include "llvm/IR/Dominators.h" 107 #include "llvm/IR/Function.h" 108 #include "llvm/IR/IRBuilder.h" 109 #include "llvm/IR/InstrTypes.h" 110 #include "llvm/IR/Instruction.h" 111 #include "llvm/IR/Instructions.h" 112 #include "llvm/IR/IntrinsicInst.h" 113 #include "llvm/IR/Intrinsics.h" 114 #include "llvm/IR/Metadata.h" 115 #include "llvm/IR/Module.h" 116 #include "llvm/IR/Operator.h" 117 #include "llvm/IR/PatternMatch.h" 118 #include "llvm/IR/Type.h" 119 #include "llvm/IR/Use.h" 120 #include "llvm/IR/User.h" 121 #include "llvm/IR/Value.h" 122 #include "llvm/IR/ValueHandle.h" 123 #include "llvm/IR/Verifier.h" 124 #include "llvm/InitializePasses.h" 125 #include "llvm/Pass.h" 126 #include "llvm/Support/Casting.h" 127 #include "llvm/Support/CommandLine.h" 128 #include "llvm/Support/Compiler.h" 129 #include "llvm/Support/Debug.h" 130 #include "llvm/Support/ErrorHandling.h" 131 #include "llvm/Support/InstructionCost.h" 132 #include "llvm/Support/MathExtras.h" 133 #include "llvm/Support/raw_ostream.h" 134 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 135 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 136 #include "llvm/Transforms/Utils/LoopSimplify.h" 137 #include "llvm/Transforms/Utils/LoopUtils.h" 138 #include "llvm/Transforms/Utils/LoopVersioning.h" 139 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 140 #include "llvm/Transforms/Utils/SizeOpts.h" 141 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 142 #include <algorithm> 143 #include 
<cassert> 144 #include <cstdint> 145 #include <functional> 146 #include <iterator> 147 #include <limits> 148 #include <map> 149 #include <memory> 150 #include <string> 151 #include <tuple> 152 #include <utility> 153 154 using namespace llvm; 155 156 #define LV_NAME "loop-vectorize" 157 #define DEBUG_TYPE LV_NAME 158 159 #ifndef NDEBUG 160 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 161 #endif 162 163 /// @{ 164 /// Metadata attribute names 165 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 166 const char LLVMLoopVectorizeFollowupVectorized[] = 167 "llvm.loop.vectorize.followup_vectorized"; 168 const char LLVMLoopVectorizeFollowupEpilogue[] = 169 "llvm.loop.vectorize.followup_epilogue"; 170 /// @} 171 172 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 173 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 174 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 175 176 static cl::opt<bool> EnableEpilogueVectorization( 177 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 178 cl::desc("Enable vectorization of epilogue loops.")); 179 180 static cl::opt<unsigned> EpilogueVectorizationForceVF( 181 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 182 cl::desc("When epilogue vectorization is enabled, and a value greater than " 183 "1 is specified, forces the given VF for all applicable epilogue " 184 "loops.")); 185 186 static cl::opt<unsigned> EpilogueVectorizationMinVF( 187 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 188 cl::desc("Only loops with vectorization factor equal to or larger than " 189 "the specified value are considered for epilogue vectorization.")); 190 191 /// Loops with a known constant trip count below this number are vectorized only 192 /// if no scalar iteration overheads are incurred. 193 static cl::opt<unsigned> TinyTripCountVectorThreshold( 194 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 195 cl::desc("Loops with a constant trip count that is smaller than this " 196 "value are vectorized only if no scalar iteration overheads " 197 "are incurred.")); 198 199 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 200 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 201 cl::desc("The maximum allowed number of runtime memory checks with a " 202 "vectorize(enable) pragma.")); 203 204 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 205 // that predication is preferred, and this lists all options. I.e., the 206 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 207 // and predicate the instructions accordingly. 
// If tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting the vectorization factor, "
             "which will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a "
             "loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars.
/// This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops.
  virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single vector PHINode in a block in the VPlan-native path
  /// only.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost loop.
491 /// Generates a sequence of scalar instances for each lane between \p MinLane 492 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 493 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p 494 /// Instr's operands. 495 void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe, 496 const VPIteration &Instance, bool IfPredicateInstr, 497 VPTransformState &State); 498 499 /// Construct the vector value of a scalarized value \p V one lane at a time. 500 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, 501 VPTransformState &State); 502 503 /// Try to vectorize interleaved access group \p Group with the base address 504 /// given in \p Addr, optionally masking the vector operations if \p 505 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 506 /// values in the vectorized loop. 507 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 508 ArrayRef<VPValue *> VPDefs, 509 VPTransformState &State, VPValue *Addr, 510 ArrayRef<VPValue *> StoredValues, 511 VPValue *BlockInMask = nullptr); 512 513 /// Set the debug location in the builder \p Ptr using the debug location in 514 /// \p V. If \p Ptr is None then it uses the class member's Builder. 515 void setDebugLocFromInst(const Value *V); 516 517 /// Fix the non-induction PHIs in \p Plan. 518 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State); 519 520 /// Returns true if the reordering of FP operations is not allowed, but we are 521 /// able to vectorize with strict in-order reductions for the given RdxDesc. 522 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); 523 524 /// Create a broadcast instruction. This method generates a broadcast 525 /// instruction (shuffle) for loop invariant values and for the induction 526 /// value. If this is the induction variable then we extend it to N, N+1, ... 527 /// this is needed because each iteration in the loop corresponds to a SIMD 528 /// element. 529 virtual Value *getBroadcastInstrs(Value *V); 530 531 /// Add metadata from one instruction to another. 532 /// 533 /// This includes both the original MDs from \p From and additional ones (\see 534 /// addNewMetadata). Use this for *newly created* instructions in the vector 535 /// loop. 536 void addMetadata(Instruction *To, Instruction *From); 537 538 /// Similar to the previous function but it adds the metadata to a 539 /// vector of instructions. 540 void addMetadata(ArrayRef<Value *> To, Instruction *From); 541 542 // Returns the resume value (bc.merge.rdx) for a reduction as 543 // generated by fixReduction. 544 PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc); 545 546 protected: 547 friend class LoopVectorizationPlanner; 548 549 /// A small list of PHINodes. 550 using PhiVector = SmallVector<PHINode *, 4>; 551 552 /// A type for scalarized values in the new loop. Each value from the 553 /// original loop, when scalarized, is represented by UF x VF scalar values 554 /// in the new unrolled loop, where UF is the unroll factor and VF is the 555 /// vectorization factor. 556 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 557 558 /// Set up the values of the IVs correctly when exiting the vector loop. 559 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 560 Value *VectorTripCount, Value *EndValue, 561 BasicBlock *MiddleBlock, BasicBlock *VectorHeader, 562 VPlan &Plan); 563 564 /// Handle all cross-iteration phis in the header. 
565 void fixCrossIterationPHIs(VPTransformState &State); 566 567 /// Create the exit value of first order recurrences in the middle block and 568 /// update their users. 569 void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, 570 VPTransformState &State); 571 572 /// Create code for the loop exit value of the reduction. 573 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); 574 575 /// Clear NSW/NUW flags from reduction instructions if necessary. 576 void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, 577 VPTransformState &State); 578 579 /// Iteratively sink the scalarized operands of a predicated instruction into 580 /// the block that was created for it. 581 void sinkScalarOperands(Instruction *PredInst); 582 583 /// Shrinks vector element sizes to the smallest bitwidth they can be legally 584 /// represented as. 585 void truncateToMinimalBitwidths(VPTransformState &State); 586 587 /// Returns (and creates if needed) the original loop trip count. 588 Value *getOrCreateTripCount(BasicBlock *InsertBlock); 589 590 /// Returns (and creates if needed) the trip count of the widened loop. 591 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); 592 593 /// Returns a bitcasted value to the requested vector type. 594 /// Also handles bitcasts of vector<float> <-> vector<pointer> types. 595 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, 596 const DataLayout &DL); 597 598 /// Emit a bypass check to see if the vector trip count is zero, including if 599 /// it overflows. 600 void emitIterationCountCheck(BasicBlock *Bypass); 601 602 /// Emit a bypass check to see if all of the SCEV assumptions we've 603 /// had to make are correct. Returns the block containing the checks or 604 /// nullptr if no checks have been added. 605 BasicBlock *emitSCEVChecks(BasicBlock *Bypass); 606 607 /// Emit bypass checks to check any memory assumptions we may have made. 608 /// Returns the block containing the checks or nullptr if no checks have been 609 /// added. 610 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass); 611 612 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 613 /// vector loop preheader, middle block and scalar preheader. 614 void createVectorLoopSkeleton(StringRef Prefix); 615 616 /// Create new phi nodes for the induction variables to resume iteration count 617 /// in the scalar epilogue, from where the vectorized loop left off. 618 /// In cases where the loop skeleton is more complicated (eg. epilogue 619 /// vectorization) and the resume values can come from an additional bypass 620 /// block, the \p AdditionalBypass pair provides information about the bypass 621 /// block and the end value on the edge from bypass to this loop. 622 void createInductionResumeValues( 623 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); 624 625 /// Complete the loop skeleton by adding debug MDs, creating appropriate 626 /// conditional branches in the middle block, preparing the builder and 627 /// running the verifier. Return the preheader of the completed vector loop. 628 BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID); 629 630 /// Add additional metadata to \p To that was not present on \p Orig. 631 /// 632 /// Currently this is used to add the noalias annotations based on the 633 /// inserted memchecks. Use this for instructions that are *cloned* into the 634 /// vector loop. 
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  /// BFI and PSI are used to check for profile-guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner, as sketched below.
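///
/// A simplified, hypothetical sketch of the intended two-pass sequencing (the
/// real driver lives in the planner; "..." stands for the analyses that are
/// passed to every vectorizer):
/// \code
///   EpilogueLoopVectorizationInfo EPI(MainVF, MainUF, EpilogueVF, /*EUF=*/1);
///   EpilogueVectorizerMainLoop MainILV(..., EPI, ...);        // first pass
///   // ... execute the main-loop VPlan with MainILV ...
///   EpilogueVectorizerEpilogueLoop EpilogILV(..., EPI, ...);  // second pass
///   // ... execute the epilogue VPlan with EpilogILV ...
/// \endcode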
809 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { 810 public: 811 InnerLoopAndEpilogueVectorizer( 812 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 813 DominatorTree *DT, const TargetLibraryInfo *TLI, 814 const TargetTransformInfo *TTI, AssumptionCache *AC, 815 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 816 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 817 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 818 GeneratedRTChecks &Checks) 819 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 820 EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, 821 Checks), 822 EPI(EPI) {} 823 824 // Override this function to handle the more complex control flow around the 825 // three loops. 826 std::pair<BasicBlock *, Value *> 827 createVectorizedLoopSkeleton() final override { 828 return createEpilogueVectorizedLoopSkeleton(); 829 } 830 831 /// The interface for creating a vectorized skeleton using one of two 832 /// different strategies, each corresponding to one execution of the vplan 833 /// as described above. 834 virtual std::pair<BasicBlock *, Value *> 835 createEpilogueVectorizedLoopSkeleton() = 0; 836 837 /// Holds and updates state information required to vectorize the main loop 838 /// and its epilogue in two separate passes. This setup helps us avoid 839 /// regenerating and recomputing runtime safety checks. It also helps us to 840 /// shorten the iteration-count-check path length for the cases where the 841 /// iteration count of the loop is so small that the main vector loop is 842 /// completely skipped. 843 EpilogueLoopVectorizationInfo &EPI; 844 }; 845 846 /// A specialized derived class of inner loop vectorizer that performs 847 /// vectorization of *main* loops in the process of vectorizing loops and their 848 /// epilogues. 849 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { 850 public: 851 EpilogueVectorizerMainLoop( 852 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 853 DominatorTree *DT, const TargetLibraryInfo *TLI, 854 const TargetTransformInfo *TTI, AssumptionCache *AC, 855 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 856 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 857 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 858 GeneratedRTChecks &Check) 859 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 860 EPI, LVL, CM, BFI, PSI, Check) {} 861 /// Implements the interface for creating a vectorized skeleton using the 862 /// *main loop* strategy (ie the first pass of vplan execution). 863 std::pair<BasicBlock *, Value *> 864 createEpilogueVectorizedLoopSkeleton() final override; 865 866 protected: 867 /// Emits an iteration count bypass check once for the main loop (when \p 868 /// ForEpilogue is false) and once for the epilogue loop (when \p 869 /// ForEpilogue is true). 870 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue); 871 void printDebugTracesAtStart() override; 872 void printDebugTracesAtEnd() override; 873 }; 874 875 // A specialized derived class of inner loop vectorizer that performs 876 // vectorization of *epilogue* loops in the process of vectorizing loops and 877 // their epilogues. 
878 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { 879 public: 880 EpilogueVectorizerEpilogueLoop( 881 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 882 DominatorTree *DT, const TargetLibraryInfo *TLI, 883 const TargetTransformInfo *TTI, AssumptionCache *AC, 884 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 885 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 886 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 887 GeneratedRTChecks &Checks) 888 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 889 EPI, LVL, CM, BFI, PSI, Checks) { 890 TripCount = EPI.TripCount; 891 } 892 /// Implements the interface for creating a vectorized skeleton using the 893 /// *epilogue loop* strategy (ie the second pass of vplan execution). 894 std::pair<BasicBlock *, Value *> 895 createEpilogueVectorizedLoopSkeleton() final override; 896 897 protected: 898 /// Emits an iteration count bypass check after the main vector loop has 899 /// finished to see if there are any iterations left to execute by either 900 /// the vector epilogue or the scalar epilogue. 901 BasicBlock *emitMinimumVectorEpilogueIterCountCheck( 902 BasicBlock *Bypass, 903 BasicBlock *Insert); 904 void printDebugTracesAtStart() override; 905 void printDebugTracesAtEnd() override; 906 }; 907 } // end namespace llvm 908 909 /// Look for a meaningful debug location on the instruction or it's 910 /// operands. 911 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { 912 if (!I) 913 return I; 914 915 DebugLoc Empty; 916 if (I->getDebugLoc() != Empty) 917 return I; 918 919 for (Use &Op : I->operands()) { 920 if (Instruction *OpInst = dyn_cast<Instruction>(Op)) 921 if (OpInst->getDebugLoc() != Empty) 922 return OpInst; 923 } 924 925 return I; 926 } 927 928 void InnerLoopVectorizer::setDebugLocFromInst( 929 const Value *V) { 930 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) { 931 const DILocation *DIL = Inst->getDebugLoc(); 932 933 // When a FSDiscriminator is enabled, we don't need to add the multiply 934 // factors to the discriminators. 935 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && 936 !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) { 937 // FIXME: For scalable vectors, assume vscale=1. 938 auto NewDIL = 939 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); 940 if (NewDIL) 941 Builder.SetCurrentDebugLocation(NewDIL.getValue()); 942 else 943 LLVM_DEBUG(dbgs() 944 << "Failed to create new discriminator: " 945 << DIL->getFilename() << " Line: " << DIL->getLine()); 946 } else 947 Builder.SetCurrentDebugLocation(DIL); 948 } else 949 Builder.SetCurrentDebugLocation(DebugLoc()); 950 } 951 952 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I 953 /// is passed, the message relates to that particular instruction. 954 #ifndef NDEBUG 955 static void debugVectorizationMessage(const StringRef Prefix, 956 const StringRef DebugMsg, 957 Instruction *I) { 958 dbgs() << "LV: " << Prefix << DebugMsg; 959 if (I != nullptr) 960 dbgs() << " " << *I; 961 else 962 dbgs() << '.'; 963 dbgs() << '\n'; 964 } 965 #endif 966 967 /// Create an analysis remark that explains why vectorization failed 968 /// 969 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 970 /// RemarkName is the identifier for the remark. If \p I is passed it is an 971 /// instruction that prevents vectorization. 
Otherwise \p TheLoop is used for 972 /// the location of the remark. \return the remark object that can be 973 /// streamed to. 974 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, 975 StringRef RemarkName, Loop *TheLoop, Instruction *I) { 976 Value *CodeRegion = TheLoop->getHeader(); 977 DebugLoc DL = TheLoop->getStartLoc(); 978 979 if (I) { 980 CodeRegion = I->getParent(); 981 // If there is no debug location attached to the instruction, revert back to 982 // using the loop's. 983 if (I->getDebugLoc()) 984 DL = I->getDebugLoc(); 985 } 986 987 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); 988 } 989 990 namespace llvm { 991 992 /// Return a value for Step multiplied by VF. 993 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, 994 int64_t Step) { 995 assert(Ty->isIntegerTy() && "Expected an integer step"); 996 Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue()); 997 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; 998 } 999 1000 /// Return the runtime value for VF. 1001 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { 1002 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); 1003 return VF.isScalable() ? B.CreateVScale(EC) : EC; 1004 } 1005 1006 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, 1007 ElementCount VF) { 1008 assert(FTy->isFloatingPointTy() && "Expected floating point type!"); 1009 Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits()); 1010 Value *RuntimeVF = getRuntimeVF(B, IntTy, VF); 1011 return B.CreateUIToFP(RuntimeVF, FTy); 1012 } 1013 1014 void reportVectorizationFailure(const StringRef DebugMsg, 1015 const StringRef OREMsg, const StringRef ORETag, 1016 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1017 Instruction *I) { 1018 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); 1019 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1020 ORE->emit( 1021 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1022 << "loop not vectorized: " << OREMsg); 1023 } 1024 1025 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, 1026 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1027 Instruction *I) { 1028 LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); 1029 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1030 ORE->emit( 1031 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1032 << Msg); 1033 } 1034 1035 } // end namespace llvm 1036 1037 #ifndef NDEBUG 1038 /// \return string containing a file name and a line # for the given loop. 1039 static std::string getDebugLocString(const Loop *L) { 1040 std::string Result; 1041 if (L) { 1042 raw_string_ostream OS(Result); 1043 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 1044 LoopDbgLoc.print(OS); 1045 else 1046 // Just print the module name. 1047 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 1048 OS.flush(); 1049 } 1050 return Result; 1051 } 1052 #endif 1053 1054 void InnerLoopVectorizer::addNewMetadata(Instruction *To, 1055 const Instruction *Orig) { 1056 // If the loop was versioned with memchecks, add the corresponding no-alias 1057 // metadata. 
1058 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) 1059 LVer->annotateInstWithNoAlias(To, Orig); 1060 } 1061 1062 void InnerLoopVectorizer::collectPoisonGeneratingRecipes( 1063 VPTransformState &State) { 1064 1065 // Collect recipes in the backward slice of `Root` that may generate a poison 1066 // value that is used after vectorization. 1067 SmallPtrSet<VPRecipeBase *, 16> Visited; 1068 auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { 1069 SmallVector<VPRecipeBase *, 16> Worklist; 1070 Worklist.push_back(Root); 1071 1072 // Traverse the backward slice of Root through its use-def chain. 1073 while (!Worklist.empty()) { 1074 VPRecipeBase *CurRec = Worklist.back(); 1075 Worklist.pop_back(); 1076 1077 if (!Visited.insert(CurRec).second) 1078 continue; 1079 1080 // Prune search if we find another recipe generating a widen memory 1081 // instruction. Widen memory instructions involved in address computation 1082 // will lead to gather/scatter instructions, which don't need to be 1083 // handled. 1084 if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || 1085 isa<VPInterleaveRecipe>(CurRec) || 1086 isa<VPScalarIVStepsRecipe>(CurRec) || 1087 isa<VPCanonicalIVPHIRecipe>(CurRec)) 1088 continue; 1089 1090 // This recipe contributes to the address computation of a widen 1091 // load/store. Collect recipe if its underlying instruction has 1092 // poison-generating flags. 1093 Instruction *Instr = CurRec->getUnderlyingInstr(); 1094 if (Instr && Instr->hasPoisonGeneratingFlags()) 1095 State.MayGeneratePoisonRecipes.insert(CurRec); 1096 1097 // Add new definitions to the worklist. 1098 for (VPValue *operand : CurRec->operands()) 1099 if (VPDef *OpDef = operand->getDef()) 1100 Worklist.push_back(cast<VPRecipeBase>(OpDef)); 1101 } 1102 }); 1103 1104 // Traverse all the recipes in the VPlan and collect the poison-generating 1105 // recipes in the backward slice starting at the address of a VPWidenRecipe or 1106 // VPInterleaveRecipe. 1107 auto Iter = depth_first( 1108 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry())); 1109 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 1110 for (VPRecipeBase &Recipe : *VPBB) { 1111 if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) { 1112 Instruction &UnderlyingInstr = WidenRec->getIngredient(); 1113 VPDef *AddrDef = WidenRec->getAddr()->getDef(); 1114 if (AddrDef && WidenRec->isConsecutive() && 1115 Legal->blockNeedsPredication(UnderlyingInstr.getParent())) 1116 collectPoisonGeneratingInstrsInBackwardSlice( 1117 cast<VPRecipeBase>(AddrDef)); 1118 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) { 1119 VPDef *AddrDef = InterleaveRec->getAddr()->getDef(); 1120 if (AddrDef) { 1121 // Check if any member of the interleave group needs predication. 
1122 const InterleaveGroup<Instruction> *InterGroup = 1123 InterleaveRec->getInterleaveGroup(); 1124 bool NeedPredication = false; 1125 for (int I = 0, NumMembers = InterGroup->getNumMembers(); 1126 I < NumMembers; ++I) { 1127 Instruction *Member = InterGroup->getMember(I); 1128 if (Member) 1129 NeedPredication |= 1130 Legal->blockNeedsPredication(Member->getParent()); 1131 } 1132 1133 if (NeedPredication) 1134 collectPoisonGeneratingInstrsInBackwardSlice( 1135 cast<VPRecipeBase>(AddrDef)); 1136 } 1137 } 1138 } 1139 } 1140 } 1141 1142 void InnerLoopVectorizer::addMetadata(Instruction *To, 1143 Instruction *From) { 1144 propagateMetadata(To, From); 1145 addNewMetadata(To, From); 1146 } 1147 1148 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, 1149 Instruction *From) { 1150 for (Value *V : To) { 1151 if (Instruction *I = dyn_cast<Instruction>(V)) 1152 addMetadata(I, From); 1153 } 1154 } 1155 1156 PHINode *InnerLoopVectorizer::getReductionResumeValue( 1157 const RecurrenceDescriptor &RdxDesc) { 1158 auto It = ReductionResumeValues.find(&RdxDesc); 1159 assert(It != ReductionResumeValues.end() && 1160 "Expected to find a resume value for the reduction."); 1161 return It->second; 1162 } 1163 1164 namespace llvm { 1165 1166 // Loop vectorization cost-model hints how the scalar epilogue loop should be 1167 // lowered. 1168 enum ScalarEpilogueLowering { 1169 1170 // The default: allowing scalar epilogues. 1171 CM_ScalarEpilogueAllowed, 1172 1173 // Vectorization with OptForSize: don't allow epilogues. 1174 CM_ScalarEpilogueNotAllowedOptSize, 1175 1176 // A special case of vectorisation with OptForSize: loops with a very small 1177 // trip count are considered for vectorization under OptForSize, thereby 1178 // making sure the cost of their loop body is dominant, free of runtime 1179 // guards and scalar iteration overheads. 1180 CM_ScalarEpilogueNotAllowedLowTripLoop, 1181 1182 // Loop hint predicate indicating an epilogue is undesired. 1183 CM_ScalarEpilogueNotNeededUsePredicate, 1184 1185 // Directive indicating we must either tail fold or not vectorize 1186 CM_ScalarEpilogueNotAllowedUsePredicate 1187 }; 1188 1189 /// ElementCountComparator creates a total ordering for ElementCount 1190 /// for the purposes of using it in a set structure. 1191 struct ElementCountComparator { 1192 bool operator()(const ElementCount &LHS, const ElementCount &RHS) const { 1193 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < 1194 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); 1195 } 1196 }; 1197 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>; 1198 1199 /// LoopVectorizationCostModel - estimates the expected speedups due to 1200 /// vectorization. 1201 /// In many cases vectorization is not profitable. This can happen because of 1202 /// a number of reasons. In this class we mainly attempt to predict the 1203 /// expected speedup/slowdowns due to the supported instruction set. We use the 1204 /// TargetTransformInfo to query the different backends for the cost of 1205 /// different operations. 
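///
/// A rough, hypothetical sketch of how a planner is expected to drive this
/// model (the member functions below are the ones declared in this class; the
/// surrounding driver code is simplified and "..." marks omitted details):
/// \code
///   FixedScalableVFPair MaxVF = CM.computeMaxVF(UserVF, UserIC);
///   // ... enumerate the candidate VFs up to MaxVF ...
///   VectorizationFactor BestVF = CM.selectVectorizationFactor(CandidateVFs);
///   unsigned IC = CM.selectInterleaveCount(BestVF.Width, /*LoopCost=*/...);
/// \endcode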
1206 class LoopVectorizationCostModel { 1207 public: 1208 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 1209 PredicatedScalarEvolution &PSE, LoopInfo *LI, 1210 LoopVectorizationLegality *Legal, 1211 const TargetTransformInfo &TTI, 1212 const TargetLibraryInfo *TLI, DemandedBits *DB, 1213 AssumptionCache *AC, 1214 OptimizationRemarkEmitter *ORE, const Function *F, 1215 const LoopVectorizeHints *Hints, 1216 InterleavedAccessInfo &IAI) 1217 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 1218 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 1219 Hints(Hints), InterleaveInfo(IAI) {} 1220 1221 /// \return An upper bound for the vectorization factors (both fixed and 1222 /// scalable). If the factors are 0, vectorization and interleaving should be 1223 /// avoided up front. 1224 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC); 1225 1226 /// \return True if runtime checks are required for vectorization, and false 1227 /// otherwise. 1228 bool runtimeChecksRequired(); 1229 1230 /// \return The most profitable vectorization factor and the cost of that VF. 1231 /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO 1232 /// then this vectorization factor will be selected if vectorization is 1233 /// possible. 1234 VectorizationFactor 1235 selectVectorizationFactor(const ElementCountSet &CandidateVFs); 1236 1237 VectorizationFactor 1238 selectEpilogueVectorizationFactor(const ElementCount MaxVF, 1239 const LoopVectorizationPlanner &LVP); 1240 1241 /// Setup cost-based decisions for user vectorization factor. 1242 /// \return true if the UserVF is a feasible VF to be chosen. 1243 bool selectUserVectorizationFactor(ElementCount UserVF) { 1244 collectUniformsAndScalars(UserVF); 1245 collectInstsToScalarize(UserVF); 1246 return expectedCost(UserVF).first.isValid(); 1247 } 1248 1249 /// \return The size (in bits) of the smallest and widest types in the code 1250 /// that needs to be vectorized. We ignore values that remain scalar such as 1251 /// 64 bit loop indices. 1252 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1253 1254 /// \return The desired interleave count. 1255 /// If interleave count has been specified by metadata it will be returned. 1256 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1257 /// are the selected vectorization factor and the cost of the selected VF. 1258 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); 1259 1260 /// Memory access instruction may be vectorized in more than one way. 1261 /// Form of instruction after vectorization depends on cost. 1262 /// This function takes cost-based decisions for Load/Store instructions 1263 /// and collects them in a map. This decisions map is used for building 1264 /// the lists of loop-uniform and loop-scalar instructions. 1265 /// The calculated cost is saved with widening decision in order to 1266 /// avoid redundant calculations. 1267 void setCostBasedWideningDecision(ElementCount VF); 1268 1269 /// A struct that represents some properties of the register usage 1270 /// of a loop. 1271 struct RegisterUsage { 1272 /// Holds the number of loop invariant values that are used in the loop. 1273 /// The key is ClassID of target-provided register class. 1274 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1275 /// Holds the maximum number of concurrent live intervals in the loop. 1276 /// The key is ClassID of target-provided register class. 
1277 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1278 }; 1279 1280 /// \return Returns information about the register usages of the loop for the 1281 /// given vectorization factors. 1282 SmallVector<RegisterUsage, 8> 1283 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1284 1285 /// Collect values we want to ignore in the cost model. 1286 void collectValuesToIgnore(); 1287 1288 /// Collect all element types in the loop for which widening is needed. 1289 void collectElementTypesForWidening(); 1290 1291 /// Split reductions into those that happen in the loop, and those that happen 1292 /// outside. In loop reductions are collected into InLoopReductionChains. 1293 void collectInLoopReductions(); 1294 1295 /// Returns true if we should use strict in-order reductions for the given 1296 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1297 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1298 /// of FP operations. 1299 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const { 1300 return !Hints->allowReordering() && RdxDesc.isOrdered(); 1301 } 1302 1303 /// \returns The smallest bitwidth each instruction can be represented with. 1304 /// The vector equivalents of these instructions should be truncated to this 1305 /// type. 1306 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1307 return MinBWs; 1308 } 1309 1310 /// \returns True if it is more profitable to scalarize instruction \p I for 1311 /// vectorization factor \p VF. 1312 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1313 assert(VF.isVector() && 1314 "Profitable to scalarize relevant only for VF > 1."); 1315 1316 // Cost model is not run in the VPlan-native path - return conservative 1317 // result until this changes. 1318 if (EnableVPlanNativePath) 1319 return false; 1320 1321 auto Scalars = InstsToScalarize.find(VF); 1322 assert(Scalars != InstsToScalarize.end() && 1323 "VF not yet analyzed for scalarization profitability"); 1324 return Scalars->second.find(I) != Scalars->second.end(); 1325 } 1326 1327 /// Returns true if \p I is known to be uniform after vectorization. 1328 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1329 if (VF.isScalar()) 1330 return true; 1331 1332 // Cost model is not run in the VPlan-native path - return conservative 1333 // result until this changes. 1334 if (EnableVPlanNativePath) 1335 return false; 1336 1337 auto UniformsPerVF = Uniforms.find(VF); 1338 assert(UniformsPerVF != Uniforms.end() && 1339 "VF not yet analyzed for uniformity"); 1340 return UniformsPerVF->second.count(I); 1341 } 1342 1343 /// Returns true if \p I is known to be scalar after vectorization. 1344 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1345 if (VF.isScalar()) 1346 return true; 1347 1348 // Cost model is not run in the VPlan-native path - return conservative 1349 // result until this changes. 1350 if (EnableVPlanNativePath) 1351 return false; 1352 1353 auto ScalarsPerVF = Scalars.find(VF); 1354 assert(ScalarsPerVF != Scalars.end() && 1355 "Scalar values are not calculated for VF"); 1356 return ScalarsPerVF->second.count(I); 1357 } 1358 1359 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1360 /// for vectorization factor \p VF. 
1361 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1362 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1363 !isProfitableToScalarize(I, VF) && 1364 !isScalarAfterVectorization(I, VF); 1365 } 1366 1367 /// Decision that was taken during cost calculation for memory instruction. 1368 enum InstWidening { 1369 CM_Unknown, 1370 CM_Widen, // For consecutive accesses with stride +1. 1371 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1372 CM_Interleave, 1373 CM_GatherScatter, 1374 CM_Scalarize 1375 }; 1376 1377 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1378 /// instruction \p I and vector width \p VF. 1379 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, 1380 InstructionCost Cost) { 1381 assert(VF.isVector() && "Expected VF >=2"); 1382 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1383 } 1384 1385 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1386 /// interleaving group \p Grp and vector width \p VF. 1387 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, 1388 ElementCount VF, InstWidening W, 1389 InstructionCost Cost) { 1390 assert(VF.isVector() && "Expected VF >=2"); 1391 /// Broadcast this decision to all instructions inside the group. 1392 /// But the cost will be assigned to one instruction only. 1393 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1394 if (auto *I = Grp->getMember(i)) { 1395 if (Grp->getInsertPos() == I) 1396 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1397 else 1398 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1399 } 1400 } 1401 } 1402 1403 /// Return the cost model decision for the given instruction \p I and vector 1404 /// width \p VF. Return CM_Unknown if this instruction did not pass 1405 /// through the cost modeling. 1406 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1407 assert(VF.isVector() && "Expected VF to be a vector VF"); 1408 // Cost model is not run in the VPlan-native path - return conservative 1409 // result until this changes. 1410 if (EnableVPlanNativePath) 1411 return CM_GatherScatter; 1412 1413 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1414 auto Itr = WideningDecisions.find(InstOnVF); 1415 if (Itr == WideningDecisions.end()) 1416 return CM_Unknown; 1417 return Itr->second.first; 1418 } 1419 1420 /// Return the vectorization cost for the given instruction \p I and vector 1421 /// width \p VF. 1422 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1423 assert(VF.isVector() && "Expected VF >=2"); 1424 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1425 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1426 "The cost is not calculated"); 1427 return WideningDecisions[InstOnVF].second; 1428 } 1429 1430 /// Return True if instruction \p I is an optimizable truncate whose operand 1431 /// is an induction variable. Such a truncate will be removed by adding a new 1432 /// induction variable with the destination type. 1433 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1434 // If the instruction is not a truncate, return false. 1435 auto *Trunc = dyn_cast<TruncInst>(I); 1436 if (!Trunc) 1437 return false; 1438 1439 // Get the source and destination types of the truncate.
1440 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1441 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1442 1443 // If the truncate is free for the given types, return false. Replacing a 1444 // free truncate with an induction variable would add an induction variable 1445 // update instruction to each iteration of the loop. We exclude from this 1446 // check the primary induction variable since it will need an update 1447 // instruction regardless. 1448 Value *Op = Trunc->getOperand(0); 1449 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1450 return false; 1451 1452 // If the truncated value is not an induction variable, return false. 1453 return Legal->isInductionPhi(Op); 1454 } 1455 1456 /// Collects the instructions to scalarize for each predicated instruction in 1457 /// the loop. 1458 void collectInstsToScalarize(ElementCount VF); 1459 1460 /// Collect Uniform and Scalar values for the given \p VF. 1461 /// The sets depend on CM decision for Load/Store instructions 1462 /// that may be vectorized as interleave, gather-scatter or scalarized. 1463 void collectUniformsAndScalars(ElementCount VF) { 1464 // Do the analysis once. 1465 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1466 return; 1467 setCostBasedWideningDecision(VF); 1468 collectLoopUniforms(VF); 1469 collectLoopScalars(VF); 1470 } 1471 1472 /// Returns true if the target machine supports masked store operation 1473 /// for the given \p DataType and kind of access to \p Ptr. 1474 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1475 return Legal->isConsecutivePtr(DataType, Ptr) && 1476 TTI.isLegalMaskedStore(DataType, Alignment); 1477 } 1478 1479 /// Returns true if the target machine supports masked load operation 1480 /// for the given \p DataType and kind of access to \p Ptr. 1481 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1482 return Legal->isConsecutivePtr(DataType, Ptr) && 1483 TTI.isLegalMaskedLoad(DataType, Alignment); 1484 } 1485 1486 /// Returns true if the target machine can represent \p V as a masked gather 1487 /// or scatter operation. 1488 bool isLegalGatherOrScatter(Value *V, 1489 ElementCount VF = ElementCount::getFixed(1)) { 1490 bool LI = isa<LoadInst>(V); 1491 bool SI = isa<StoreInst>(V); 1492 if (!LI && !SI) 1493 return false; 1494 auto *Ty = getLoadStoreType(V); 1495 Align Align = getLoadStoreAlignment(V); 1496 if (VF.isVector()) 1497 Ty = VectorType::get(Ty, VF); 1498 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1499 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1500 } 1501 1502 /// Returns true if the target machine supports all of the reduction 1503 /// variables found for the given VF. 1504 bool canVectorizeReductions(ElementCount VF) const { 1505 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1506 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1507 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1508 })); 1509 } 1510 1511 /// Returns true if \p I is an instruction that will be scalarized with 1512 /// predication when vectorizing \p I with vectorization factor \p VF. Such 1513 /// instructions include conditional stores and instructions that may divide 1514 /// by zero. 1515 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1516 1517 // Returns true if \p I is an instruction that will be predicated either 1518 // through scalar predication or masked load/store or masked gather/scatter. 
1519 // \p VF is the vectorization factor that will be used to vectorize \p I. 1520 // Superset of instructions that return true for isScalarWithPredication. 1521 bool isPredicatedInst(Instruction *I, ElementCount VF, 1522 bool IsKnownUniform = false) { 1523 // When we know the load is uniform and the original scalar loop was not 1524 // predicated we don't need to mark it as a predicated instruction. Any 1525 // vectorised blocks created when tail-folding are something artificial we 1526 // have introduced and we know there is always at least one active lane. 1527 // That's why we call Legal->blockNeedsPredication here because it doesn't 1528 // query tail-folding. 1529 if (IsKnownUniform && isa<LoadInst>(I) && 1530 !Legal->blockNeedsPredication(I->getParent())) 1531 return false; 1532 if (!blockNeedsPredicationForAnyReason(I->getParent())) 1533 return false; 1534 // Loads and stores that need some form of masked operation are predicated 1535 // instructions. 1536 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1537 return Legal->isMaskRequired(I); 1538 return isScalarWithPredication(I, VF); 1539 } 1540 1541 /// Returns true if \p I is a memory instruction with consecutive memory 1542 /// access that can be widened. 1543 bool 1544 memoryInstructionCanBeWidened(Instruction *I, 1545 ElementCount VF = ElementCount::getFixed(1)); 1546 1547 /// Returns true if \p I is a memory instruction in an interleaved-group 1548 /// of memory accesses that can be vectorized with wide vector loads/stores 1549 /// and shuffles. 1550 bool 1551 interleavedAccessCanBeWidened(Instruction *I, 1552 ElementCount VF = ElementCount::getFixed(1)); 1553 1554 /// Check if \p Instr belongs to any interleaved access group. 1555 bool isAccessInterleaved(Instruction *Instr) { 1556 return InterleaveInfo.isInterleaved(Instr); 1557 } 1558 1559 /// Get the interleaved access group that \p Instr belongs to. 1560 const InterleaveGroup<Instruction> * 1561 getInterleavedAccessGroup(Instruction *Instr) { 1562 return InterleaveInfo.getInterleaveGroup(Instr); 1563 } 1564 1565 /// Returns true if we're required to use a scalar epilogue for at least 1566 /// the final iteration of the original loop. 1567 bool requiresScalarEpilogue(ElementCount VF) const { 1568 if (!isScalarEpilogueAllowed()) 1569 return false; 1570 // If we might exit from anywhere but the latch, we must run the exiting 1571 // iteration in scalar form. 1572 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1573 return true; 1574 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); 1575 } 1576 1577 /// Returns true if a scalar epilogue is allowed, i.e. it has not been 1578 /// disallowed due to optsize or a loop hint annotation. 1579 bool isScalarEpilogueAllowed() const { 1580 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1581 } 1582 1583 /// Returns true if all loop blocks should be masked to fold the tail loop. 1584 bool foldTailByMasking() const { return FoldTailByMasking; } 1585 1586 /// Returns true if the instructions in this block require predication 1587 /// for any reason, e.g. because tail folding now requires a predicate 1588 /// or because the block in the original loop was predicated. 1589 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1590 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1591 } 1592 1593 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1594 /// nodes to the chain of instructions representing the reductions. Uses a 1595 /// MapVector to ensure deterministic iteration order.
1596 using ReductionChainMap = 1597 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1598 1599 /// Return the chain of instructions representing an inloop reduction. 1600 const ReductionChainMap &getInLoopReductionChains() const { 1601 return InLoopReductionChains; 1602 } 1603 1604 /// Returns true if the Phi is part of an inloop reduction. 1605 bool isInLoopReduction(PHINode *Phi) const { 1606 return InLoopReductionChains.count(Phi); 1607 } 1608 1609 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1610 /// with factor VF. Return the cost of the instruction, including 1611 /// scalarization overhead if it's needed. 1612 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1613 1614 /// Estimate cost of a call instruction CI if it were vectorized with factor 1615 /// VF. Return the cost of the instruction, including scalarization overhead 1616 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1617 /// scalarized - 1618 /// i.e. either vector version isn't available, or is too expensive. 1619 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1620 bool &NeedToScalarize) const; 1621 1622 /// Returns true if the per-lane cost of VectorizationFactor A is lower than 1623 /// that of B. 1624 bool isMoreProfitable(const VectorizationFactor &A, 1625 const VectorizationFactor &B) const; 1626 1627 /// Invalidates decisions already taken by the cost model. 1628 void invalidateCostModelingDecisions() { 1629 WideningDecisions.clear(); 1630 Uniforms.clear(); 1631 Scalars.clear(); 1632 } 1633 1634 private: 1635 unsigned NumPredStores = 0; 1636 1637 /// Convenience function that returns the value of vscale_range iff 1638 /// vscale_range.min == vscale_range.max or otherwise returns the value 1639 /// returned by the corresponding TLI method. 1640 Optional<unsigned> getVScaleForTuning() const; 1641 1642 /// \return An upper bound for the vectorization factors for both 1643 /// fixed and scalable vectorization, where the minimum-known number of 1644 /// elements is a power-of-2 larger than zero. If scalable vectorization is 1645 /// disabled or unsupported, then the scalable part will be equal to 1646 /// ElementCount::getScalable(0). 1647 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, 1648 ElementCount UserVF, 1649 bool FoldTailByMasking); 1650 1651 /// \return the maximized element count based on the targets vector 1652 /// registers and the loop trip-count, but limited to a maximum safe VF. 1653 /// This is a helper function of computeFeasibleMaxVF. 1654 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, 1655 unsigned SmallestType, 1656 unsigned WidestType, 1657 ElementCount MaxSafeVF, 1658 bool FoldTailByMasking); 1659 1660 /// \return the maximum legal scalable VF, based on the safe max number 1661 /// of elements. 1662 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1663 1664 /// The vectorization cost is a combination of the cost itself and a boolean 1665 /// indicating whether any of the contributing operations will actually 1666 /// operate on vector values after type legalization in the backend. If this 1667 /// latter value is false, then all operations will be scalarized (i.e. no 1668 /// vectorization has actually taken place). 1669 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1670 1671 /// Returns the expected execution cost. 
The unit of the cost does 1672 /// not matter because we use the 'cost' units to compare different 1673 /// vector widths. The cost that is returned is *not* normalized by 1674 /// the factor width. If \p Invalid is not nullptr, this function 1675 /// will add a pair(Instruction*, ElementCount) to \p Invalid for 1676 /// each instruction that has an Invalid cost for the given VF. 1677 using InstructionVFPair = std::pair<Instruction *, ElementCount>; 1678 VectorizationCostTy 1679 expectedCost(ElementCount VF, 1680 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); 1681 1682 /// Returns the execution time cost of an instruction for a given vector 1683 /// width. Vector width of one means scalar. 1684 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1685 1686 /// The cost-computation logic from getInstructionCost which provides 1687 /// the vector type as an output parameter. 1688 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1689 Type *&VectorTy); 1690 1691 /// Return the cost of instructions in an inloop reduction pattern, if I is 1692 /// part of that pattern. 1693 Optional<InstructionCost> 1694 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, 1695 TTI::TargetCostKind CostKind); 1696 1697 /// Calculate vectorization cost of memory instruction \p I. 1698 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1699 1700 /// The cost computation for scalarized memory instruction. 1701 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1702 1703 /// The cost computation for interleaving group of memory instructions. 1704 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1705 1706 /// The cost computation for Gather/Scatter instruction. 1707 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1708 1709 /// The cost computation for widening instruction \p I with consecutive 1710 /// memory access. 1711 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1712 1713 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1714 /// Load: scalar load + broadcast. 1715 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1716 /// element) 1717 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1718 1719 /// Estimate the overhead of scalarizing an instruction. This is a 1720 /// convenience wrapper for the type-based getScalarizationOverhead API. 1721 InstructionCost getScalarizationOverhead(Instruction *I, 1722 ElementCount VF) const; 1723 1724 /// Returns whether the instruction is a load or store and will be emitted 1725 /// as a vector operation. 1726 bool isConsecutiveLoadOrStore(Instruction *I); 1727 1728 /// Returns true if an artificially high cost for emulated masked memrefs 1729 /// should be used. 1730 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); 1731 1732 /// Map of scalar integer values to the smallest bitwidth they can be legally 1733 /// represented as. The vector equivalents of these values should be truncated 1734 /// to this type. 1735 MapVector<Instruction *, uint64_t> MinBWs; 1736 1737 /// A type representing the costs for instructions if they were to be 1738 /// scalarized rather than vectorized. The entries are Instruction-Cost 1739 /// pairs.
1740 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; 1741 1742 /// A set containing all BasicBlocks that are known to be present after 1743 /// vectorization as a predicated block. 1744 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; 1745 1746 /// Records whether it is allowed to have the original scalar loop execute at 1747 /// least once. This may be needed as a fallback loop in case runtime 1748 /// aliasing/dependence checks fail, or to handle the tail/remainder 1749 /// iterations when the trip count is unknown or doesn't divide by the VF, 1750 /// or as a peel-loop to handle gaps in interleave-groups. 1751 /// Under optsize and when the trip count is very small we don't allow any 1752 /// iterations to execute in the scalar loop. 1753 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1754 1755 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1756 bool FoldTailByMasking = false; 1757 1758 /// A map holding scalar costs for different vectorization factors. The 1759 /// presence of a cost for an instruction in the mapping indicates that the 1760 /// instruction will be scalarized when vectorizing with the associated 1761 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1762 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1763 1764 /// Holds the instructions known to be uniform after vectorization. 1765 /// The data is collected per VF. 1766 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1767 1768 /// Holds the instructions known to be scalar after vectorization. 1769 /// The data is collected per VF. 1770 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1771 1772 /// Holds the instructions (address computations) that are forced to be 1773 /// scalarized. 1774 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1775 1776 /// PHINodes of the reductions that should be expanded in-loop along with 1777 /// their associated chains of reduction operations, in program order from top 1778 /// (PHI) to bottom. 1779 ReductionChainMap InLoopReductionChains; 1780 1781 /// A Map of inloop reduction operations and their immediate chain operand. 1782 /// FIXME: This can be removed once reductions can be costed correctly in 1783 /// vplan. This was added to allow quick lookup to the inloop operations, 1784 /// without having to loop through InLoopReductionChains. 1785 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1786 1787 /// Returns the expected difference in cost from scalarizing the expression 1788 /// feeding a predicated instruction \p PredInst. The instructions to 1789 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1790 /// non-negative return value implies the expression will be scalarized. 1791 /// Currently, only single-use chains are considered for scalarization. 1792 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1793 ElementCount VF); 1794 1795 /// Collect the instructions that are uniform after vectorization. An 1796 /// instruction is uniform if we represent it with a single scalar value in 1797 /// the vectorized loop corresponding to each vector iteration. Examples of 1798 /// uniform instructions include pointer operands of consecutive or 1799 /// interleaved memory accesses. Note that although uniformity implies an 1800 /// instruction will be scalar, the reverse is not true.
In general, a 1801 /// scalarized instruction will be represented by VF scalar values in the 1802 /// vectorized loop, each corresponding to an iteration of the original 1803 /// scalar loop. 1804 void collectLoopUniforms(ElementCount VF); 1805 1806 /// Collect the instructions that are scalar after vectorization. An 1807 /// instruction is scalar if it is known to be uniform or will be scalarized 1808 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1809 /// to the list if they are used by a load/store instruction that is marked as 1810 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1811 /// VF values in the vectorized loop, each corresponding to an iteration of 1812 /// the original scalar loop. 1813 void collectLoopScalars(ElementCount VF); 1814 1815 /// Keeps cost model vectorization decision and cost for instructions. 1816 /// Right now it is used for memory instructions only. 1817 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1818 std::pair<InstWidening, InstructionCost>>; 1819 1820 DecisionList WideningDecisions; 1821 1822 /// Returns true if \p V is expected to be vectorized and it needs to be 1823 /// extracted. 1824 bool needsExtract(Value *V, ElementCount VF) const { 1825 Instruction *I = dyn_cast<Instruction>(V); 1826 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1827 TheLoop->isLoopInvariant(I)) 1828 return false; 1829 1830 // Assume we can vectorize V (and hence we need extraction) if the 1831 // scalars are not computed yet. This can happen, because it is called 1832 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1833 // the scalars are collected. That should be a safe assumption in most 1834 // cases, because we check if the operands have vectorizable types 1835 // beforehand in LoopVectorizationLegality. 1836 return Scalars.find(VF) == Scalars.end() || 1837 !isScalarAfterVectorization(I, VF); 1838 }; 1839 1840 /// Returns a range containing only operands needing to be extracted. 1841 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1842 ElementCount VF) const { 1843 return SmallVector<Value *, 4>(make_filter_range( 1844 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1845 } 1846 1847 /// Determines if we have the infrastructure to vectorize loop \p L and its 1848 /// epilogue, assuming the main loop is vectorized by \p VF. 1849 bool isCandidateForEpilogueVectorization(const Loop &L, 1850 const ElementCount VF) const; 1851 1852 /// Returns true if epilogue vectorization is considered profitable, and 1853 /// false otherwise. 1854 /// \p VF is the vectorization factor chosen for the original loop. 1855 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1856 1857 public: 1858 /// The loop that we evaluate. 1859 Loop *TheLoop; 1860 1861 /// Predicated scalar evolution analysis. 1862 PredicatedScalarEvolution &PSE; 1863 1864 /// Loop Info analysis. 1865 LoopInfo *LI; 1866 1867 /// Vectorization legality. 1868 LoopVectorizationLegality *Legal; 1869 1870 /// Vector target information. 1871 const TargetTransformInfo &TTI; 1872 1873 /// Target Library Info. 1874 const TargetLibraryInfo *TLI; 1875 1876 /// Demanded bits analysis. 1877 DemandedBits *DB; 1878 1879 /// Assumption cache. 1880 AssumptionCache *AC; 1881 1882 /// Interface to emit optimization remarks. 1883 OptimizationRemarkEmitter *ORE; 1884 1885 const Function *TheFunction; 1886 1887 /// Loop Vectorize Hint. 
1888 const LoopVectorizeHints *Hints; 1889 1890 /// The interleave access information contains groups of interleaved accesses 1891 /// with the same stride and close to each other. 1892 InterleavedAccessInfo &InterleaveInfo; 1893 1894 /// Values to ignore in the cost model. 1895 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1896 1897 /// Values to ignore in the cost model when VF > 1. 1898 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1899 1900 /// All element types found in the loop. 1901 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1902 1903 /// Profitable vector factors. 1904 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1905 }; 1906 } // end namespace llvm 1907 1908 /// Helper struct to manage generating runtime checks for vectorization. 1909 /// 1910 /// The runtime checks are created up-front in temporary blocks to allow better 1911 /// estimation of their cost, and are un-linked from the existing IR. After deciding to 1912 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1913 /// temporary blocks are completely removed. 1914 class GeneratedRTChecks { 1915 /// Basic block which contains the generated SCEV checks, if any. 1916 BasicBlock *SCEVCheckBlock = nullptr; 1917 1918 /// The value representing the result of the generated SCEV checks. If it is 1919 /// nullptr, either no SCEV checks have been generated or they have been used. 1920 Value *SCEVCheckCond = nullptr; 1921 1922 /// Basic block which contains the generated memory runtime checks, if any. 1923 BasicBlock *MemCheckBlock = nullptr; 1924 1925 /// The value representing the result of the generated memory runtime checks. 1926 /// If it is nullptr, either no memory runtime checks have been generated or 1927 /// they have been used. 1928 Value *MemRuntimeCheckCond = nullptr; 1929 1930 DominatorTree *DT; 1931 LoopInfo *LI; 1932 1933 SCEVExpander SCEVExp; 1934 SCEVExpander MemCheckExp; 1935 1936 public: 1937 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1938 const DataLayout &DL) 1939 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), 1940 MemCheckExp(SE, DL, "scev.check") {} 1941 1942 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1943 /// accurately estimate the cost of the runtime checks. The blocks are 1944 /// un-linked from the IR and are added back during vector code generation. If 1945 /// there is no vector code generation, the check blocks are removed 1946 /// completely. 1947 void Create(Loop *L, const LoopAccessInfo &LAI, 1948 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { 1949 1950 BasicBlock *LoopHeader = L->getHeader(); 1951 BasicBlock *Preheader = L->getLoopPreheader(); 1952 1953 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1954 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1955 // may be used by SCEVExpander. The blocks will be un-linked from their 1956 // predecessors and removed from LI & DT at the end of the function. 1957 if (!UnionPred.isAlwaysTrue()) { 1958 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1959 nullptr, "vector.scevcheck"); 1960 1961 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1962 &UnionPred, SCEVCheckBlock->getTerminator()); 1963 } 1964 1965 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1966 if (RtPtrChecking.Need) { 1967 auto *Pred = SCEVCheckBlock ?
SCEVCheckBlock : Preheader; 1968 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1969 "vector.memcheck"); 1970 1971 auto DiffChecks = RtPtrChecking.getDiffChecks(); 1972 if (DiffChecks) { 1973 MemRuntimeCheckCond = addDiffRuntimeChecks( 1974 MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp, 1975 [VF](IRBuilderBase &B, unsigned Bits) { 1976 return getRuntimeVF(B, B.getIntNTy(Bits), VF); 1977 }, 1978 IC); 1979 } else { 1980 MemRuntimeCheckCond = 1981 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1982 RtPtrChecking.getChecks(), MemCheckExp); 1983 } 1984 assert(MemRuntimeCheckCond && 1985 "no RT checks generated although RtPtrChecking " 1986 "claimed checks are required"); 1987 } 1988 1989 if (!MemCheckBlock && !SCEVCheckBlock) 1990 return; 1991 1992 // Unhook the temporary block with the checks, update various places 1993 // accordingly. 1994 if (SCEVCheckBlock) 1995 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1996 if (MemCheckBlock) 1997 MemCheckBlock->replaceAllUsesWith(Preheader); 1998 1999 if (SCEVCheckBlock) { 2000 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2001 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 2002 Preheader->getTerminator()->eraseFromParent(); 2003 } 2004 if (MemCheckBlock) { 2005 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2006 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2007 Preheader->getTerminator()->eraseFromParent(); 2008 } 2009 2010 DT->changeImmediateDominator(LoopHeader, Preheader); 2011 if (MemCheckBlock) { 2012 DT->eraseNode(MemCheckBlock); 2013 LI->removeBlock(MemCheckBlock); 2014 } 2015 if (SCEVCheckBlock) { 2016 DT->eraseNode(SCEVCheckBlock); 2017 LI->removeBlock(SCEVCheckBlock); 2018 } 2019 } 2020 2021 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2022 /// unused. 2023 ~GeneratedRTChecks() { 2024 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2025 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2026 if (!SCEVCheckCond) 2027 SCEVCleaner.markResultUsed(); 2028 2029 if (!MemRuntimeCheckCond) 2030 MemCheckCleaner.markResultUsed(); 2031 2032 if (MemRuntimeCheckCond) { 2033 auto &SE = *MemCheckExp.getSE(); 2034 // Memory runtime check generation creates compares that use expanded 2035 // values. Remove them before running the SCEVExpanderCleaners. 2036 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2037 if (MemCheckExp.isInsertedInstruction(&I)) 2038 continue; 2039 SE.forgetValue(&I); 2040 I.eraseFromParent(); 2041 } 2042 } 2043 MemCheckCleaner.cleanup(); 2044 SCEVCleaner.cleanup(); 2045 2046 if (SCEVCheckCond) 2047 SCEVCheckBlock->eraseFromParent(); 2048 if (MemRuntimeCheckCond) 2049 MemCheckBlock->eraseFromParent(); 2050 } 2051 2052 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2053 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2054 /// depending on the generated condition. 2055 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2056 BasicBlock *LoopVectorPreHeader, 2057 BasicBlock *LoopExitBlock) { 2058 if (!SCEVCheckCond) 2059 return nullptr; 2060 2061 Value *Cond = SCEVCheckCond; 2062 // Mark the check as used, to prevent it from being removed during cleanup. 
2063 SCEVCheckCond = nullptr; 2064 if (auto *C = dyn_cast<ConstantInt>(Cond)) 2065 if (C->isZero()) 2066 return nullptr; 2067 2068 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2069 2070 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2071 // Create new preheader for vector loop. 2072 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2073 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2074 2075 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2076 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2077 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2078 SCEVCheckBlock); 2079 2080 DT->addNewBlock(SCEVCheckBlock, Pred); 2081 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2082 2083 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), 2084 BranchInst::Create(Bypass, LoopVectorPreHeader, Cond)); 2085 return SCEVCheckBlock; 2086 } 2087 2088 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2089 /// the branches to branch to the vector preheader or \p Bypass, depending on 2090 /// the generated condition. 2091 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2092 BasicBlock *LoopVectorPreHeader) { 2093 // Check if we generated code that checks in runtime if arrays overlap. 2094 if (!MemRuntimeCheckCond) 2095 return nullptr; 2096 2097 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2098 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2099 MemCheckBlock); 2100 2101 DT->addNewBlock(MemCheckBlock, Pred); 2102 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2103 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2104 2105 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2106 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2107 2108 ReplaceInstWithInst( 2109 MemCheckBlock->getTerminator(), 2110 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2111 MemCheckBlock->getTerminator()->setDebugLoc( 2112 Pred->getTerminator()->getDebugLoc()); 2113 2114 // Mark the check as used, to prevent it from being removed during cleanup. 2115 MemRuntimeCheckCond = nullptr; 2116 return MemCheckBlock; 2117 } 2118 }; 2119 2120 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2121 // vectorization. The loop needs to be annotated with #pragma omp simd 2122 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2123 // vector length information is not provided, vectorization is not considered 2124 // explicit. Interleave hints are not allowed either. These limitations will be 2125 // relaxed in the future. 2126 // Please, note that we are currently forced to abuse the pragma 'clang 2127 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2128 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2129 // provides *explicit vectorization hints* (LV can bypass legal checks and 2130 // assume that vectorization is legal). However, both hints are implemented 2131 // using the same metadata (llvm.loop.vectorize, processed by 2132 // LoopVectorizeHints). This will be fixed in the future when the native IR 2133 // representation for pragma 'omp simd' is introduced. 2134 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2135 OptimizationRemarkEmitter *ORE) { 2136 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2137 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2138 2139 // Only outer loops with an explicit vectorization hint are supported. 
2140 // Unannotated outer loops are ignored. 2141 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2142 return false; 2143 2144 Function *Fn = OuterLp->getHeader()->getParent(); 2145 if (!Hints.allowVectorization(Fn, OuterLp, 2146 true /*VectorizeOnlyWhenForced*/)) { 2147 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2148 return false; 2149 } 2150 2151 if (Hints.getInterleave() > 1) { 2152 // TODO: Interleave support is future work. 2153 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2154 "outer loops.\n"); 2155 Hints.emitRemarkWithHints(); 2156 return false; 2157 } 2158 2159 return true; 2160 } 2161 2162 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2163 OptimizationRemarkEmitter *ORE, 2164 SmallVectorImpl<Loop *> &V) { 2165 // Collect inner loops and outer loops without irreducible control flow. For 2166 // now, only collect outer loops that have explicit vectorization hints. If we 2167 // are stress testing the VPlan H-CFG construction, we collect the outermost 2168 // loop of every loop nest. 2169 if (L.isInnermost() || VPlanBuildStressTest || 2170 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2171 LoopBlocksRPO RPOT(&L); 2172 RPOT.perform(LI); 2173 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2174 V.push_back(&L); 2175 // TODO: Collect inner loops inside marked outer loops in case 2176 // vectorization fails for the outer loop. Do not invoke 2177 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2178 // already known to be reducible. We can use an inherited attribute for 2179 // that. 2180 return; 2181 } 2182 } 2183 for (Loop *InnerL : L) 2184 collectSupportedLoops(*InnerL, LI, ORE, V); 2185 } 2186 2187 namespace { 2188 2189 /// The LoopVectorize Pass. 2190 struct LoopVectorize : public FunctionPass { 2191 /// Pass identification, replacement for typeid 2192 static char ID; 2193 2194 LoopVectorizePass Impl; 2195 2196 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2197 bool VectorizeOnlyWhenForced = false) 2198 : FunctionPass(ID), 2199 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2200 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2201 } 2202 2203 bool runOnFunction(Function &F) override { 2204 if (skipFunction(F)) 2205 return false; 2206 2207 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2208 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2209 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2210 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2211 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2212 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2213 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2214 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2215 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2216 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2217 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2218 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2219 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2220 2221 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2222 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2223 2224 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2225 GetLAA, *ORE, PSI).MadeAnyChange; 2226 } 2227 2228 void getAnalysisUsage(AnalysisUsage &AU) const override { 2229 AU.addRequired<AssumptionCacheTracker>(); 2230 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2231 AU.addRequired<DominatorTreeWrapperPass>(); 2232 AU.addRequired<LoopInfoWrapperPass>(); 2233 AU.addRequired<ScalarEvolutionWrapperPass>(); 2234 AU.addRequired<TargetTransformInfoWrapperPass>(); 2235 AU.addRequired<AAResultsWrapperPass>(); 2236 AU.addRequired<LoopAccessLegacyAnalysis>(); 2237 AU.addRequired<DemandedBitsWrapperPass>(); 2238 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2239 AU.addRequired<InjectTLIMappingsLegacy>(); 2240 2241 // We currently do not preserve loopinfo/dominator analyses with outer loop 2242 // vectorization. Until this is addressed, mark these analyses as preserved 2243 // only for non-VPlan-native path. 2244 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2245 if (!EnableVPlanNativePath) { 2246 AU.addPreserved<LoopInfoWrapperPass>(); 2247 AU.addPreserved<DominatorTreeWrapperPass>(); 2248 } 2249 2250 AU.addPreserved<BasicAAWrapperPass>(); 2251 AU.addPreserved<GlobalsAAWrapperPass>(); 2252 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2253 } 2254 }; 2255 2256 } // end anonymous namespace 2257 2258 //===----------------------------------------------------------------------===// 2259 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer, 2260 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2261 //===----------------------------------------------------------------------===// 2262 2263 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2264 // We need to place the broadcast of invariant variables outside the loop, 2265 // but only if it's proven safe to do so. Otherwise, the broadcast will be 2266 // inside the vector loop body. 2267 Instruction *Instr = dyn_cast<Instruction>(V); 2268 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2269 (!Instr || 2270 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2271 // Place the code for broadcasting invariant variables in the new preheader. 2272 IRBuilder<>::InsertPointGuard Guard(Builder); 2273 if (SafeToHoist) 2274 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2275 2276 // Broadcast the scalar into all locations in the vector. 2277 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2278 2279 return Shuf; 2280 } 2281 2282 /// This function adds 2283 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2284 /// to each vector element of Val. The sequence starts at StartIdx. 2285 /// \p BinOp is relevant for FP induction variables.
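/// For illustration (a sketch added for this listing, not from the original
/// source): with an integer induction, VF = 4, StartIdx = 0 and step %s, the
/// code below conceptually emits
///   %init   = <0, 1, 2, 3>          ; stepvector plus splat(StartIdx)
///   %scaled = mul %init, splat(%s)
///   %result = add %Val, %scaled
/// so lane i receives Val[i] + (StartIdx + i) * Step. For FP inductions the
/// multiply is an fmul and the final combine uses the FAdd/FSub opcode passed
/// in \p BinOp.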
2286 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2287 Instruction::BinaryOps BinOp, ElementCount VF, 2288 IRBuilderBase &Builder) { 2289 assert(VF.isVector() && "only vector VFs are supported"); 2290 2291 // Create and check the types. 2292 auto *ValVTy = cast<VectorType>(Val->getType()); 2293 ElementCount VLen = ValVTy->getElementCount(); 2294 2295 Type *STy = Val->getType()->getScalarType(); 2296 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2297 "Induction Step must be an integer or FP"); 2298 assert(Step->getType() == STy && "Step has wrong type"); 2299 2300 SmallVector<Constant *, 8> Indices; 2301 2302 // Create a vector of consecutive numbers from zero to VF. 2303 VectorType *InitVecValVTy = ValVTy; 2304 if (STy->isFloatingPointTy()) { 2305 Type *InitVecValSTy = 2306 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2307 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2308 } 2309 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2310 2311 // Splat the StartIdx 2312 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2313 2314 if (STy->isIntegerTy()) { 2315 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2316 Step = Builder.CreateVectorSplat(VLen, Step); 2317 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2318 // FIXME: The newly created binary instructions should contain nsw/nuw 2319 // flags, which can be found from the original scalar operations. 2320 Step = Builder.CreateMul(InitVec, Step); 2321 return Builder.CreateAdd(Val, Step, "induction"); 2322 } 2323 2324 // Floating point induction. 2325 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2326 "Binary Opcode should be specified for FP induction"); 2327 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2328 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2329 2330 Step = Builder.CreateVectorSplat(VLen, Step); 2331 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2332 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2333 } 2334 2335 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 2336 /// variable on which to base the steps, \p Step is the size of the step. 2337 static void buildScalarSteps(Value *ScalarIV, Value *Step, 2338 const InductionDescriptor &ID, VPValue *Def, 2339 VPTransformState &State) { 2340 IRBuilderBase &Builder = State.Builder; 2341 // We shouldn't have to build scalar steps if we aren't vectorizing. 2342 assert(State.VF.isVector() && "VF should be greater than one"); 2343 // Get the value type and ensure it and the step have the same integer type. 2344 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2345 assert(ScalarIVTy == Step->getType() && 2346 "Val and Step should have the same type"); 2347 2348 // We build scalar steps for both integer and floating-point induction 2349 // variables. Here, we determine the kind of arithmetic we will perform. 2350 Instruction::BinaryOps AddOp; 2351 Instruction::BinaryOps MulOp; 2352 if (ScalarIVTy->isIntegerTy()) { 2353 AddOp = Instruction::Add; 2354 MulOp = Instruction::Mul; 2355 } else { 2356 AddOp = ID.getInductionOpcode(); 2357 MulOp = Instruction::FMul; 2358 } 2359 2360 // Determine the number of scalars we need to generate for each unroll 2361 // iteration. 2362 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); 2363 unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 2364 // Compute the scalar steps and save the results in State. 
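  // (Illustrative note added for this listing, not part of the original
  // source: for unroll part Part and lane Lane, the per-lane value computed
  // below is conceptually
  //   ScalarIV AddOp ((Part * VF + Lane) MulOp Step)
  // i.e. ScalarIV + (Part * VF + Lane) * Step for integer inductions, with
  // the AddOp/MulOp opcodes selected above covering the FP case.)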
2365 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2366 ScalarIVTy->getScalarSizeInBits()); 2367 Type *VecIVTy = nullptr; 2368 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2369 if (!FirstLaneOnly && State.VF.isScalable()) { 2370 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2371 UnitStepVec = 2372 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2373 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2374 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2375 } 2376 2377 for (unsigned Part = 0; Part < State.UF; ++Part) { 2378 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2379 2380 if (!FirstLaneOnly && State.VF.isScalable()) { 2381 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2382 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2383 if (ScalarIVTy->isFloatingPointTy()) 2384 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2385 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2386 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2387 State.set(Def, Add, Part); 2388 // It's useful to record the lane values too for the known minimum number 2389 // of elements so we do those below. This improves the code quality when 2390 // trying to extract the first element, for example. 2391 } 2392 2393 if (ScalarIVTy->isFloatingPointTy()) 2394 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2395 2396 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2397 Value *StartIdx = Builder.CreateBinOp( 2398 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2399 // The step returned by `createStepForVF` is a runtime-evaluated value 2400 // when VF is scalable. Otherwise, it should be folded into a Constant. 2401 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2402 "Expected StartIdx to be folded to a constant when VF is not " 2403 "scalable"); 2404 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2405 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2406 State.set(Def, Add, VPIteration(Part, Lane)); 2407 } 2408 } 2409 } 2410 2411 // Generate code for the induction step. Note that induction steps are 2412 // required to be loop-invariant 2413 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, 2414 Instruction *InsertBefore, 2415 Loop *OrigLoop = nullptr) { 2416 const DataLayout &DL = SE.getDataLayout(); 2417 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && 2418 "Induction step should be loop invariant"); 2419 if (auto *E = dyn_cast<SCEVUnknown>(Step)) 2420 return E->getValue(); 2421 2422 SCEVExpander Exp(SE, DL, "induction"); 2423 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); 2424 } 2425 2426 /// Compute the transformed value of Index at offset StartValue using step 2427 /// StepValue. 2428 /// For integer induction, returns StartValue + Index * StepValue. 2429 /// For pointer induction, returns StartValue[Index * StepValue]. 2430 /// FIXME: The newly created binary instructions should contain nsw/nuw 2431 /// flags, which can be found from the original scalar operations. 2432 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, 2433 Value *StartValue, Value *Step, 2434 const InductionDescriptor &ID) { 2435 assert(Index->getType()->getScalarType() == Step->getType() && 2436 "Index scalar type does not match StepValue type"); 2437 2438 // Note: the IR at this point is broken. 
We cannot use SE to create any new 2439 // SCEV and then expand it, hoping that SCEV's simplification will give us 2440 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2441 // lead to various SCEV crashes. So all we can do is to use builder and rely 2442 // on InstCombine for future simplifications. Here we handle some trivial 2443 // cases only. 2444 auto CreateAdd = [&B](Value *X, Value *Y) { 2445 assert(X->getType() == Y->getType() && "Types don't match!"); 2446 if (auto *CX = dyn_cast<ConstantInt>(X)) 2447 if (CX->isZero()) 2448 return Y; 2449 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2450 if (CY->isZero()) 2451 return X; 2452 return B.CreateAdd(X, Y); 2453 }; 2454 2455 // We allow X to be a vector type, in which case Y will potentially be 2456 // splatted into a vector with the same element count. 2457 auto CreateMul = [&B](Value *X, Value *Y) { 2458 assert(X->getType()->getScalarType() == Y->getType() && 2459 "Types don't match!"); 2460 if (auto *CX = dyn_cast<ConstantInt>(X)) 2461 if (CX->isOne()) 2462 return Y; 2463 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2464 if (CY->isOne()) 2465 return X; 2466 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2467 if (XVTy && !isa<VectorType>(Y->getType())) 2468 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2469 return B.CreateMul(X, Y); 2470 }; 2471 2472 switch (ID.getKind()) { 2473 case InductionDescriptor::IK_IntInduction: { 2474 assert(!isa<VectorType>(Index->getType()) && 2475 "Vector indices not supported for integer inductions yet"); 2476 assert(Index->getType() == StartValue->getType() && 2477 "Index type does not match StartValue type"); 2478 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2479 return B.CreateSub(StartValue, Index); 2480 auto *Offset = CreateMul(Index, Step); 2481 return CreateAdd(StartValue, Offset); 2482 } 2483 case InductionDescriptor::IK_PtrInduction: { 2484 assert(isa<Constant>(Step) && 2485 "Expected constant step for pointer induction"); 2486 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); 2487 } 2488 case InductionDescriptor::IK_FpInduction: { 2489 assert(!isa<VectorType>(Index->getType()) && 2490 "Vector indices not supported for FP inductions yet"); 2491 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2492 auto InductionBinOp = ID.getInductionBinOp(); 2493 assert(InductionBinOp && 2494 (InductionBinOp->getOpcode() == Instruction::FAdd || 2495 InductionBinOp->getOpcode() == Instruction::FSub) && 2496 "Original bin op should be defined for FP induction"); 2497 2498 Value *MulExp = B.CreateFMul(Step, Index); 2499 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2500 "induction"); 2501 } 2502 case InductionDescriptor::IK_NoInduction: 2503 return nullptr; 2504 } 2505 llvm_unreachable("invalid enum"); 2506 } 2507 2508 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2509 const VPIteration &Instance, 2510 VPTransformState &State) { 2511 Value *ScalarInst = State.get(Def, Instance); 2512 Value *VectorValue = State.get(Def, Instance.Part); 2513 VectorValue = Builder.CreateInsertElement( 2514 VectorValue, ScalarInst, 2515 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2516 State.set(Def, VectorValue, Instance.Part); 2517 } 2518 2519 // Return whether we allow using masked interleave-groups (for dealing with 2520 // strided loads/stores that reside in predicated blocks, or for dealing 2521 // with gaps). 
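// Illustrative example (added for this listing, not from the original source):
// with an interleave factor of 2 and VF = 4, a predicated interleaved load is
// emitted by vectorizeInterleaveGroup below as a single masked wide load whose
// mask is the block mask replicated per member, i.e.
//   <m0, m0, m1, m1, m2, m2, m3, m3>
// optionally AND'ed with a gap mask when the group has missing members.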
2522 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2523 // If an override option has been passed in for interleaved accesses, use it. 2524 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2525 return EnableMaskedInterleavedMemAccesses; 2526 2527 return TTI.enableMaskedInterleavedAccessVectorization(); 2528 } 2529 2530 // Try to vectorize the interleave group that \p Instr belongs to. 2531 // 2532 // E.g. Translate following interleaved load group (factor = 3): 2533 // for (i = 0; i < N; i+=3) { 2534 // R = Pic[i]; // Member of index 0 2535 // G = Pic[i+1]; // Member of index 1 2536 // B = Pic[i+2]; // Member of index 2 2537 // ... // do something to R, G, B 2538 // } 2539 // To: 2540 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2541 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2542 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2543 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2544 // 2545 // Or translate following interleaved store group (factor = 3): 2546 // for (i = 0; i < N; i+=3) { 2547 // ... do something to R, G, B 2548 // Pic[i] = R; // Member of index 0 2549 // Pic[i+1] = G; // Member of index 1 2550 // Pic[i+2] = B; // Member of index 2 2551 // } 2552 // To: 2553 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2554 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2555 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2556 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2557 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2558 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2559 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2560 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2561 VPValue *BlockInMask) { 2562 Instruction *Instr = Group->getInsertPos(); 2563 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2564 2565 // Prepare for the vector type of the interleaved load/store. 2566 Type *ScalarTy = getLoadStoreType(Instr); 2567 unsigned InterleaveFactor = Group->getFactor(); 2568 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2569 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2570 2571 // Prepare for the new pointers. 2572 SmallVector<Value *, 2> AddrParts; 2573 unsigned Index = Group->getIndex(Instr); 2574 2575 // TODO: extend the masked interleaved-group support to reversed access. 2576 assert((!BlockInMask || !Group->isReverse()) && 2577 "Reversed masked interleave-group not supported."); 2578 2579 // If the group is reverse, adjust the index to refer to the last vector lane 2580 // instead of the first. We adjust the index from the first vector lane, 2581 // rather than directly getting the pointer for lane VF - 1, because the 2582 // pointer operand of the interleaved access is supposed to be uniform. For 2583 // uniform instructions, we're only required to generate a value for the 2584 // first vector lane in each unroll iteration. 2585 if (Group->isReverse()) 2586 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2587 2588 for (unsigned Part = 0; Part < UF; Part++) { 2589 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2590 setDebugLocFromInst(AddrPart); 2591 2592 // Notice current instruction could be any index. Need to adjust the address 2593 // to the member of index 0. 2594 // 2595 // E.g. 
a = A[i+1]; // Member of index 1 (Current instruction) 2596 // b = A[i]; // Member of index 0 2597 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2598 // 2599 // E.g. A[i+1] = a; // Member of index 1 2600 // A[i] = b; // Member of index 0 2601 // A[i+2] = c; // Member of index 2 (Current instruction) 2602 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2603 2604 bool InBounds = false; 2605 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2606 InBounds = gep->isInBounds(); 2607 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2608 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2609 2610 // Cast to the vector pointer type. 2611 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2612 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2613 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2614 } 2615 2616 setDebugLocFromInst(Instr); 2617 Value *PoisonVec = PoisonValue::get(VecTy); 2618 2619 Value *MaskForGaps = nullptr; 2620 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2621 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2622 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2623 } 2624 2625 // Vectorize the interleaved load group. 2626 if (isa<LoadInst>(Instr)) { 2627 // For each unroll part, create a wide load for the group. 2628 SmallVector<Value *, 2> NewLoads; 2629 for (unsigned Part = 0; Part < UF; Part++) { 2630 Instruction *NewLoad; 2631 if (BlockInMask || MaskForGaps) { 2632 assert(useMaskedInterleavedAccesses(*TTI) && 2633 "masked interleaved groups are not allowed."); 2634 Value *GroupMask = MaskForGaps; 2635 if (BlockInMask) { 2636 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2637 Value *ShuffledMask = Builder.CreateShuffleVector( 2638 BlockInMaskPart, 2639 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2640 "interleaved.mask"); 2641 GroupMask = MaskForGaps 2642 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2643 MaskForGaps) 2644 : ShuffledMask; 2645 } 2646 NewLoad = 2647 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2648 GroupMask, PoisonVec, "wide.masked.vec"); 2649 } 2650 else 2651 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2652 Group->getAlign(), "wide.vec"); 2653 Group->addMetadata(NewLoad); 2654 NewLoads.push_back(NewLoad); 2655 } 2656 2657 // For each member in the group, shuffle out the appropriate data from the 2658 // wide loads. 2659 unsigned J = 0; 2660 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2661 Instruction *Member = Group->getMember(I); 2662 2663 // Skip the gaps in the group. 2664 if (!Member) 2665 continue; 2666 2667 auto StrideMask = 2668 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2669 for (unsigned Part = 0; Part < UF; Part++) { 2670 Value *StridedVec = Builder.CreateShuffleVector( 2671 NewLoads[Part], StrideMask, "strided.vec"); 2672 2673 // If this member has different type, cast the result type. 
2674 if (Member->getType() != ScalarTy) { 2675 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2676 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2677 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2678 } 2679 2680 if (Group->isReverse()) 2681 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2682 2683 State.set(VPDefs[J], StridedVec, Part); 2684 } 2685 ++J; 2686 } 2687 return; 2688 } 2689 2690 // The sub vector type for current instruction. 2691 auto *SubVT = VectorType::get(ScalarTy, VF); 2692 2693 // Vectorize the interleaved store group. 2694 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2695 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2696 "masked interleaved groups are not allowed."); 2697 assert((!MaskForGaps || !VF.isScalable()) && 2698 "masking gaps for scalable vectors is not yet supported."); 2699 for (unsigned Part = 0; Part < UF; Part++) { 2700 // Collect the stored vector from each member. 2701 SmallVector<Value *, 4> StoredVecs; 2702 for (unsigned i = 0; i < InterleaveFactor; i++) { 2703 assert((Group->getMember(i) || MaskForGaps) && 2704 "Fail to get a member from an interleaved store group"); 2705 Instruction *Member = Group->getMember(i); 2706 2707 // Skip the gaps in the group. 2708 if (!Member) { 2709 Value *Undef = PoisonValue::get(SubVT); 2710 StoredVecs.push_back(Undef); 2711 continue; 2712 } 2713 2714 Value *StoredVec = State.get(StoredValues[i], Part); 2715 2716 if (Group->isReverse()) 2717 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2718 2719 // If this member has different type, cast it to a unified type. 2720 2721 if (StoredVec->getType() != SubVT) 2722 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2723 2724 StoredVecs.push_back(StoredVec); 2725 } 2726 2727 // Concatenate all vectors into a wide vector. 2728 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2729 2730 // Interleave the elements in the wide vector. 2731 Value *IVec = Builder.CreateShuffleVector( 2732 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2733 "interleaved.vec"); 2734 2735 Instruction *NewStoreInstr; 2736 if (BlockInMask || MaskForGaps) { 2737 Value *GroupMask = MaskForGaps; 2738 if (BlockInMask) { 2739 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2740 Value *ShuffledMask = Builder.CreateShuffleVector( 2741 BlockInMaskPart, 2742 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2743 "interleaved.mask"); 2744 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2745 ShuffledMask, MaskForGaps) 2746 : ShuffledMask; 2747 } 2748 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2749 Group->getAlign(), GroupMask); 2750 } else 2751 NewStoreInstr = 2752 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2753 2754 Group->addMetadata(NewStoreInstr); 2755 } 2756 } 2757 2758 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2759 VPReplicateRecipe *RepRecipe, 2760 const VPIteration &Instance, 2761 bool IfPredicateInstr, 2762 VPTransformState &State) { 2763 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2764 2765 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2766 // the first lane and part. 2767 if (isa<NoAliasScopeDeclInst>(Instr)) 2768 if (!Instance.isFirstIteration()) 2769 return; 2770 2771 // Does this instruction return a value ? 
2772 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2773 2774 Instruction *Cloned = Instr->clone(); 2775 if (!IsVoidRetTy) 2776 Cloned->setName(Instr->getName() + ".cloned"); 2777 2778 // If the scalarized instruction contributes to the address computation of a 2779 // widen masked load/store which was in a basic block that needed predication 2780 // and is not predicated after vectorization, we can't propagate 2781 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2782 // instruction could feed a poison value to the base address of the widen 2783 // load/store. 2784 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 2785 Cloned->dropPoisonGeneratingFlags(); 2786 2787 if (Instr->getDebugLoc()) 2788 setDebugLocFromInst(Instr); 2789 2790 // Replace the operands of the cloned instructions with their scalar 2791 // equivalents in the new loop. 2792 for (auto &I : enumerate(RepRecipe->operands())) { 2793 auto InputInstance = Instance; 2794 VPValue *Operand = I.value(); 2795 VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand); 2796 if (OperandR && OperandR->isUniform()) 2797 InputInstance.Lane = VPLane::getFirstLane(); 2798 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2799 } 2800 addNewMetadata(Cloned, Instr); 2801 2802 // Place the cloned scalar in the new loop. 2803 State.Builder.Insert(Cloned); 2804 2805 State.set(RepRecipe, Cloned, Instance); 2806 2807 // If we just cloned a new assumption, add it the assumption cache. 2808 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2809 AC->registerAssumption(II); 2810 2811 // End if-block. 2812 if (IfPredicateInstr) 2813 PredicatedInstructions.push_back(Cloned); 2814 } 2815 2816 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) { 2817 if (TripCount) 2818 return TripCount; 2819 2820 assert(InsertBlock); 2821 IRBuilder<> Builder(InsertBlock->getTerminator()); 2822 // Find the loop boundaries. 2823 ScalarEvolution *SE = PSE.getSE(); 2824 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2825 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 2826 "Invalid loop count"); 2827 2828 Type *IdxTy = Legal->getWidestInductionType(); 2829 assert(IdxTy && "No type for induction"); 2830 2831 // The exit count might have the type of i64 while the phi is i32. This can 2832 // happen if we have an induction variable that is sign extended before the 2833 // compare. The only way that we get a backedge taken count is that the 2834 // induction variable was signed and as such will not overflow. In such a case 2835 // truncation is legal. 2836 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2837 IdxTy->getPrimitiveSizeInBits()) 2838 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2839 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2840 2841 // Get the total trip count from the count by adding 1. 2842 const SCEV *ExitCount = SE->getAddExpr( 2843 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2844 2845 const DataLayout &DL = InsertBlock->getModule()->getDataLayout(); 2846 2847 // Expand the trip count and place the new instructions in the preheader. 2848 // Notice that the pre-header does not change, only the loop body. 2849 SCEVExpander Exp(*SE, DL, "induction"); 2850 2851 // Count holds the overall loop count (N). 
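  // For intuition only: in a canonical loop `for (i = 0; i < n; ++i)` the
  // backedge-taken count is `n - 1`, so the ExitCount expanded below is
  // `(n - 1) + 1`, i.e. the full trip count N, materialized as IR at the
  // insert point.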
2852 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2853 InsertBlock->getTerminator()); 2854 2855 if (TripCount->getType()->isPointerTy()) 2856 TripCount = 2857 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2858 InsertBlock->getTerminator()); 2859 2860 return TripCount; 2861 } 2862 2863 Value * 2864 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2865 if (VectorTripCount) 2866 return VectorTripCount; 2867 2868 Value *TC = getOrCreateTripCount(InsertBlock); 2869 IRBuilder<> Builder(InsertBlock->getTerminator()); 2870 2871 Type *Ty = TC->getType(); 2872 // This is where we can make the step a runtime constant. 2873 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2874 2875 // If the tail is to be folded by masking, round the number of iterations N 2876 // up to a multiple of Step instead of rounding down. This is done by first 2877 // adding Step-1 and then rounding down. Note that it's ok if this addition 2878 // overflows: the vector induction variable will eventually wrap to zero given 2879 // that it starts at zero and its Step is a power of two; the loop will then 2880 // exit, with the last early-exit vector comparison also producing all-true. 2881 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2882 // is accounted for in emitIterationCountCheck that adds an overflow check. 2883 if (Cost->foldTailByMasking()) { 2884 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2885 "VF*UF must be a power of 2 when folding tail by masking"); 2886 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 2887 TC = Builder.CreateAdd( 2888 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 2889 } 2890 2891 // Now we need to generate the expression for the part of the loop that the 2892 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2893 // iterations are not required for correctness, or N - Step, otherwise. Step 2894 // is equal to the vectorization factor (number of SIMD elements) times the 2895 // unroll factor (number of SIMD instructions). 2896 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2897 2898 // There are cases where we *must* run at least one iteration in the remainder 2899 // loop. See the cost model for when this can happen. If the step evenly 2900 // divides the trip count, we set the remainder to be equal to the step. If 2901 // the step does not evenly divide the trip count, no adjustment is necessary 2902 // since there will already be scalar iterations. Note that the minimum 2903 // iterations check ensures that N >= Step. 2904 if (Cost->requiresScalarEpilogue(VF)) { 2905 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2906 R = Builder.CreateSelect(IsZero, Step, R); 2907 } 2908 2909 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2910 2911 return VectorTripCount; 2912 } 2913 2914 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2915 const DataLayout &DL) { 2916 // Verify that V is a vector type with same number of elements as DstVTy. 
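  // Illustrative example (hypothetical types, assuming 64-bit pointers):
  // <4 x i32> -> <4 x float> is handled by the single bitcast below, while
  // <4 x double> -> <4 x i8*> has no direct cast and takes the two-step path:
  //   <4 x double> --bitcast--> <4 x i64> --inttoptr--> <4 x i8*>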
2917 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 2918 unsigned VF = DstFVTy->getNumElements(); 2919 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 2920 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2921 Type *SrcElemTy = SrcVecTy->getElementType(); 2922 Type *DstElemTy = DstFVTy->getElementType(); 2923 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2924 "Vector elements must have same size"); 2925 2926 // Do a direct cast if element types are castable. 2927 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2928 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2929 } 2930 // V cannot be directly casted to desired vector type. 2931 // May happen when V is a floating point vector but DstVTy is a vector of 2932 // pointers or vice-versa. Handle this using a two-step bitcast using an 2933 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2934 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2935 "Only one type should be a pointer type"); 2936 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2937 "Only one type should be a floating point type"); 2938 Type *IntTy = 2939 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2940 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2941 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2942 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2943 } 2944 2945 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2946 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 2947 // Reuse existing vector loop preheader for TC checks. 2948 // Note that new preheader block is generated for vector loop. 2949 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2950 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2951 2952 // Generate code to check if the loop's trip count is less than VF * UF, or 2953 // equal to it in case a scalar epilogue is required; this implies that the 2954 // vector trip count is zero. This check also covers the case where adding one 2955 // to the backedge-taken count overflowed leading to an incorrect trip count 2956 // of zero. In this case we will also jump to the scalar loop. 2957 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 2958 : ICmpInst::ICMP_ULT; 2959 2960 // If tail is to be folded, vector loop takes care of all iterations. 2961 Type *CountTy = Count->getType(); 2962 Value *CheckMinIters = Builder.getFalse(); 2963 Value *Step = createStepForVF(Builder, CountTy, VF, UF); 2964 if (!Cost->foldTailByMasking()) 2965 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 2966 else if (VF.isScalable()) { 2967 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2968 // an overflow to zero when updating induction variables and so an 2969 // additional overflow check is required before entering the vector loop. 2970 2971 // Get the maximum unsigned value for the type. 2972 Value *MaxUIntTripCount = 2973 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 2974 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 2975 2976 // Don't execute the vector loop if (UMax - n) < (VF * UF). 2977 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step); 2978 } 2979 // Create new preheader for vector loop. 
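  // Rough sketch of the resulting IR (default block names, non-scalable VF):
  // after the split below, the old preheader terminates with
  //   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
  // so trip counts that are too small (or whose +1 overflowed to zero) bypass
  // the vector loop entirely.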
2980 LoopVectorPreHeader = 2981 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2982 "vector.ph"); 2983 2984 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2985 DT->getNode(Bypass)->getIDom()) && 2986 "TC check is expected to dominate Bypass"); 2987 2988 // Update dominator for Bypass & LoopExit (if needed). 2989 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2990 if (!Cost->requiresScalarEpilogue(VF)) 2991 // If there is an epilogue which must run, there's no edge from the 2992 // middle block to exit blocks and thus no need to update the immediate 2993 // dominator of the exit blocks. 2994 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2995 2996 ReplaceInstWithInst( 2997 TCCheckBlock->getTerminator(), 2998 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2999 LoopBypassBlocks.push_back(TCCheckBlock); 3000 } 3001 3002 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 3003 3004 BasicBlock *const SCEVCheckBlock = 3005 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); 3006 if (!SCEVCheckBlock) 3007 return nullptr; 3008 3009 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3010 (OptForSizeBasedOnProfile && 3011 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3012 "Cannot SCEV check stride or overflow when optimizing for size"); 3013 3014 3015 // Update dominator only if this is first RT check. 3016 if (LoopBypassBlocks.empty()) { 3017 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3018 if (!Cost->requiresScalarEpilogue(VF)) 3019 // If there is an epilogue which must run, there's no edge from the 3020 // middle block to exit blocks and thus no need to update the immediate 3021 // dominator of the exit blocks. 3022 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3023 } 3024 3025 LoopBypassBlocks.push_back(SCEVCheckBlock); 3026 AddedSafetyChecks = true; 3027 return SCEVCheckBlock; 3028 } 3029 3030 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { 3031 // VPlan-native path does not do any analysis for runtime checks currently. 3032 if (EnableVPlanNativePath) 3033 return nullptr; 3034 3035 BasicBlock *const MemCheckBlock = 3036 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); 3037 3038 // Check if we generated code that checks in runtime if arrays overlap. We put 3039 // the checks into a separate block to make the more common case of few 3040 // elements faster. 3041 if (!MemCheckBlock) 3042 return nullptr; 3043 3044 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3045 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3046 "Cannot emit memory checks when optimizing for size, unless forced " 3047 "to vectorize."); 3048 ORE->emit([&]() { 3049 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3050 OrigLoop->getStartLoc(), 3051 OrigLoop->getHeader()) 3052 << "Code-size may be reduced by not forcing " 3053 "vectorization, or by source-code modifications " 3054 "eliminating the need for runtime checks " 3055 "(e.g., adding 'restrict')."; 3056 }); 3057 } 3058 3059 LoopBypassBlocks.push_back(MemCheckBlock); 3060 3061 AddedSafetyChecks = true; 3062 3063 // Only use noalias metadata when using memory checks guaranteeing no overlap 3064 // across all iterations. 
3065   if (!Legal->getLAI()->getRuntimePointerChecking()->getDiffChecks()) {
3066     // We currently don't use LoopVersioning for the actual loop cloning but we
3067     // still use it to add the noalias metadata.
3068     LVer = std::make_unique<LoopVersioning>(
3069         *Legal->getLAI(),
3070         Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3071         DT, PSE.getSE());
3072     LVer->prepareNoAliasMetadata();
3073   }
3074   return MemCheckBlock;
3075 }
3076 
3077 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3078   LoopScalarBody = OrigLoop->getHeader();
3079   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3080   assert(LoopVectorPreHeader && "Invalid loop structure");
3081   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3082   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3083          "multiple exit loop without required epilogue?");
3084 
3085   LoopMiddleBlock =
3086       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3087                  LI, nullptr, Twine(Prefix) + "middle.block");
3088   LoopScalarPreHeader =
3089       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3090                  nullptr, Twine(Prefix) + "scalar.ph");
3091 
3092   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3093 
3094   // Set up the middle block terminator. Two cases:
3095   // 1) If we know that we must execute the scalar epilogue, emit an
3096   //    unconditional branch.
3097   // 2) Otherwise, we must have a single unique exit block (due to how we
3098   //    implement the multiple exit case). In this case, set up a conditional
3099   //    branch from the middle block to the loop scalar preheader, and the
3100   //    exit block. completeLoopSkeleton will update the condition to use an
3101   //    iteration check, if required to decide whether to execute the remainder.
3102   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3103       BranchInst::Create(LoopScalarPreHeader) :
3104       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3105                          Builder.getTrue());
3106   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3107   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3108 
3109   // Update dominator for loop exit. During skeleton creation, only the vector
3110   // pre-header and the middle block are created. The vector loop is entirely
3111   // created during VPlan execution.
3112   if (!Cost->requiresScalarEpilogue(VF))
3113     // If there is an epilogue which must run, there's no edge from the
3114     // middle block to exit blocks and thus no need to update the immediate
3115     // dominator of the exit blocks.
3116     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3117 }
3118 
3119 void InnerLoopVectorizer::createInductionResumeValues(
3120     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3121   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3122           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3123          "Inconsistent information about additional bypass.");
3124 
3125   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3126   assert(VectorTripCount && "Expected valid arguments");
3127   // We are going to resume the execution of the scalar loop.
3128   // Go over all of the induction variables that we found and fix the
3129   // PHIs that are left in the scalar version of the loop.
3130   // The starting values of PHI nodes depend on the counter of the last
3131   // iteration in the vectorized loop.
3132   // If we come from a bypass edge then we need to start from the original
3133   // start value.
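  // Rough sketch of the result (illustrative value names; one incoming value
  // per bypass block): for a primary induction starting at 0, the scalar
  // preheader ends up with
  //   %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %bypass.block ], ...
  // so the scalar remainder either resumes after the vector iterations or
  // restarts from the original start value when a bypass was taken.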
3134 Instruction *OldInduction = Legal->getPrimaryInduction(); 3135 for (auto &InductionEntry : Legal->getInductionVars()) { 3136 PHINode *OrigPhi = InductionEntry.first; 3137 InductionDescriptor II = InductionEntry.second; 3138 3139 // Create phi nodes to merge from the backedge-taken check block. 3140 PHINode *BCResumeVal = 3141 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3142 LoopScalarPreHeader->getTerminator()); 3143 // Copy original phi DL over to the new one. 3144 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3145 Value *&EndValue = IVEndValues[OrigPhi]; 3146 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3147 if (OrigPhi == OldInduction) { 3148 // We know what the end value is. 3149 EndValue = VectorTripCount; 3150 } else { 3151 IRBuilder<> B(LoopVectorPreHeader->getTerminator()); 3152 3153 // Fast-math-flags propagate from the original induction instruction. 3154 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3155 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3156 3157 Type *StepType = II.getStep()->getType(); 3158 Instruction::CastOps CastOp = 3159 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3160 Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc"); 3161 Value *Step = 3162 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3163 EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); 3164 EndValue->setName("ind.end"); 3165 3166 // Compute the end value for the additional bypass (if applicable). 3167 if (AdditionalBypass.first) { 3168 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3169 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3170 StepType, true); 3171 Value *Step = 3172 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3173 VTC = 3174 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc"); 3175 EndValueFromAdditionalBypass = 3176 emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); 3177 EndValueFromAdditionalBypass->setName("ind.end"); 3178 } 3179 } 3180 // The new PHI merges the original incoming value, in case of a bypass, 3181 // or the value at the end of the vectorized loop. 3182 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3183 3184 // Fix the scalar body counter (PHI node). 3185 // The old induction's phi node in the scalar body needs the truncated 3186 // value. 3187 for (BasicBlock *BB : LoopBypassBlocks) 3188 BCResumeVal->addIncoming(II.getStartValue(), BB); 3189 3190 if (AdditionalBypass.first) 3191 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3192 EndValueFromAdditionalBypass); 3193 3194 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3195 } 3196 } 3197 3198 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) { 3199 // The trip counts should be cached by now. 3200 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 3201 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3202 3203 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3204 3205 // Add a check in the middle block to see if we have completed 3206 // all of the iterations in the first vector loop. Three cases: 3207 // 1) If we require a scalar epilogue, there is no conditional branch as 3208 // we unconditionally branch to the scalar preheader. Do nothing. 3209 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 
3210 // Thus if tail is to be folded, we know we don't need to run the 3211 // remainder and we can use the previous value for the condition (true). 3212 // 3) Otherwise, construct a runtime check. 3213 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3214 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3215 Count, VectorTripCount, "cmp.n", 3216 LoopMiddleBlock->getTerminator()); 3217 3218 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3219 // of the corresponding compare because they may have ended up with 3220 // different line numbers and we want to avoid awkward line stepping while 3221 // debugging. Eg. if the compare has got a line number inside the loop. 3222 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3223 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3224 } 3225 3226 #ifdef EXPENSIVE_CHECKS 3227 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3228 #endif 3229 3230 return LoopVectorPreHeader; 3231 } 3232 3233 std::pair<BasicBlock *, Value *> 3234 InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3235 /* 3236 In this function we generate a new loop. The new loop will contain 3237 the vectorized instructions while the old loop will continue to run the 3238 scalar remainder. 3239 3240 [ ] <-- loop iteration number check. 3241 / | 3242 / v 3243 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3244 | / | 3245 | / v 3246 || [ ] <-- vector pre header. 3247 |/ | 3248 | v 3249 | [ ] \ 3250 | [ ]_| <-- vector loop (created during VPlan execution). 3251 | | 3252 | v 3253 \ -[ ] <--- middle-block. 3254 \/ | 3255 /\ v 3256 | ->[ ] <--- new preheader. 3257 | | 3258 (opt) v <-- edge from middle to exit iff epilogue is not required. 3259 | [ ] \ 3260 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3261 \ | 3262 \ v 3263 >[ ] <-- exit block(s). 3264 ... 3265 */ 3266 3267 // Get the metadata of the original loop before it gets modified. 3268 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3269 3270 // Workaround! Compute the trip count of the original loop and cache it 3271 // before we start modifying the CFG. This code has a systemic problem 3272 // wherein it tries to run analysis over partially constructed IR; this is 3273 // wrong, and not simply for SCEV. The trip count of the original loop 3274 // simply happens to be prone to hitting this in practice. In theory, we 3275 // can hit the same issue for any SCEV, or ValueTracking query done during 3276 // mutation. See PR49900. 3277 getOrCreateTripCount(OrigLoop->getLoopPreheader()); 3278 3279 // Create an empty vector loop, and prepare basic blocks for the runtime 3280 // checks. 3281 createVectorLoopSkeleton(""); 3282 3283 // Now, compare the new count to zero. If it is zero skip the vector loop and 3284 // jump to the scalar loop. This check also covers the case where the 3285 // backedge-taken count is uint##_max: adding one to it will overflow leading 3286 // to an incorrect trip count of zero. In this (rare) case we will also jump 3287 // to the scalar loop. 3288 emitIterationCountCheck(LoopScalarPreHeader); 3289 3290 // Generate the code to check any assumptions that we've made for SCEV 3291 // expressions. 3292 emitSCEVChecks(LoopScalarPreHeader); 3293 3294 // Generate the code that checks in runtime if arrays overlap. We put the 3295 // checks into a separate block to make the more common case of few elements 3296 // faster. 
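  // For intuition only: for two pointers A and B accessed in the loop, the
  // emitted check is conceptually of the form
  //   (A + TC * Size <= B) || (B + TC * Size <= A)
  // and a failing check branches to the scalar loop via the bypass block.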
3297 emitMemRuntimeChecks(LoopScalarPreHeader); 3298 3299 // Emit phis for the new starting index of the scalar loop. 3300 createInductionResumeValues(); 3301 3302 return {completeLoopSkeleton(OrigLoopID), nullptr}; 3303 } 3304 3305 // Fix up external users of the induction variable. At this point, we are 3306 // in LCSSA form, with all external PHIs that use the IV having one input value, 3307 // coming from the remainder loop. We need those PHIs to also have a correct 3308 // value for the IV when arriving directly from the middle block. 3309 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3310 const InductionDescriptor &II, 3311 Value *VectorTripCount, Value *EndValue, 3312 BasicBlock *MiddleBlock, 3313 BasicBlock *VectorHeader, VPlan &Plan) { 3314 // There are two kinds of external IV usages - those that use the value 3315 // computed in the last iteration (the PHI) and those that use the penultimate 3316 // value (the value that feeds into the phi from the loop latch). 3317 // We allow both, but they, obviously, have different values. 3318 3319 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3320 3321 DenseMap<Value *, Value *> MissingVals; 3322 3323 // An external user of the last iteration's value should see the value that 3324 // the remainder loop uses to initialize its own IV. 3325 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3326 for (User *U : PostInc->users()) { 3327 Instruction *UI = cast<Instruction>(U); 3328 if (!OrigLoop->contains(UI)) { 3329 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3330 MissingVals[UI] = EndValue; 3331 } 3332 } 3333 3334 // An external user of the penultimate value need to see EndValue - Step. 3335 // The simplest way to get this is to recompute it from the constituent SCEVs, 3336 // that is Start + (Step * (CRD - 1)). 3337 for (User *U : OrigPhi->users()) { 3338 auto *UI = cast<Instruction>(U); 3339 if (!OrigLoop->contains(UI)) { 3340 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3341 3342 IRBuilder<> B(MiddleBlock->getTerminator()); 3343 3344 // Fast-math-flags propagate from the original induction instruction. 3345 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3346 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3347 3348 Value *CountMinusOne = B.CreateSub( 3349 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); 3350 Value *CMO = 3351 !II.getStep()->getType()->isIntegerTy() 3352 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3353 II.getStep()->getType()) 3354 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3355 CMO->setName("cast.cmo"); 3356 3357 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 3358 VectorHeader->getTerminator()); 3359 Value *Escape = 3360 emitTransformedIndex(B, CMO, II.getStartValue(), Step, II); 3361 Escape->setName("ind.escape"); 3362 MissingVals[UI] = Escape; 3363 } 3364 } 3365 3366 for (auto &I : MissingVals) { 3367 PHINode *PHI = cast<PHINode>(I.first); 3368 // One corner case we have to handle is two IVs "chasing" each-other, 3369 // that is %IV2 = phi [...], [ %IV1, %latch ] 3370 // In this case, if IV1 has an external use, we need to avoid adding both 3371 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3372 // don't already have an incoming value for the middle block. 
3373 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) { 3374 PHI->addIncoming(I.second, MiddleBlock); 3375 Plan.removeLiveOut(PHI); 3376 } 3377 } 3378 } 3379 3380 namespace { 3381 3382 struct CSEDenseMapInfo { 3383 static bool canHandle(const Instruction *I) { 3384 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3385 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3386 } 3387 3388 static inline Instruction *getEmptyKey() { 3389 return DenseMapInfo<Instruction *>::getEmptyKey(); 3390 } 3391 3392 static inline Instruction *getTombstoneKey() { 3393 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3394 } 3395 3396 static unsigned getHashValue(const Instruction *I) { 3397 assert(canHandle(I) && "Unknown instruction!"); 3398 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3399 I->value_op_end())); 3400 } 3401 3402 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3403 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3404 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3405 return LHS == RHS; 3406 return LHS->isIdenticalTo(RHS); 3407 } 3408 }; 3409 3410 } // end anonymous namespace 3411 3412 ///Perform cse of induction variable instructions. 3413 static void cse(BasicBlock *BB) { 3414 // Perform simple cse. 3415 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3416 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3417 if (!CSEDenseMapInfo::canHandle(&In)) 3418 continue; 3419 3420 // Check if we can replace this instruction with any of the 3421 // visited instructions. 3422 if (Instruction *V = CSEMap.lookup(&In)) { 3423 In.replaceAllUsesWith(V); 3424 In.eraseFromParent(); 3425 continue; 3426 } 3427 3428 CSEMap[&In] = &In; 3429 } 3430 } 3431 3432 InstructionCost 3433 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3434 bool &NeedToScalarize) const { 3435 Function *F = CI->getCalledFunction(); 3436 Type *ScalarRetTy = CI->getType(); 3437 SmallVector<Type *, 4> Tys, ScalarTys; 3438 for (auto &ArgOp : CI->args()) 3439 ScalarTys.push_back(ArgOp->getType()); 3440 3441 // Estimate cost of scalarized vector call. The source operands are assumed 3442 // to be vectors, so we need to extract individual elements from there, 3443 // execute VF scalar calls, and then gather the result into the vector return 3444 // value. 3445 InstructionCost ScalarCallCost = 3446 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3447 if (VF.isScalar()) 3448 return ScalarCallCost; 3449 3450 // Compute corresponding vector type for return value and arguments. 3451 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3452 for (Type *ScalarTy : ScalarTys) 3453 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3454 3455 // Compute costs of unpacking argument values for the scalar calls and 3456 // packing the return values to a vector. 3457 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3458 3459 InstructionCost Cost = 3460 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3461 3462 // If we can't emit a vector call for this function, then the currently found 3463 // cost is the cost we need to return. 3464 NeedToScalarize = true; 3465 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3466 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3467 3468 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3469 return Cost; 3470 3471 // If the corresponding vector cost is cheaper, return its cost. 
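  // Worked example with made-up numbers (VF = 4): if the scalar call costs 10,
  // the scalarization overhead is 12, and a vector library variant costs 20,
  // the scalarized estimate is 10 * 4 + 12 = 52, so the variant below wins and
  // NeedToScalarize is cleared.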
3472 InstructionCost VectorCallCost = 3473 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3474 if (VectorCallCost < Cost) { 3475 NeedToScalarize = false; 3476 Cost = VectorCallCost; 3477 } 3478 return Cost; 3479 } 3480 3481 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3482 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3483 return Elt; 3484 return VectorType::get(Elt, VF); 3485 } 3486 3487 InstructionCost 3488 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3489 ElementCount VF) const { 3490 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3491 assert(ID && "Expected intrinsic call!"); 3492 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3493 FastMathFlags FMF; 3494 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3495 FMF = FPMO->getFastMathFlags(); 3496 3497 SmallVector<const Value *> Arguments(CI->args()); 3498 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3499 SmallVector<Type *> ParamTys; 3500 std::transform(FTy->param_begin(), FTy->param_end(), 3501 std::back_inserter(ParamTys), 3502 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3503 3504 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3505 dyn_cast<IntrinsicInst>(CI)); 3506 return TTI.getIntrinsicInstrCost(CostAttrs, 3507 TargetTransformInfo::TCK_RecipThroughput); 3508 } 3509 3510 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3511 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3512 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3513 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3514 } 3515 3516 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3517 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3518 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3519 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3520 } 3521 3522 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3523 // For every instruction `I` in MinBWs, truncate the operands, create a 3524 // truncated version of `I` and reextend its result. InstCombine runs 3525 // later and will remove any ext/trunc pairs. 3526 SmallPtrSet<Value *, 4> Erased; 3527 for (const auto &KV : Cost->getMinimalBitwidths()) { 3528 // If the value wasn't vectorized, we must maintain the original scalar 3529 // type. The absence of the value from State indicates that it 3530 // wasn't vectorized. 3531 // FIXME: Should not rely on getVPValue at this point. 3532 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3533 if (!State.hasAnyVectorValue(Def)) 3534 continue; 3535 for (unsigned Part = 0; Part < UF; ++Part) { 3536 Value *I = State.get(Def, Part); 3537 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3538 continue; 3539 Type *OriginalTy = I->getType(); 3540 Type *ScalarTruncatedTy = 3541 IntegerType::get(OriginalTy->getContext(), KV.second); 3542 auto *TruncatedTy = VectorType::get( 3543 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3544 if (TruncatedTy == OriginalTy) 3545 continue; 3546 3547 IRBuilder<> B(cast<Instruction>(I)); 3548 auto ShrinkOperand = [&](Value *V) -> Value * { 3549 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3550 if (ZI->getSrcTy() == TruncatedTy) 3551 return ZI->getOperand(0); 3552 return B.CreateZExtOrTrunc(V, TruncatedTy); 3553 }; 3554 3555 // The actual instruction modification depends on the instruction type, 3556 // unfortunately. 
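      // Rough example (assuming MinBWs mapped I to 8 bits, VF = 4, UF = 1):
      //   %a = add <4 x i32> %x, %y
      // becomes
      //   %x.tr = trunc <4 x i32> %x to <4 x i8>
      //   %y.tr = trunc <4 x i32> %y to <4 x i8>
      //   %a.tr = add <4 x i8> %x.tr, %y.tr
      //   %a    = zext <4 x i8> %a.tr to <4 x i32>
      // and InstCombine later removes any ext/trunc pairs this leaves behind.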
3557 Value *NewI = nullptr; 3558 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3559 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3560 ShrinkOperand(BO->getOperand(1))); 3561 3562 // Any wrapping introduced by shrinking this operation shouldn't be 3563 // considered undefined behavior. So, we can't unconditionally copy 3564 // arithmetic wrapping flags to NewI. 3565 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3566 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3567 NewI = 3568 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3569 ShrinkOperand(CI->getOperand(1))); 3570 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3571 NewI = B.CreateSelect(SI->getCondition(), 3572 ShrinkOperand(SI->getTrueValue()), 3573 ShrinkOperand(SI->getFalseValue())); 3574 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3575 switch (CI->getOpcode()) { 3576 default: 3577 llvm_unreachable("Unhandled cast!"); 3578 case Instruction::Trunc: 3579 NewI = ShrinkOperand(CI->getOperand(0)); 3580 break; 3581 case Instruction::SExt: 3582 NewI = B.CreateSExtOrTrunc( 3583 CI->getOperand(0), 3584 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3585 break; 3586 case Instruction::ZExt: 3587 NewI = B.CreateZExtOrTrunc( 3588 CI->getOperand(0), 3589 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3590 break; 3591 } 3592 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3593 auto Elements0 = 3594 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3595 auto *O0 = B.CreateZExtOrTrunc( 3596 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3597 auto Elements1 = 3598 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3599 auto *O1 = B.CreateZExtOrTrunc( 3600 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3601 3602 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3603 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3604 // Don't do anything with the operands, just extend the result. 3605 continue; 3606 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3607 auto Elements = 3608 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3609 auto *O0 = B.CreateZExtOrTrunc( 3610 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3611 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3612 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3613 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3614 auto Elements = 3615 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3616 auto *O0 = B.CreateZExtOrTrunc( 3617 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3618 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3619 } else { 3620 // If we don't know what to do, be conservative and don't do anything. 3621 continue; 3622 } 3623 3624 // Lastly, extend the result. 3625 NewI->takeName(cast<Instruction>(I)); 3626 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3627 I->replaceAllUsesWith(Res); 3628 cast<Instruction>(I)->eraseFromParent(); 3629 Erased.insert(I); 3630 State.reset(Def, Res, Part); 3631 } 3632 } 3633 3634 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3635 for (const auto &KV : Cost->getMinimalBitwidths()) { 3636 // If the value wasn't vectorized, we must maintain the original scalar 3637 // type. The absence of the value from State indicates that it 3638 // wasn't vectorized. 3639 // FIXME: Should not rely on getVPValue at this point. 
3640 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3641 if (!State.hasAnyVectorValue(Def)) 3642 continue; 3643 for (unsigned Part = 0; Part < UF; ++Part) { 3644 Value *I = State.get(Def, Part); 3645 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3646 if (Inst && Inst->use_empty()) { 3647 Value *NewI = Inst->getOperand(0); 3648 Inst->eraseFromParent(); 3649 State.reset(Def, NewI, Part); 3650 } 3651 } 3652 } 3653 } 3654 3655 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, 3656 VPlan &Plan) { 3657 // Insert truncates and extends for any truncated instructions as hints to 3658 // InstCombine. 3659 if (VF.isVector()) 3660 truncateToMinimalBitwidths(State); 3661 3662 // Fix widened non-induction PHIs by setting up the PHI operands. 3663 if (EnableVPlanNativePath) 3664 fixNonInductionPHIs(Plan, State); 3665 3666 // At this point every instruction in the original loop is widened to a 3667 // vector form. Now we need to fix the recurrences in the loop. These PHI 3668 // nodes are currently empty because we did not want to introduce cycles. 3669 // This is the second stage of vectorizing recurrences. 3670 fixCrossIterationPHIs(State); 3671 3672 // Forget the original basic block. 3673 PSE.getSE()->forgetLoop(OrigLoop); 3674 3675 VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); 3676 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); 3677 if (Cost->requiresScalarEpilogue(VF)) { 3678 // No edge from the middle block to the unique exit block has been inserted 3679 // and there is nothing to fix from vector loop; phis should have incoming 3680 // from scalar loop only. 3681 Plan.clearLiveOuts(); 3682 } else { 3683 // If we inserted an edge from the middle block to the unique exit block, 3684 // update uses outside the loop (phis) to account for the newly inserted 3685 // edge. 3686 3687 // Fix-up external users of the induction variables. 3688 for (auto &Entry : Legal->getInductionVars()) 3689 fixupIVUsers(Entry.first, Entry.second, 3690 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), 3691 IVEndValues[Entry.first], LoopMiddleBlock, 3692 VectorLoop->getHeader(), Plan); 3693 } 3694 3695 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated 3696 // in the exit block, so update the builder. 3697 State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI()); 3698 for (auto &KV : Plan.getLiveOuts()) 3699 KV.second->fixPhi(Plan, State); 3700 3701 for (Instruction *PI : PredicatedInstructions) 3702 sinkScalarOperands(&*PI); 3703 3704 // Remove redundant induction instructions. 3705 cse(VectorLoop->getHeader()); 3706 3707 // Set/update profile weights for the vector and remainder loops as original 3708 // loop iterations are now distributed among them. Note that original loop 3709 // represented by LoopScalarBody becomes remainder loop after vectorization. 3710 // 3711 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3712 // end up getting slightly roughened result but that should be OK since 3713 // profile is not inherently precise anyway. Note also possible bypass of 3714 // vector code caused by legality checks is ignored, assigning all the weight 3715 // to the vector loop, optimistically. 3716 // 3717 // For scalable vectorization we can't know at compile time how many iterations 3718 // of the loop are handled in one vector iteration, so instead assume a pessimistic 3719 // vscale of '1'. 
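  // For intuition only (made-up numbers): if the profile says the original
  // loop ran ~100 iterations on average and VF * UF == 8, the vector loop is
  // credited with roughly 100 / 8 = 12 iterations and the scalar remainder
  // with the few iterations left over.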
3720 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop, 3721 LI->getLoopFor(LoopScalarBody), 3722 VF.getKnownMinValue() * UF); 3723 } 3724 3725 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 3726 // In order to support recurrences we need to be able to vectorize Phi nodes. 3727 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3728 // stage #2: We now need to fix the recurrences by adding incoming edges to 3729 // the currently empty PHI nodes. At this point every instruction in the 3730 // original loop is widened to a vector form so we can use them to construct 3731 // the incoming edges. 3732 VPBasicBlock *Header = 3733 State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); 3734 for (VPRecipeBase &R : Header->phis()) { 3735 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 3736 fixReduction(ReductionPhi, State); 3737 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 3738 fixFirstOrderRecurrence(FOR, State); 3739 } 3740 } 3741 3742 void InnerLoopVectorizer::fixFirstOrderRecurrence( 3743 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { 3744 // This is the second phase of vectorizing first-order recurrences. An 3745 // overview of the transformation is described below. Suppose we have the 3746 // following loop. 3747 // 3748 // for (int i = 0; i < n; ++i) 3749 // b[i] = a[i] - a[i - 1]; 3750 // 3751 // There is a first-order recurrence on "a". For this loop, the shorthand 3752 // scalar IR looks like: 3753 // 3754 // scalar.ph: 3755 // s_init = a[-1] 3756 // br scalar.body 3757 // 3758 // scalar.body: 3759 // i = phi [0, scalar.ph], [i+1, scalar.body] 3760 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3761 // s2 = a[i] 3762 // b[i] = s2 - s1 3763 // br cond, scalar.body, ... 3764 // 3765 // In this example, s1 is a recurrence because it's value depends on the 3766 // previous iteration. In the first phase of vectorization, we created a 3767 // vector phi v1 for s1. We now complete the vectorization and produce the 3768 // shorthand vector IR shown below (for VF = 4, UF = 1). 3769 // 3770 // vector.ph: 3771 // v_init = vector(..., ..., ..., a[-1]) 3772 // br vector.body 3773 // 3774 // vector.body 3775 // i = phi [0, vector.ph], [i+4, vector.body] 3776 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3777 // v2 = a[i, i+1, i+2, i+3]; 3778 // v3 = vector(v1(3), v2(0, 1, 2)) 3779 // b[i, i+1, i+2, i+3] = v2 - v3 3780 // br cond, vector.body, middle.block 3781 // 3782 // middle.block: 3783 // x = v2(3) 3784 // br scalar.ph 3785 // 3786 // scalar.ph: 3787 // s_init = phi [x, middle.block], [a[-1], otherwise] 3788 // br scalar.body 3789 // 3790 // After execution completes the vector loop, we extract the next value of 3791 // the recurrence (x) to use as the initial value in the scalar loop. 3792 3793 // Extract the last vector element in the middle block. This will be the 3794 // initial value for the recurrence when jumping to the scalar loop. 
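  // Continuing the VF = 4, UF = 1 sketch above (element type chosen for
  // illustration), the middle block gets
  //   %vector.recur.extract = extractelement <4 x i32> %v2, i32 3
  // for fixed-width VF; for scalable VF the last-lane index is computed at
  // runtime as RuntimeVF - 1 below.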
3795   VPValue *PreviousDef = PhiR->getBackedgeValue();
3796   Value *Incoming = State.get(PreviousDef, UF - 1);
3797   auto *ExtractForScalar = Incoming;
3798   auto *IdxTy = Builder.getInt32Ty();
3799   if (VF.isVector()) {
3800     auto *One = ConstantInt::get(IdxTy, 1);
3801     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3802     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3803     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3804     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3805                                                     "vector.recur.extract");
3806   }
3807   // Extract the second last element in the middle block if the
3808   // Phi is used outside the loop. We need to extract the phi itself
3809   // and not the last element (the phi update in the current iteration). This
3810   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3811   // when the scalar loop is not run at all.
3812   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3813   if (VF.isVector()) {
3814     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3815     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3816     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3817         Incoming, Idx, "vector.recur.extract.for.phi");
3818   } else if (UF > 1)
3819     // When the loop is unrolled without vectorizing, initialize
3820     // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
3821     // value of `Incoming`. This is analogous to the vectorized case above:
3822     // extracting the second last element when VF > 1.
3823     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3824 
3825   // Fix the initial value of the original recurrence in the scalar loop.
3826   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3827   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3828   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3829   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3830   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3831     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3832     Start->addIncoming(Incoming, BB);
3833   }
3834 
3835   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3836   Phi->setName("scalar.recur");
3837 
3838   // Finally, fix users of the recurrence outside the loop. The users will need
3839   // either the last value of the scalar recurrence or the last value of the
3840   // vector recurrence we extracted in the middle block. Since the loop is in
3841   // LCSSA form, we just need to find all the phi nodes for the original scalar
3842   // recurrence in the exit block, and then add an edge for the middle block.
3843   // Note that LCSSA does not imply single entry when the original scalar loop
3844   // had multiple exiting edges (as we always run the last iteration in the
3845   // scalar epilogue); in that case, there is no edge from middle to exit
3846   // and thus no phis which need to be updated.
3847   if (!Cost->requiresScalarEpilogue(VF))
3848     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3849       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3850         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3851         State.Plan->removeLiveOut(&LCSSAPhi);
3852       }
3853 }
3854 
3855 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3856                                        VPTransformState &State) {
3857   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
3858   // Get its reduction variable descriptor.
3859 assert(Legal->isReductionVariable(OrigPhi) && 3860 "Unable to find the reduction variable"); 3861 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 3862 3863 RecurKind RK = RdxDesc.getRecurrenceKind(); 3864 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3865 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3866 setDebugLocFromInst(ReductionStartValue); 3867 3868 VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); 3869 // This is the vector-clone of the value that leaves the loop. 3870 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 3871 3872 // Wrap flags are in general invalid after vectorization, clear them. 3873 clearReductionWrapFlags(PhiR, State); 3874 3875 // Before each round, move the insertion point right between 3876 // the PHIs and the values we are going to write. 3877 // This allows us to write both PHINodes and the extractelement 3878 // instructions. 3879 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3880 3881 setDebugLocFromInst(LoopExitInst); 3882 3883 Type *PhiTy = OrigPhi->getType(); 3884 3885 VPBasicBlock *LatchVPBB = 3886 PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock(); 3887 BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB]; 3888 // If tail is folded by masking, the vector value to leave the loop should be 3889 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3890 // instead of the former. For an inloop reduction the reduction will already 3891 // be predicated, and does not need to be handled here. 3892 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 3893 for (unsigned Part = 0; Part < UF; ++Part) { 3894 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 3895 SelectInst *Sel = nullptr; 3896 for (User *U : VecLoopExitInst->users()) { 3897 if (isa<SelectInst>(U)) { 3898 assert(!Sel && "Reduction exit feeding two selects"); 3899 Sel = cast<SelectInst>(U); 3900 } else 3901 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3902 } 3903 assert(Sel && "Reduction exit feeds no select"); 3904 State.reset(LoopExitInstDef, Sel, Part); 3905 3906 if (isa<FPMathOperator>(Sel)) 3907 Sel->setFastMathFlags(RdxDesc.getFastMathFlags()); 3908 3909 // If the target can create a predicated operator for the reduction at no 3910 // extra cost in the loop (for example a predicated vadd), it can be 3911 // cheaper for the select to remain in the loop than be sunk out of it, 3912 // and so use the select value for the phi instead of the old 3913 // LoopExitValue. 3914 if (PreferPredicatedReductionSelect || 3915 TTI->preferPredicatedReductionSelect( 3916 RdxDesc.getOpcode(), PhiTy, 3917 TargetTransformInfo::ReductionFlags())) { 3918 auto *VecRdxPhi = 3919 cast<PHINode>(State.get(PhiR, Part)); 3920 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel); 3921 } 3922 } 3923 } 3924 3925 // If the vector reduction can be performed in a smaller type, we truncate 3926 // then extend the loop exit value to enable InstCombine to evaluate the 3927 // entire expression in the smaller type. 
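  // Rough example (hypothetical reduction, VF = 4): an add reduction over i8
  // data that was widened to an i32 phi has a recurrence type of i8, so each
  // unrolled part %rdx is rewritten as
  //   %t = trunc <4 x i32> %rdx to <4 x i8>
  //   %e = zext <4 x i8> %t to <4 x i32>   ; sext if the reduction is signed
  // and %rdx's other users are pointed at %e, letting InstCombine evaluate
  // the whole chain in i8.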
3928 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 3929 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 3930 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3931 Builder.SetInsertPoint(VectorLoopLatch->getTerminator()); 3932 VectorParts RdxParts(UF); 3933 for (unsigned Part = 0; Part < UF; ++Part) { 3934 RdxParts[Part] = State.get(LoopExitInstDef, Part); 3935 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3936 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3937 : Builder.CreateZExt(Trunc, VecTy); 3938 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 3939 if (U != Trunc) { 3940 U->replaceUsesOfWith(RdxParts[Part], Extnd); 3941 RdxParts[Part] = Extnd; 3942 } 3943 } 3944 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3945 for (unsigned Part = 0; Part < UF; ++Part) { 3946 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3947 State.reset(LoopExitInstDef, RdxParts[Part], Part); 3948 } 3949 } 3950 3951 // Reduce all of the unrolled parts into a single vector. 3952 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 3953 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 3954 3955 // The middle block terminator has already been assigned a DebugLoc here (the 3956 // OrigLoop's single latch terminator). We want the whole middle block to 3957 // appear to execute on this line because: (a) it is all compiler generated, 3958 // (b) these instructions are always executed after evaluating the latch 3959 // conditional branch, and (c) other passes may add new predecessors which 3960 // terminate on this line. This is the easiest way to ensure we don't 3961 // accidentally cause an extra step back into the loop while debugging. 3962 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 3963 if (PhiR->isOrdered()) 3964 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 3965 else { 3966 // Floating-point operations should have some FMF to enable the reduction. 3967 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 3968 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 3969 for (unsigned Part = 1; Part < UF; ++Part) { 3970 Value *RdxPart = State.get(LoopExitInstDef, Part); 3971 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 3972 ReducedPartRdx = Builder.CreateBinOp( 3973 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 3974 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 3975 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 3976 ReducedPartRdx, RdxPart); 3977 else 3978 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 3979 } 3980 } 3981 3982 // Create the reduction after the loop. Note that inloop reductions create the 3983 // target reduction in the loop using a Reduction recipe. 3984 if (VF.isVector() && !PhiR->isInLoop()) { 3985 ReducedPartRdx = 3986 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 3987 // If the reduction can be performed in a smaller type, we need to extend 3988 // the reduction to the wider type before we branch to the original loop. 3989 if (PhiTy != RdxDesc.getRecurrenceType()) 3990 ReducedPartRdx = RdxDesc.isSigned() 3991 ? 
Builder.CreateSExt(ReducedPartRdx, PhiTy)
3992 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
3993 }
3994
3995 PHINode *ResumePhi =
3996 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
3997
3998 // Create a phi node that merges control-flow from the backedge-taken check
3999 // block and the middle block.
4000 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4001 LoopScalarPreHeader->getTerminator());
4002
4003 // If we are fixing reductions in the epilogue loop then we should already
4004 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
4005 // we carry over the incoming values correctly.
4006 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4007 if (Incoming == LoopMiddleBlock)
4008 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4009 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4010 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4011 Incoming);
4012 else
4013 BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4014 }
4015
4016 // Set the resume value for this reduction.
4017 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4018
4019 // If there were stores of the reduction value to a uniform memory address
4020 // inside the loop, create the final store here.
4021 if (StoreInst *SI = RdxDesc.IntermediateStore) {
4022 StoreInst *NewSI =
4023 Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
4024 propagateMetadata(NewSI, SI);
4025
4026 // If the reduction value is used in other places,
4027 // then let the code below create PHIs for that.
4028 }
4029
4030 // Now, we need to fix the users of the reduction variable
4031 // inside and outside of the scalar remainder loop.
4032
4033 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4034 // in the exit blocks. See comment on analogous loop in
4035 // fixFirstOrderRecurrence for a more complete explanation of the logic.
4036 if (!Cost->requiresScalarEpilogue(VF))
4037 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4038 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
4039 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4040 State.Plan->removeLiveOut(&LCSSAPhi);
4041 }
4042
4043 // Fix the scalar loop reduction variable with the incoming reduction sum
4044 // from the vector body and from the backedge value.
4045 int IncomingEdgeBlockIdx =
4046 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4047 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4048 // Pick the other block.
4049 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ?
0 : 1); 4050 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4051 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4052 } 4053 4054 void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, 4055 VPTransformState &State) { 4056 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 4057 RecurKind RK = RdxDesc.getRecurrenceKind(); 4058 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4059 return; 4060 4061 SmallVector<VPValue *, 8> Worklist; 4062 SmallPtrSet<VPValue *, 8> Visited; 4063 Worklist.push_back(PhiR); 4064 Visited.insert(PhiR); 4065 4066 while (!Worklist.empty()) { 4067 VPValue *Cur = Worklist.pop_back_val(); 4068 for (unsigned Part = 0; Part < UF; ++Part) { 4069 Value *V = State.get(Cur, Part); 4070 if (!isa<OverflowingBinaryOperator>(V)) 4071 break; 4072 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4073 } 4074 4075 for (VPUser *U : Cur->users()) { 4076 auto *UserRecipe = dyn_cast<VPRecipeBase>(U); 4077 if (!UserRecipe) 4078 continue; 4079 for (VPValue *V : UserRecipe->definedValues()) 4080 if (Visited.insert(V).second) 4081 Worklist.push_back(V); 4082 } 4083 } 4084 } 4085 4086 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4087 // The basic block and loop containing the predicated instruction. 4088 auto *PredBB = PredInst->getParent(); 4089 auto *VectorLoop = LI->getLoopFor(PredBB); 4090 4091 // Initialize a worklist with the operands of the predicated instruction. 4092 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4093 4094 // Holds instructions that we need to analyze again. An instruction may be 4095 // reanalyzed if we don't yet know if we can sink it or not. 4096 SmallVector<Instruction *, 8> InstsToReanalyze; 4097 4098 // Returns true if a given use occurs in the predicated block. Phi nodes use 4099 // their operands in their corresponding predecessor blocks. 4100 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4101 auto *I = cast<Instruction>(U.getUser()); 4102 BasicBlock *BB = I->getParent(); 4103 if (auto *Phi = dyn_cast<PHINode>(I)) 4104 BB = Phi->getIncomingBlock( 4105 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4106 return BB == PredBB; 4107 }; 4108 4109 // Iteratively sink the scalarized operands of the predicated instruction 4110 // into the block we created for it. When an instruction is sunk, it's 4111 // operands are then added to the worklist. The algorithm ends after one pass 4112 // through the worklist doesn't sink a single instruction. 4113 bool Changed; 4114 do { 4115 // Add the instructions that need to be reanalyzed to the worklist, and 4116 // reset the changed indicator. 4117 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4118 InstsToReanalyze.clear(); 4119 Changed = false; 4120 4121 while (!Worklist.empty()) { 4122 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4123 4124 // We can't sink an instruction if it is a phi node, is not in the loop, 4125 // or may have side effects. 4126 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4127 I->mayHaveSideEffects()) 4128 continue; 4129 4130 // If the instruction is already in PredBB, check if we can sink its 4131 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4132 // sinking the scalar instruction I, hence it appears in PredBB; but it 4133 // may have failed to sink I's operands (recursively), which we try 4134 // (again) here. 
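// Hedged illustration (hypothetical names): if VPlan already sank a
// scalarized address computation such as
//   %gep = getelementptr inbounds i32, ptr %base, i64 %idx
// into PredBB but left its index computation %idx = add i64 %iv, 1 behind,
// %gep is found below to already reside in PredBB and %idx is re-queued so a
// later pass over the worklist can try to sink it as well.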
4135 if (I->getParent() == PredBB) { 4136 Worklist.insert(I->op_begin(), I->op_end()); 4137 continue; 4138 } 4139 4140 // It's legal to sink the instruction if all its uses occur in the 4141 // predicated block. Otherwise, there's nothing to do yet, and we may 4142 // need to reanalyze the instruction. 4143 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4144 InstsToReanalyze.push_back(I); 4145 continue; 4146 } 4147 4148 // Move the instruction to the beginning of the predicated block, and add 4149 // it's operands to the worklist. 4150 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4151 Worklist.insert(I->op_begin(), I->op_end()); 4152 4153 // The sinking may have enabled other instructions to be sunk, so we will 4154 // need to iterate. 4155 Changed = true; 4156 } 4157 } while (Changed); 4158 } 4159 4160 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, 4161 VPTransformState &State) { 4162 auto Iter = depth_first( 4163 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry())); 4164 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 4165 for (VPRecipeBase &P : VPBB->phis()) { 4166 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P); 4167 if (!VPPhi) 4168 continue; 4169 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4170 // Make sure the builder has a valid insert point. 4171 Builder.SetInsertPoint(NewPhi); 4172 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4173 VPValue *Inc = VPPhi->getIncomingValue(i); 4174 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4175 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4176 } 4177 } 4178 } 4179 } 4180 4181 bool InnerLoopVectorizer::useOrderedReductions( 4182 const RecurrenceDescriptor &RdxDesc) { 4183 return Cost->useOrderedReductions(RdxDesc); 4184 } 4185 4186 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4187 VPWidenPHIRecipe *PhiR, 4188 VPTransformState &State) { 4189 assert(EnableVPlanNativePath && 4190 "Non-native vplans are not expected to have VPWidenPHIRecipes."); 4191 // Currently we enter here in the VPlan-native path for non-induction 4192 // PHIs where all control flow is uniform. We simply widen these PHIs. 4193 // Create a vector phi with no operands - the vector phi operands will be 4194 // set at the end of vector code generation. 4195 Type *VecTy = (State.VF.isScalar()) 4196 ? PN->getType() 4197 : VectorType::get(PN->getType(), State.VF); 4198 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4199 State.set(PhiR, VecPhi, 0); 4200 } 4201 4202 /// A helper function for checking whether an integer division-related 4203 /// instruction may divide by zero (in which case it must be predicated if 4204 /// executed conditionally in the scalar code). 4205 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4206 /// Non-zero divisors that are non compile-time constants will not be 4207 /// converted into multiplication, so we will still end up scalarizing 4208 /// the division, but can do so w/o predication. 
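/// For example (illustrative IR only): `udiv i32 %x, 7` can never divide by
/// zero and needs no predication, whereas `udiv i32 %x, %y` with a
/// non-constant (or constant-zero) divisor is conservatively assumed to
/// possibly divide by zero.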
4209 static bool mayDivideByZero(Instruction &I) { 4210 assert((I.getOpcode() == Instruction::UDiv || 4211 I.getOpcode() == Instruction::SDiv || 4212 I.getOpcode() == Instruction::URem || 4213 I.getOpcode() == Instruction::SRem) && 4214 "Unexpected instruction"); 4215 Value *Divisor = I.getOperand(1); 4216 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4217 return !CInt || CInt->isZero(); 4218 } 4219 4220 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4221 VPUser &ArgOperands, 4222 VPTransformState &State) { 4223 assert(!isa<DbgInfoIntrinsic>(I) && 4224 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4225 setDebugLocFromInst(&I); 4226 4227 Module *M = I.getParent()->getParent()->getParent(); 4228 auto *CI = cast<CallInst>(&I); 4229 4230 SmallVector<Type *, 4> Tys; 4231 for (Value *ArgOperand : CI->args()) 4232 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4233 4234 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4235 4236 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4237 // version of the instruction. 4238 // Is it beneficial to perform intrinsic call compared to lib call? 4239 bool NeedToScalarize = false; 4240 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4241 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4242 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4243 assert((UseVectorIntrinsic || !NeedToScalarize) && 4244 "Instruction should be scalarized elsewhere."); 4245 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4246 "Either the intrinsic cost or vector call cost must be valid"); 4247 4248 for (unsigned Part = 0; Part < UF; ++Part) { 4249 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4250 SmallVector<Value *, 4> Args; 4251 for (auto &I : enumerate(ArgOperands.operands())) { 4252 // Some intrinsics have a scalar argument - don't replace it with a 4253 // vector. 4254 Value *Arg; 4255 if (!UseVectorIntrinsic || 4256 !isVectorIntrinsicWithScalarOpAtArg(ID, I.index())) 4257 Arg = State.get(I.value(), Part); 4258 else 4259 Arg = State.get(I.value(), VPIteration(0, 0)); 4260 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index())) 4261 TysForDecl.push_back(Arg->getType()); 4262 Args.push_back(Arg); 4263 } 4264 4265 Function *VectorF; 4266 if (UseVectorIntrinsic) { 4267 // Use vector version of the intrinsic. 4268 if (VF.isVector()) 4269 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4270 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4271 assert(VectorF && "Can't retrieve vector intrinsic."); 4272 } else { 4273 // Use vector version of the function call. 4274 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4275 #ifndef NDEBUG 4276 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4277 "Can't create vector function."); 4278 #endif 4279 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4280 } 4281 SmallVector<OperandBundleDef, 1> OpBundles; 4282 CI->getOperandBundlesAsDefs(OpBundles); 4283 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4284 4285 if (isa<FPMathOperator>(V)) 4286 V->copyFastMathFlags(CI); 4287 4288 State.set(Def, V, Part); 4289 addMetadata(V, &I); 4290 } 4291 } 4292 4293 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4294 // We should not collect Scalars more than once per VF. 
Right now, this 4295 // function is called from collectUniformsAndScalars(), which already does 4296 // this check. Collecting Scalars for VF=1 does not make any sense. 4297 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4298 "This function should not be visited twice for the same VF"); 4299 4300 // This avoids any chances of creating a REPLICATE recipe during planning 4301 // since that would result in generation of scalarized code during execution, 4302 // which is not supported for scalable vectors. 4303 if (VF.isScalable()) { 4304 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4305 return; 4306 } 4307 4308 SmallSetVector<Instruction *, 8> Worklist; 4309 4310 // These sets are used to seed the analysis with pointers used by memory 4311 // accesses that will remain scalar. 4312 SmallSetVector<Instruction *, 8> ScalarPtrs; 4313 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4314 auto *Latch = TheLoop->getLoopLatch(); 4315 4316 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4317 // The pointer operands of loads and stores will be scalar as long as the 4318 // memory access is not a gather or scatter operation. The value operand of a 4319 // store will remain scalar if the store is scalarized. 4320 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4321 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4322 assert(WideningDecision != CM_Unknown && 4323 "Widening decision should be ready at this moment"); 4324 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4325 if (Ptr == Store->getValueOperand()) 4326 return WideningDecision == CM_Scalarize; 4327 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4328 "Ptr is neither a value or pointer operand"); 4329 return WideningDecision != CM_GatherScatter; 4330 }; 4331 4332 // A helper that returns true if the given value is a bitcast or 4333 // getelementptr instruction contained in the loop. 4334 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4335 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4336 isa<GetElementPtrInst>(V)) && 4337 !TheLoop->isLoopInvariant(V); 4338 }; 4339 4340 // A helper that evaluates a memory access's use of a pointer. If the use will 4341 // be a scalar use and the pointer is only used by memory accesses, we place 4342 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4343 // PossibleNonScalarPtrs. 4344 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4345 // We only care about bitcast and getelementptr instructions contained in 4346 // the loop. 4347 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4348 return; 4349 4350 // If the pointer has already been identified as scalar (e.g., if it was 4351 // also identified as uniform), there's nothing to do. 4352 auto *I = cast<Instruction>(Ptr); 4353 if (Worklist.count(I)) 4354 return; 4355 4356 // If the use of the pointer will be a scalar use, and all users of the 4357 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4358 // place the pointer in PossibleNonScalarPtrs. 
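// Illustrative sketch (hypothetical): a loop-varying GEP that only computes
// the address of a scalarized store is placed in ScalarPtrs, while the same
// GEP additionally feeding, say, an integer compare is placed in
// PossibleNonScalarPtrs and is not seeded into the worklist below.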
4359 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4360 return isa<LoadInst>(U) || isa<StoreInst>(U);
4361 }))
4362 ScalarPtrs.insert(I);
4363 else
4364 PossibleNonScalarPtrs.insert(I);
4365 };
4366
4367 // We seed the scalars analysis with two classes of instructions: (1)
4368 // instructions marked uniform-after-vectorization and (2) bitcast,
4369 // getelementptr and (pointer) phi instructions used by memory accesses
4370 // requiring a scalar use.
4371 //
4372 // (1) Add to the worklist all instructions that have been identified as
4373 // uniform-after-vectorization.
4374 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4375
4376 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4377 // memory accesses requiring a scalar use. The pointer operands of loads and
4378 // stores will be scalar as long as the memory access is not a gather or
4379 // scatter operation. The value operand of a store will remain scalar if the
4380 // store is scalarized.
4381 for (auto *BB : TheLoop->blocks())
4382 for (auto &I : *BB) {
4383 if (auto *Load = dyn_cast<LoadInst>(&I)) {
4384 evaluatePtrUse(Load, Load->getPointerOperand());
4385 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4386 evaluatePtrUse(Store, Store->getPointerOperand());
4387 evaluatePtrUse(Store, Store->getValueOperand());
4388 }
4389 }
4390 for (auto *I : ScalarPtrs)
4391 if (!PossibleNonScalarPtrs.count(I)) {
4392 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4393 Worklist.insert(I);
4394 }
4395
4396 // Insert the forced scalars.
4397 // FIXME: Currently widenPHIInstruction() often creates a dead vector
4398 // induction variable when the PHI user is scalarized.
4399 auto ForcedScalar = ForcedScalars.find(VF);
4400 if (ForcedScalar != ForcedScalars.end())
4401 for (auto *I : ForcedScalar->second)
4402 Worklist.insert(I);
4403
4404 // Expand the worklist by looking through any bitcasts and getelementptr
4405 // instructions we've already identified as scalar. This is similar to the
4406 // expansion step in collectLoopUniforms(); however, here we're only
4407 // expanding to include additional bitcasts and getelementptr instructions.
4408 unsigned Idx = 0;
4409 while (Idx != Worklist.size()) {
4410 Instruction *Dst = Worklist[Idx++];
4411 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4412 continue;
4413 auto *Src = cast<Instruction>(Dst->getOperand(0));
4414 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4415 auto *J = cast<Instruction>(U);
4416 return !TheLoop->contains(J) || Worklist.count(J) ||
4417 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4418 isScalarUse(J, Src));
4419 })) {
4420 Worklist.insert(Src);
4421 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4422 }
4423 }
4424
4425 // An induction variable will remain scalar if all users of the induction
4426 // variable and induction variable update remain scalar.
4427 for (auto &Induction : Legal->getInductionVars()) {
4428 auto *Ind = Induction.first;
4429 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4430
4431 // If tail-folding is applied, the primary induction variable will be used
4432 // to feed a vector compare.
4433 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4434 continue;
4435
4436 // Returns true if \p Indvar is a pointer induction that is used directly by
4437 // load/store instruction \p I.
4438 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4439 Instruction *I) { 4440 return Induction.second.getKind() == 4441 InductionDescriptor::IK_PtrInduction && 4442 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4443 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4444 }; 4445 4446 // Determine if all users of the induction variable are scalar after 4447 // vectorization. 4448 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4449 auto *I = cast<Instruction>(U); 4450 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4451 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4452 }); 4453 if (!ScalarInd) 4454 continue; 4455 4456 // Determine if all users of the induction variable update instruction are 4457 // scalar after vectorization. 4458 auto ScalarIndUpdate = 4459 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4460 auto *I = cast<Instruction>(U); 4461 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4462 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4463 }); 4464 if (!ScalarIndUpdate) 4465 continue; 4466 4467 // The induction variable and its update instruction will remain scalar. 4468 Worklist.insert(Ind); 4469 Worklist.insert(IndUpdate); 4470 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4471 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4472 << "\n"); 4473 } 4474 4475 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4476 } 4477 4478 bool LoopVectorizationCostModel::isScalarWithPredication( 4479 Instruction *I, ElementCount VF) const { 4480 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4481 return false; 4482 switch(I->getOpcode()) { 4483 default: 4484 break; 4485 case Instruction::Load: 4486 case Instruction::Store: { 4487 if (!Legal->isMaskRequired(I)) 4488 return false; 4489 auto *Ptr = getLoadStorePointerOperand(I); 4490 auto *Ty = getLoadStoreType(I); 4491 Type *VTy = Ty; 4492 if (VF.isVector()) 4493 VTy = VectorType::get(Ty, VF); 4494 const Align Alignment = getLoadStoreAlignment(I); 4495 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4496 TTI.isLegalMaskedGather(VTy, Alignment)) 4497 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4498 TTI.isLegalMaskedScatter(VTy, Alignment)); 4499 } 4500 case Instruction::UDiv: 4501 case Instruction::SDiv: 4502 case Instruction::SRem: 4503 case Instruction::URem: 4504 return mayDivideByZero(*I); 4505 } 4506 return false; 4507 } 4508 4509 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4510 Instruction *I, ElementCount VF) { 4511 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4512 assert(getWideningDecision(I, VF) == CM_Unknown && 4513 "Decision should not be set yet."); 4514 auto *Group = getInterleavedAccessGroup(I); 4515 assert(Group && "Must have a group."); 4516 4517 // If the instruction's allocated size doesn't equal it's type size, it 4518 // requires padding and will be scalarized. 4519 auto &DL = I->getModule()->getDataLayout(); 4520 auto *ScalarTy = getLoadStoreType(I); 4521 if (hasIrregularType(ScalarTy, DL)) 4522 return false; 4523 4524 // If the group involves a non-integral pointer, we may not be able to 4525 // losslessly cast all values to a common type. 
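// Hedged illustration (hypothetical group): an interleave group that mixes a
// member loading a non-integral address-space pointer with a member loading
// plain i64 values is rejected below, because those element types cannot be
// losslessly cast to a common vector element type.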
4526 unsigned InterleaveFactor = Group->getFactor(); 4527 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); 4528 for (unsigned i = 0; i < InterleaveFactor; i++) { 4529 Instruction *Member = Group->getMember(i); 4530 if (!Member) 4531 continue; 4532 auto *MemberTy = getLoadStoreType(Member); 4533 bool MemberNI = DL.isNonIntegralPointerType(MemberTy); 4534 // Don't coerce non-integral pointers to integers or vice versa. 4535 if (MemberNI != ScalarNI) { 4536 // TODO: Consider adding special nullptr value case here 4537 return false; 4538 } else if (MemberNI && ScalarNI && 4539 ScalarTy->getPointerAddressSpace() != 4540 MemberTy->getPointerAddressSpace()) { 4541 return false; 4542 } 4543 } 4544 4545 // Check if masking is required. 4546 // A Group may need masking for one of two reasons: it resides in a block that 4547 // needs predication, or it was decided to use masking to deal with gaps 4548 // (either a gap at the end of a load-access that may result in a speculative 4549 // load, or any gaps in a store-access). 4550 bool PredicatedAccessRequiresMasking = 4551 blockNeedsPredicationForAnyReason(I->getParent()) && 4552 Legal->isMaskRequired(I); 4553 bool LoadAccessWithGapsRequiresEpilogMasking = 4554 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4555 !isScalarEpilogueAllowed(); 4556 bool StoreAccessWithGapsRequiresMasking = 4557 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4558 if (!PredicatedAccessRequiresMasking && 4559 !LoadAccessWithGapsRequiresEpilogMasking && 4560 !StoreAccessWithGapsRequiresMasking) 4561 return true; 4562 4563 // If masked interleaving is required, we expect that the user/target had 4564 // enabled it, because otherwise it either wouldn't have been created or 4565 // it should have been invalidated by the CostModel. 4566 assert(useMaskedInterleavedAccesses(TTI) && 4567 "Masked interleave-groups for predicated accesses are not enabled."); 4568 4569 if (Group->isReverse()) 4570 return false; 4571 4572 auto *Ty = getLoadStoreType(I); 4573 const Align Alignment = getLoadStoreAlignment(I); 4574 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4575 : TTI.isLegalMaskedStore(Ty, Alignment); 4576 } 4577 4578 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4579 Instruction *I, ElementCount VF) { 4580 // Get and ensure we have a valid memory instruction. 4581 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4582 4583 auto *Ptr = getLoadStorePointerOperand(I); 4584 auto *ScalarTy = getLoadStoreType(I); 4585 4586 // In order to be widened, the pointer should be consecutive, first of all. 4587 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4588 return false; 4589 4590 // If the instruction is a store located in a predicated block, it will be 4591 // scalarized. 4592 if (isScalarWithPredication(I, VF)) 4593 return false; 4594 4595 // If the instruction's allocated size doesn't equal it's type size, it 4596 // requires padding and will be scalarized. 4597 auto &DL = I->getModule()->getDataLayout(); 4598 if (hasIrregularType(ScalarTy, DL)) 4599 return false; 4600 4601 return true; 4602 } 4603 4604 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4605 // We should not collect Uniforms more than once per VF. Right now, 4606 // this function is called from collectUniformsAndScalars(), which 4607 // already does this check. Collecting Uniforms for VF=1 does not make any 4608 // sense. 
4609 4610 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4611 "This function should not be visited twice for the same VF"); 4612 4613 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4614 // not analyze again. Uniforms.count(VF) will return 1. 4615 Uniforms[VF].clear(); 4616 4617 // We now know that the loop is vectorizable! 4618 // Collect instructions inside the loop that will remain uniform after 4619 // vectorization. 4620 4621 // Global values, params and instructions outside of current loop are out of 4622 // scope. 4623 auto isOutOfScope = [&](Value *V) -> bool { 4624 Instruction *I = dyn_cast<Instruction>(V); 4625 return (!I || !TheLoop->contains(I)); 4626 }; 4627 4628 // Worklist containing uniform instructions demanding lane 0. 4629 SetVector<Instruction *> Worklist; 4630 BasicBlock *Latch = TheLoop->getLoopLatch(); 4631 4632 // Add uniform instructions demanding lane 0 to the worklist. Instructions 4633 // that are scalar with predication must not be considered uniform after 4634 // vectorization, because that would create an erroneous replicating region 4635 // where only a single instance out of VF should be formed. 4636 // TODO: optimize such seldom cases if found important, see PR40816. 4637 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4638 if (isOutOfScope(I)) { 4639 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 4640 << *I << "\n"); 4641 return; 4642 } 4643 if (isScalarWithPredication(I, VF)) { 4644 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4645 << *I << "\n"); 4646 return; 4647 } 4648 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4649 Worklist.insert(I); 4650 }; 4651 4652 // Start with the conditional branch. If the branch condition is an 4653 // instruction contained in the loop that is only used by the branch, it is 4654 // uniform. 4655 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4656 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4657 addToWorklistIfAllowed(Cmp); 4658 4659 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 4660 InstWidening WideningDecision = getWideningDecision(I, VF); 4661 assert(WideningDecision != CM_Unknown && 4662 "Widening decision should be ready at this moment"); 4663 4664 // A uniform memory op is itself uniform. We exclude uniform stores 4665 // here as they demand the last lane, not the first one. 4666 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 4667 assert(WideningDecision == CM_Scalarize); 4668 return true; 4669 } 4670 4671 return (WideningDecision == CM_Widen || 4672 WideningDecision == CM_Widen_Reverse || 4673 WideningDecision == CM_Interleave); 4674 }; 4675 4676 4677 // Returns true if Ptr is the pointer operand of a memory access instruction 4678 // I, and I is known to not require scalarization. 4679 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4680 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4681 }; 4682 4683 // Holds a list of values which are known to have at least one uniform use. 4684 // Note that there may be other uses which aren't uniform. A "uniform use" 4685 // here is something which only demands lane 0 of the unrolled iterations; 4686 // it does not imply that all lanes produce the same value (e.g. 
this is not 4687 // the usual meaning of uniform) 4688 SetVector<Value *> HasUniformUse; 4689 4690 // Scan the loop for instructions which are either a) known to have only 4691 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 4692 for (auto *BB : TheLoop->blocks()) 4693 for (auto &I : *BB) { 4694 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 4695 switch (II->getIntrinsicID()) { 4696 case Intrinsic::sideeffect: 4697 case Intrinsic::experimental_noalias_scope_decl: 4698 case Intrinsic::assume: 4699 case Intrinsic::lifetime_start: 4700 case Intrinsic::lifetime_end: 4701 if (TheLoop->hasLoopInvariantOperands(&I)) 4702 addToWorklistIfAllowed(&I); 4703 break; 4704 default: 4705 break; 4706 } 4707 } 4708 4709 // ExtractValue instructions must be uniform, because the operands are 4710 // known to be loop-invariant. 4711 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 4712 assert(isOutOfScope(EVI->getAggregateOperand()) && 4713 "Expected aggregate value to be loop invariant"); 4714 addToWorklistIfAllowed(EVI); 4715 continue; 4716 } 4717 4718 // If there's no pointer operand, there's nothing to do. 4719 auto *Ptr = getLoadStorePointerOperand(&I); 4720 if (!Ptr) 4721 continue; 4722 4723 // A uniform memory op is itself uniform. We exclude uniform stores 4724 // here as they demand the last lane, not the first one. 4725 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 4726 addToWorklistIfAllowed(&I); 4727 4728 if (isUniformDecision(&I, VF)) { 4729 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 4730 HasUniformUse.insert(Ptr); 4731 } 4732 } 4733 4734 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 4735 // demanding) users. Since loops are assumed to be in LCSSA form, this 4736 // disallows uses outside the loop as well. 4737 for (auto *V : HasUniformUse) { 4738 if (isOutOfScope(V)) 4739 continue; 4740 auto *I = cast<Instruction>(V); 4741 auto UsersAreMemAccesses = 4742 llvm::all_of(I->users(), [&](User *U) -> bool { 4743 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 4744 }); 4745 if (UsersAreMemAccesses) 4746 addToWorklistIfAllowed(I); 4747 } 4748 4749 // Expand Worklist in topological order: whenever a new instruction 4750 // is added , its users should be already inside Worklist. It ensures 4751 // a uniform instruction will only be used by uniform instructions. 4752 unsigned idx = 0; 4753 while (idx != Worklist.size()) { 4754 Instruction *I = Worklist[idx++]; 4755 4756 for (auto OV : I->operand_values()) { 4757 // isOutOfScope operands cannot be uniform instructions. 4758 if (isOutOfScope(OV)) 4759 continue; 4760 // First order recurrence Phi's should typically be considered 4761 // non-uniform. 4762 auto *OP = dyn_cast<PHINode>(OV); 4763 if (OP && Legal->isFirstOrderRecurrence(OP)) 4764 continue; 4765 // If all the users of the operand are uniform, then add the 4766 // operand into the uniform worklist. 4767 auto *OI = cast<Instruction>(OV); 4768 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4769 auto *J = cast<Instruction>(U); 4770 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 4771 })) 4772 addToWorklistIfAllowed(OI); 4773 } 4774 } 4775 4776 // For an instruction to be added into Worklist above, all its users inside 4777 // the loop should also be in Worklist. However, this condition cannot be 4778 // true for phi nodes that form a cyclic dependence. We must process phi 4779 // nodes separately. 
An induction variable will remain uniform if all users 4780 // of the induction variable and induction variable update remain uniform. 4781 // The code below handles both pointer and non-pointer induction variables. 4782 for (auto &Induction : Legal->getInductionVars()) { 4783 auto *Ind = Induction.first; 4784 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4785 4786 // Determine if all users of the induction variable are uniform after 4787 // vectorization. 4788 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4789 auto *I = cast<Instruction>(U); 4790 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4791 isVectorizedMemAccessUse(I, Ind); 4792 }); 4793 if (!UniformInd) 4794 continue; 4795 4796 // Determine if all users of the induction variable update instruction are 4797 // uniform after vectorization. 4798 auto UniformIndUpdate = 4799 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4800 auto *I = cast<Instruction>(U); 4801 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4802 isVectorizedMemAccessUse(I, IndUpdate); 4803 }); 4804 if (!UniformIndUpdate) 4805 continue; 4806 4807 // The induction variable and its update instruction will remain uniform. 4808 addToWorklistIfAllowed(Ind); 4809 addToWorklistIfAllowed(IndUpdate); 4810 } 4811 4812 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4813 } 4814 4815 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4816 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4817 4818 if (Legal->getRuntimePointerChecking()->Need) { 4819 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4820 "runtime pointer checks needed. Enable vectorization of this " 4821 "loop with '#pragma clang loop vectorize(enable)' when " 4822 "compiling with -Os/-Oz", 4823 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4824 return true; 4825 } 4826 4827 if (!PSE.getPredicate().isAlwaysTrue()) { 4828 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4829 "runtime SCEV checks needed. Enable vectorization of this " 4830 "loop with '#pragma clang loop vectorize(enable)' when " 4831 "compiling with -Os/-Oz", 4832 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4833 return true; 4834 } 4835 4836 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4837 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4838 reportVectorizationFailure("Runtime stride check for small trip count", 4839 "runtime stride == 1 checks needed. Enable vectorization of " 4840 "this loop without such check by compiling with -Os/-Oz", 4841 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4842 return true; 4843 } 4844 4845 return false; 4846 } 4847 4848 ElementCount 4849 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 4850 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 4851 return ElementCount::getScalable(0); 4852 4853 if (Hints->isScalableVectorizationDisabled()) { 4854 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 4855 "ScalableVectorizationDisabled", ORE, TheLoop); 4856 return ElementCount::getScalable(0); 4857 } 4858 4859 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 4860 4861 auto MaxScalableVF = ElementCount::getScalable( 4862 std::numeric_limits<ElementCount::ScalarTy>::max()); 4863 4864 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 
4865 // FIXME: While for scalable vectors this is currently sufficient, this should 4866 // be replaced by a more detailed mechanism that filters out specific VFs, 4867 // instead of invalidating vectorization for a whole set of VFs based on the 4868 // MaxVF. 4869 4870 // Disable scalable vectorization if the loop contains unsupported reductions. 4871 if (!canVectorizeReductions(MaxScalableVF)) { 4872 reportVectorizationInfo( 4873 "Scalable vectorization not supported for the reduction " 4874 "operations found in this loop.", 4875 "ScalableVFUnfeasible", ORE, TheLoop); 4876 return ElementCount::getScalable(0); 4877 } 4878 4879 // Disable scalable vectorization if the loop contains any instructions 4880 // with element types not supported for scalable vectors. 4881 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 4882 return !Ty->isVoidTy() && 4883 !this->TTI.isElementTypeLegalForScalableVector(Ty); 4884 })) { 4885 reportVectorizationInfo("Scalable vectorization is not supported " 4886 "for all element types found in this loop.", 4887 "ScalableVFUnfeasible", ORE, TheLoop); 4888 return ElementCount::getScalable(0); 4889 } 4890 4891 if (Legal->isSafeForAnyVectorWidth()) 4892 return MaxScalableVF; 4893 4894 // Limit MaxScalableVF by the maximum safe dependence distance. 4895 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 4896 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 4897 MaxVScale = 4898 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 4899 MaxScalableVF = ElementCount::getScalable( 4900 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 4901 if (!MaxScalableVF) 4902 reportVectorizationInfo( 4903 "Max legal vector width too small, scalable vectorization " 4904 "unfeasible.", 4905 "ScalableVFUnfeasible", ORE, TheLoop); 4906 4907 return MaxScalableVF; 4908 } 4909 4910 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 4911 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 4912 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4913 unsigned SmallestType, WidestType; 4914 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4915 4916 // Get the maximum safe dependence distance in bits computed by LAA. 4917 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4918 // the memory accesses that is most restrictive (involved in the smallest 4919 // dependence distance). 4920 unsigned MaxSafeElements = 4921 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 4922 4923 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 4924 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 4925 4926 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 4927 << ".\n"); 4928 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 4929 << ".\n"); 4930 4931 // First analyze the UserVF, fall back if the UserVF should be ignored. 4932 if (UserVF) { 4933 auto MaxSafeUserVF = 4934 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 4935 4936 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 4937 // If `VF=vscale x N` is safe, then so is `VF=N` 4938 if (UserVF.isScalable()) 4939 return FixedScalableVFPair( 4940 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 4941 else 4942 return UserVF; 4943 } 4944 4945 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 4946 4947 // Only clamp if the UserVF is not scalable. 
If the UserVF is scalable, it 4948 // is better to ignore the hint and let the compiler choose a suitable VF. 4949 if (!UserVF.isScalable()) { 4950 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4951 << " is unsafe, clamping to max safe VF=" 4952 << MaxSafeFixedVF << ".\n"); 4953 ORE->emit([&]() { 4954 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4955 TheLoop->getStartLoc(), 4956 TheLoop->getHeader()) 4957 << "User-specified vectorization factor " 4958 << ore::NV("UserVectorizationFactor", UserVF) 4959 << " is unsafe, clamping to maximum safe vectorization factor " 4960 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 4961 }); 4962 return MaxSafeFixedVF; 4963 } 4964 4965 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 4966 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4967 << " is ignored because scalable vectors are not " 4968 "available.\n"); 4969 ORE->emit([&]() { 4970 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4971 TheLoop->getStartLoc(), 4972 TheLoop->getHeader()) 4973 << "User-specified vectorization factor " 4974 << ore::NV("UserVectorizationFactor", UserVF) 4975 << " is ignored because the target does not support scalable " 4976 "vectors. The compiler will pick a more suitable value."; 4977 }); 4978 } else { 4979 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4980 << " is unsafe. Ignoring scalable UserVF.\n"); 4981 ORE->emit([&]() { 4982 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4983 TheLoop->getStartLoc(), 4984 TheLoop->getHeader()) 4985 << "User-specified vectorization factor " 4986 << ore::NV("UserVectorizationFactor", UserVF) 4987 << " is unsafe. Ignoring the hint to let the compiler pick a " 4988 "more suitable value."; 4989 }); 4990 } 4991 } 4992 4993 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 4994 << " / " << WidestType << " bits.\n"); 4995 4996 FixedScalableVFPair Result(ElementCount::getFixed(1), 4997 ElementCount::getScalable(0)); 4998 if (auto MaxVF = 4999 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5000 MaxSafeFixedVF, FoldTailByMasking)) 5001 Result.FixedVF = MaxVF; 5002 5003 if (auto MaxVF = 5004 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5005 MaxSafeScalableVF, FoldTailByMasking)) 5006 if (MaxVF.isScalable()) { 5007 Result.ScalableVF = MaxVF; 5008 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5009 << "\n"); 5010 } 5011 5012 return Result; 5013 } 5014 5015 FixedScalableVFPair 5016 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5017 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5018 // TODO: It may by useful to do since it's still likely to be dynamically 5019 // uniform if the target can skip. 5020 reportVectorizationFailure( 5021 "Not inserting runtime ptr check for divergent target", 5022 "runtime pointer checks needed. 
Not enabled for divergent target",
5023 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5024 return FixedScalableVFPair::getNone();
5025 }
5026
5027 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5028 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5029 if (TC == 1) {
5030 reportVectorizationFailure("Single iteration (non) loop",
5031 "loop trip count is one, irrelevant for vectorization",
5032 "SingleIterationLoop", ORE, TheLoop);
5033 return FixedScalableVFPair::getNone();
5034 }
5035
5036 switch (ScalarEpilogueStatus) {
5037 case CM_ScalarEpilogueAllowed:
5038 return computeFeasibleMaxVF(TC, UserVF, false);
5039 case CM_ScalarEpilogueNotAllowedUsePredicate:
5040 LLVM_FALLTHROUGH;
5041 case CM_ScalarEpilogueNotNeededUsePredicate:
5042 LLVM_DEBUG(
5043 dbgs() << "LV: vector predicate hint/switch found.\n"
5044 << "LV: Not allowing scalar epilogue, creating predicated "
5045 << "vector loop.\n");
5046 break;
5047 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5048 // fallthrough as a special case of OptForSize
5049 case CM_ScalarEpilogueNotAllowedOptSize:
5050 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5051 LLVM_DEBUG(
5052 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5053 else
5054 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5055 << "count.\n");
5056
5057 // Bail if runtime checks are required, which are not good when optimising
5058 // for size.
5059 if (runtimeChecksRequired())
5060 return FixedScalableVFPair::getNone();
5061
5062 break;
5063 }
5064
5065 // The only loops we can vectorize without a scalar epilogue are loops with
5066 // a bottom-test and a single exiting block. We'd have to handle the fact
5067 // that not every instruction executes on the last iteration. This will
5068 // require a lane mask which varies through the vector loop body. (TODO)
5069 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5070 // If there was a tail-folding hint/switch, but we can't fold the tail by
5071 // masking, fall back to a vectorization with a scalar epilogue.
5072 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5073 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5074 "scalar epilogue instead.\n");
5075 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5076 return computeFeasibleMaxVF(TC, UserVF, false);
5077 }
5078 return FixedScalableVFPair::getNone();
5079 }
5080
5081 // Now try the tail folding.
5082
5083 // Invalidate interleave groups that require an epilogue if we can't mask
5084 // the interleave-group.
5085 if (!useMaskedInterleavedAccesses(TTI)) {
5086 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5087 "No decisions should have been taken at this point");
5088 // Note: There is no need to invalidate any cost modeling decisions here, as
5089 // none were taken so far.
5090 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5091 }
5092
5093 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5094 // Avoid tail folding if the trip count is known to be a multiple of any VF
5095 // we chose.
5096 // FIXME: The condition below pessimises the case for fixed-width vectors,
5097 // when scalable VFs are also candidates for vectorization.
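// Worked example with made-up numbers: for a known trip count of 16 with
// MaxFixedVF = 8 and UserIC = 2, MaxVFtimesIC is 16 and the exit count is
// evenly divisible by it (16 urem 16 == 0), so no scalar tail remains and
// tail folding can be skipped.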
5098 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5099 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5100 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5101 "MaxFixedVF must be a power of 2"); 5102 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5103 : MaxFixedVF.getFixedValue(); 5104 ScalarEvolution *SE = PSE.getSE(); 5105 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5106 const SCEV *ExitCount = SE->getAddExpr( 5107 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5108 const SCEV *Rem = SE->getURemExpr( 5109 SE->applyLoopGuards(ExitCount, TheLoop), 5110 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5111 if (Rem->isZero()) { 5112 // Accept MaxFixedVF if we do not have a tail. 5113 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5114 return MaxFactors; 5115 } 5116 } 5117 5118 // If we don't know the precise trip count, or if the trip count that we 5119 // found modulo the vectorization factor is not zero, try to fold the tail 5120 // by masking. 5121 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5122 if (Legal->prepareToFoldTailByMasking()) { 5123 FoldTailByMasking = true; 5124 return MaxFactors; 5125 } 5126 5127 // If there was a tail-folding hint/switch, but we can't fold the tail by 5128 // masking, fallback to a vectorization with a scalar epilogue. 5129 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5130 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5131 "scalar epilogue instead.\n"); 5132 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5133 return MaxFactors; 5134 } 5135 5136 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5137 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5138 return FixedScalableVFPair::getNone(); 5139 } 5140 5141 if (TC == 0) { 5142 reportVectorizationFailure( 5143 "Unable to calculate the loop count due to complex control flow", 5144 "unable to calculate the loop count due to complex control flow", 5145 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5146 return FixedScalableVFPair::getNone(); 5147 } 5148 5149 reportVectorizationFailure( 5150 "Cannot optimize for size and vectorize at the same time.", 5151 "cannot optimize for size and vectorize at the same time. " 5152 "Enable vectorization of this loop with '#pragma clang loop " 5153 "vectorize(enable)' when compiling with -Os/-Oz", 5154 "NoTailLoopWithOptForSize", ORE, TheLoop); 5155 return FixedScalableVFPair::getNone(); 5156 } 5157 5158 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5159 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5160 ElementCount MaxSafeVF, bool FoldTailByMasking) { 5161 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5162 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5163 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5164 : TargetTransformInfo::RGK_FixedWidthVector); 5165 5166 // Convenience function to return the minimum of two ElementCounts. 5167 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5168 assert((LHS.isScalable() == RHS.isScalable()) && 5169 "Scalable flags must match"); 5170 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5171 }; 5172 5173 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5174 // Note that both WidestRegister and WidestType may not be a powers of 2. 
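// Hypothetical illustration of the computation below: with a 128-bit widest
// register and a 32-bit widest element type, PowerOf2Floor(128 / 32) yields a
// MaxVectorElementCount of 4 fixed lanes (or <vscale x 4> elements when the
// scalable maximum is being computed), further clamped by MaxSafeVF.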
5175 auto MaxVectorElementCount = ElementCount::get( 5176 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5177 ComputeScalableMaxVF); 5178 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5179 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5180 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5181 5182 if (!MaxVectorElementCount) { 5183 LLVM_DEBUG(dbgs() << "LV: The target has no " 5184 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5185 << " vector registers.\n"); 5186 return ElementCount::getFixed(1); 5187 } 5188 5189 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5190 if (ConstTripCount && 5191 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5192 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5193 // If loop trip count (TC) is known at compile time there is no point in 5194 // choosing VF greater than TC (as done in the loop below). Select maximum 5195 // power of two which doesn't exceed TC. 5196 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5197 // when the TC is less than or equal to the known number of lanes. 5198 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5199 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5200 "exceeding the constant trip count: " 5201 << ClampedConstTripCount << "\n"); 5202 return ElementCount::getFixed(ClampedConstTripCount); 5203 } 5204 5205 TargetTransformInfo::RegisterKind RegKind = 5206 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5207 : TargetTransformInfo::RGK_FixedWidthVector; 5208 ElementCount MaxVF = MaxVectorElementCount; 5209 if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && 5210 TTI.shouldMaximizeVectorBandwidth(RegKind))) { 5211 auto MaxVectorElementCountMaxBW = ElementCount::get( 5212 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5213 ComputeScalableMaxVF); 5214 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5215 5216 // Collect all viable vectorization factors larger than the default MaxVF 5217 // (i.e. MaxVectorElementCount). 5218 SmallVector<ElementCount, 8> VFs; 5219 for (ElementCount VS = MaxVectorElementCount * 2; 5220 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5221 VFs.push_back(VS); 5222 5223 // For each VF calculate its register usage. 5224 auto RUs = calculateRegisterUsage(VFs); 5225 5226 // Select the largest VF which doesn't require more registers than existing 5227 // ones. 5228 for (int i = RUs.size() - 1; i >= 0; --i) { 5229 bool Selected = true; 5230 for (auto &pair : RUs[i].MaxLocalUsers) { 5231 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5232 if (pair.second > TargetNumRegisters) 5233 Selected = false; 5234 } 5235 if (Selected) { 5236 MaxVF = VFs[i]; 5237 break; 5238 } 5239 } 5240 if (ElementCount MinVF = 5241 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5242 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5243 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5244 << ") with target's minimum: " << MinVF << '\n'); 5245 MaxVF = MinVF; 5246 } 5247 } 5248 5249 // Invalidate any widening decisions we might have made, in case the loop 5250 // requires prediction (decided later), but we have already made some 5251 // load/store widening decisions. 
5252 invalidateCostModelingDecisions(); 5253 } 5254 return MaxVF; 5255 } 5256 5257 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { 5258 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5259 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5260 auto Min = Attr.getVScaleRangeMin(); 5261 auto Max = Attr.getVScaleRangeMax(); 5262 if (Max && Min == Max) 5263 return Max; 5264 } 5265 5266 return TTI.getVScaleForTuning(); 5267 } 5268 5269 bool LoopVectorizationCostModel::isMoreProfitable( 5270 const VectorizationFactor &A, const VectorizationFactor &B) const { 5271 InstructionCost CostA = A.Cost; 5272 InstructionCost CostB = B.Cost; 5273 5274 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5275 5276 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5277 MaxTripCount) { 5278 // If we are folding the tail and the trip count is a known (possibly small) 5279 // constant, the trip count will be rounded up to an integer number of 5280 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5281 // which we compare directly. When not folding the tail, the total cost will 5282 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5283 // approximated with the per-lane cost below instead of using the tripcount 5284 // as here. 5285 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5286 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5287 return RTCostA < RTCostB; 5288 } 5289 5290 // Improve estimate for the vector width if it is scalable. 5291 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5292 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5293 if (Optional<unsigned> VScale = getVScaleForTuning()) { 5294 if (A.Width.isScalable()) 5295 EstimatedWidthA *= VScale.getValue(); 5296 if (B.Width.isScalable()) 5297 EstimatedWidthB *= VScale.getValue(); 5298 } 5299 5300 // Assume vscale may be larger than 1 (or the value being tuned for), 5301 // so that scalable vectorization is slightly favorable over fixed-width 5302 // vectorization. 5303 if (A.Width.isScalable() && !B.Width.isScalable()) 5304 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5305 5306 // To avoid the need for FP division: 5307 // (CostA / A.Width) < (CostB / B.Width) 5308 // <=> (CostA * B.Width) < (CostB * A.Width) 5309 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5310 } 5311 5312 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5313 const ElementCountSet &VFCandidates) { 5314 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5315 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5316 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5317 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5318 "Expected Scalar VF to be a candidate"); 5319 5320 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 5321 VectorizationFactor ChosenFactor = ScalarCost; 5322 5323 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5324 if (ForceVectorization && VFCandidates.size() > 1) { 5325 // Ignore scalar width, because the user explicitly wants vectorization. 5326 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5327 // evaluation. 
5328 ChosenFactor.Cost = InstructionCost::getMax(); 5329 } 5330 5331 SmallVector<InstructionVFPair> InvalidCosts; 5332 for (const auto &i : VFCandidates) { 5333 // The cost for scalar VF=1 is already calculated, so ignore it. 5334 if (i.isScalar()) 5335 continue; 5336 5337 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5338 VectorizationFactor Candidate(i, C.first); 5339 5340 #ifndef NDEBUG 5341 unsigned AssumedMinimumVscale = 1; 5342 if (Optional<unsigned> VScale = getVScaleForTuning()) 5343 AssumedMinimumVscale = VScale.getValue(); 5344 unsigned Width = 5345 Candidate.Width.isScalable() 5346 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5347 : Candidate.Width.getFixedValue(); 5348 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5349 << " costs: " << (Candidate.Cost / Width)); 5350 if (i.isScalable()) 5351 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5352 << AssumedMinimumVscale << ")"); 5353 LLVM_DEBUG(dbgs() << ".\n"); 5354 #endif 5355 5356 if (!C.second && !ForceVectorization) { 5357 LLVM_DEBUG( 5358 dbgs() << "LV: Not considering vector loop of width " << i 5359 << " because it will not generate any vector instructions.\n"); 5360 continue; 5361 } 5362 5363 // If profitable add it to ProfitableVF list. 5364 if (isMoreProfitable(Candidate, ScalarCost)) 5365 ProfitableVFs.push_back(Candidate); 5366 5367 if (isMoreProfitable(Candidate, ChosenFactor)) 5368 ChosenFactor = Candidate; 5369 } 5370 5371 // Emit a report of VFs with invalid costs in the loop. 5372 if (!InvalidCosts.empty()) { 5373 // Group the remarks per instruction, keeping the instruction order from 5374 // InvalidCosts. 5375 std::map<Instruction *, unsigned> Numbering; 5376 unsigned I = 0; 5377 for (auto &Pair : InvalidCosts) 5378 if (!Numbering.count(Pair.first)) 5379 Numbering[Pair.first] = I++; 5380 5381 // Sort the list, first on instruction(number) then on VF. 5382 llvm::sort(InvalidCosts, 5383 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5384 if (Numbering[A.first] != Numbering[B.first]) 5385 return Numbering[A.first] < Numbering[B.first]; 5386 ElementCountComparator ECC; 5387 return ECC(A.second, B.second); 5388 }); 5389 5390 // For a list of ordered instruction-vf pairs: 5391 // [(load, vf1), (load, vf2), (store, vf1)] 5392 // Group the instructions together to emit separate remarks for: 5393 // load (vf1, vf2) 5394 // store (vf1) 5395 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5396 auto Subset = ArrayRef<InstructionVFPair>(); 5397 do { 5398 if (Subset.empty()) 5399 Subset = Tail.take_front(1); 5400 5401 Instruction *I = Subset.front().first; 5402 5403 // If the next instruction is different, or if there are no other pairs, 5404 // emit a remark for the collated subset. e.g. 5405 // [(load, vf1), (load, vf2))] 5406 // to emit: 5407 // remark: invalid costs for 'load' at VF=(vf, vf2) 5408 if (Subset == Tail || Tail[Subset.size()].first != I) { 5409 std::string OutString; 5410 raw_string_ostream OS(OutString); 5411 assert(!Subset.empty() && "Unexpected empty range"); 5412 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5413 for (auto &Pair : Subset) 5414 OS << (Pair.second == Subset.front().second ? 
"" : ", ") 5415 << Pair.second; 5416 OS << "):"; 5417 if (auto *CI = dyn_cast<CallInst>(I)) 5418 OS << " call to " << CI->getCalledFunction()->getName(); 5419 else 5420 OS << " " << I->getOpcodeName(); 5421 OS.flush(); 5422 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5423 Tail = Tail.drop_front(Subset.size()); 5424 Subset = {}; 5425 } else 5426 // Grow the subset by one element 5427 Subset = Tail.take_front(Subset.size() + 1); 5428 } while (!Tail.empty()); 5429 } 5430 5431 if (!EnableCondStoresVectorization && NumPredStores) { 5432 reportVectorizationFailure("There are conditional stores.", 5433 "store that is conditionally executed prevents vectorization", 5434 "ConditionalStore", ORE, TheLoop); 5435 ChosenFactor = ScalarCost; 5436 } 5437 5438 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5439 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 5440 << "LV: Vectorization seems to be not beneficial, " 5441 << "but was forced by a user.\n"); 5442 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5443 return ChosenFactor; 5444 } 5445 5446 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5447 const Loop &L, ElementCount VF) const { 5448 // Cross iteration phis such as reductions need special handling and are 5449 // currently unsupported. 5450 if (any_of(L.getHeader()->phis(), 5451 [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); })) 5452 return false; 5453 5454 // Phis with uses outside of the loop require special handling and are 5455 // currently unsupported. 5456 for (auto &Entry : Legal->getInductionVars()) { 5457 // Look for uses of the value of the induction at the last iteration. 5458 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5459 for (User *U : PostInc->users()) 5460 if (!L.contains(cast<Instruction>(U))) 5461 return false; 5462 // Look for uses of penultimate value of the induction. 5463 for (User *U : Entry.first->users()) 5464 if (!L.contains(cast<Instruction>(U))) 5465 return false; 5466 } 5467 5468 // Induction variables that are widened require special handling that is 5469 // currently not supported. 5470 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5471 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5472 this->isProfitableToScalarize(Entry.first, VF)); 5473 })) 5474 return false; 5475 5476 // Epilogue vectorization code has not been auditted to ensure it handles 5477 // non-latch exits properly. It may be fine, but it needs auditted and 5478 // tested. 5479 if (L.getExitingBlock() != L.getLoopLatch()) 5480 return false; 5481 5482 return true; 5483 } 5484 5485 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5486 const ElementCount VF) const { 5487 // FIXME: We need a much better cost-model to take different parameters such 5488 // as register pressure, code size increase and cost of extra branches into 5489 // account. For now we apply a very crude heuristic and only consider loops 5490 // with vectorization factors larger than a certain value. 5491 // We also consider epilogue vectorization unprofitable for targets that don't 5492 // consider interleaving beneficial (eg. MVE). 5493 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5494 return false; 5495 // FIXME: We should consider changing the threshold for scalable 5496 // vectors to take VScaleForTuning into account. 
5497 if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF) 5498 return true; 5499 return false; 5500 } 5501 5502 VectorizationFactor 5503 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5504 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5505 VectorizationFactor Result = VectorizationFactor::Disabled(); 5506 if (!EnableEpilogueVectorization) { 5507 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5508 return Result; 5509 } 5510 5511 if (!isScalarEpilogueAllowed()) { 5512 LLVM_DEBUG( 5513 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5514 "allowed.\n";); 5515 return Result; 5516 } 5517 5518 // Not really a cost consideration, but check for unsupported cases here to 5519 // simplify the logic. 5520 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5521 LLVM_DEBUG( 5522 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5523 "not a supported candidate.\n";); 5524 return Result; 5525 } 5526 5527 if (EpilogueVectorizationForceVF > 1) { 5528 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5529 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5530 if (LVP.hasPlanWithVF(ForcedEC)) 5531 return {ForcedEC, 0}; 5532 else { 5533 LLVM_DEBUG( 5534 dbgs() 5535 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5536 return Result; 5537 } 5538 } 5539 5540 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5541 TheLoop->getHeader()->getParent()->hasMinSize()) { 5542 LLVM_DEBUG( 5543 dbgs() 5544 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5545 return Result; 5546 } 5547 5548 if (!isEpilogueVectorizationProfitable(MainLoopVF)) { 5549 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5550 "this loop\n"); 5551 return Result; 5552 } 5553 5554 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 5555 // the main loop handles 8 lanes per iteration. We could still benefit from 5556 // vectorizing the epilogue loop with VF=4. 5557 ElementCount EstimatedRuntimeVF = MainLoopVF; 5558 if (MainLoopVF.isScalable()) { 5559 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5560 if (Optional<unsigned> VScale = getVScaleForTuning()) 5561 EstimatedRuntimeVF *= VScale.getValue(); 5562 } 5563 5564 for (auto &NextVF : ProfitableVFs) 5565 if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5566 ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || 5567 ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && 5568 (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && 5569 LVP.hasPlanWithVF(NextVF.Width)) 5570 Result = NextVF; 5571 5572 if (Result != VectorizationFactor::Disabled()) 5573 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5574 << Result.Width << "\n";); 5575 return Result; 5576 } 5577 5578 std::pair<unsigned, unsigned> 5579 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5580 unsigned MinWidth = -1U; 5581 unsigned MaxWidth = 8; 5582 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5583 // For in-loop reductions, no element types are added to ElementTypesInLoop 5584 // if there are no loads/stores in the loop. In this case, check through the 5585 // reduction variables to determine the maximum width. 
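  // For example (illustrative values): a reduction whose recurrence type is
  // i32 but whose inputs are only ever casts from i16 contributes
  // min(16, 32) = 16 bits below, so the result reflects the narrowest width
  // the recurrence actually needs.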
5586 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5587 // Reset MaxWidth so that we can find the smallest type used by recurrences 5588 // in the loop. 5589 MaxWidth = -1U; 5590 for (auto &PhiDescriptorPair : Legal->getReductionVars()) { 5591 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5592 // When finding the min width used by the recurrence we need to account 5593 // for casts on the input operands of the recurrence. 5594 MaxWidth = std::min<unsigned>( 5595 MaxWidth, std::min<unsigned>( 5596 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5597 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5598 } 5599 } else { 5600 for (Type *T : ElementTypesInLoop) { 5601 MinWidth = std::min<unsigned>( 5602 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5603 MaxWidth = std::max<unsigned>( 5604 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5605 } 5606 } 5607 return {MinWidth, MaxWidth}; 5608 } 5609 5610 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5611 ElementTypesInLoop.clear(); 5612 // For each block. 5613 for (BasicBlock *BB : TheLoop->blocks()) { 5614 // For each instruction in the loop. 5615 for (Instruction &I : BB->instructionsWithoutDebug()) { 5616 Type *T = I.getType(); 5617 5618 // Skip ignored values. 5619 if (ValuesToIgnore.count(&I)) 5620 continue; 5621 5622 // Only examine Loads, Stores and PHINodes. 5623 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5624 continue; 5625 5626 // Examine PHI nodes that are reduction variables. Update the type to 5627 // account for the recurrence type. 5628 if (auto *PN = dyn_cast<PHINode>(&I)) { 5629 if (!Legal->isReductionVariable(PN)) 5630 continue; 5631 const RecurrenceDescriptor &RdxDesc = 5632 Legal->getReductionVars().find(PN)->second; 5633 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5634 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5635 RdxDesc.getRecurrenceType(), 5636 TargetTransformInfo::ReductionFlags())) 5637 continue; 5638 T = RdxDesc.getRecurrenceType(); 5639 } 5640 5641 // Examine the stored values. 5642 if (auto *ST = dyn_cast<StoreInst>(&I)) 5643 T = ST->getValueOperand()->getType(); 5644 5645 assert(T->isSized() && 5646 "Expected the load/store/recurrence type to be sized"); 5647 5648 ElementTypesInLoop.insert(T); 5649 } 5650 } 5651 } 5652 5653 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5654 unsigned LoopCost) { 5655 // -- The interleave heuristics -- 5656 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5657 // There are many micro-architectural considerations that we can't predict 5658 // at this level. For example, frontend pressure (on decode or fetch) due to 5659 // code size, or the number and capabilities of the execution ports. 5660 // 5661 // We use the following heuristics to select the interleave count: 5662 // 1. If the code has reductions, then we interleave to break the cross 5663 // iteration dependency. 5664 // 2. If the loop is really small, then we interleave to reduce the loop 5665 // overhead. 5666 // 3. We don't interleave if we think that we will spill registers to memory 5667 // due to the increased register pressure. 5668 5669 if (!isScalarEpilogueAllowed()) 5670 return 1; 5671 5672 // We used the distance for the interleave count. 
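  // (The likely rationale: a finite maximum safe dependence distance already
  // constrained the chosen VF, and interleaving would multiply the bytes
  // accessed per vector iteration, potentially exceeding that safe distance.)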
5673 if (Legal->getMaxSafeDepDistBytes() != -1U) 5674 return 1; 5675 5676 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5677 const bool HasReductions = !Legal->getReductionVars().empty(); 5678 // Do not interleave loops with a relatively small known or estimated trip 5679 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5680 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5681 // because with the above conditions interleaving can expose ILP and break 5682 // cross iteration dependences for reductions. 5683 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5684 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5685 return 1; 5686 5687 // If we did not calculate the cost for VF (because the user selected the VF) 5688 // then we calculate the cost of VF here. 5689 if (LoopCost == 0) { 5690 InstructionCost C = expectedCost(VF).first; 5691 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 5692 LoopCost = *C.getValue(); 5693 5694 // Loop body is free and there is no need for interleaving. 5695 if (LoopCost == 0) 5696 return 1; 5697 } 5698 5699 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5700 // We divide by these constants so assume that we have at least one 5701 // instruction that uses at least one register. 5702 for (auto& pair : R.MaxLocalUsers) { 5703 pair.second = std::max(pair.second, 1U); 5704 } 5705 5706 // We calculate the interleave count using the following formula. 5707 // Subtract the number of loop invariants from the number of available 5708 // registers. These registers are used by all of the interleaved instances. 5709 // Next, divide the remaining registers by the number of registers that is 5710 // required by the loop, in order to estimate how many parallel instances 5711 // fit without causing spills. All of this is rounded down if necessary to be 5712 // a power of two. We want power of two interleave count to simplify any 5713 // addressing operations or alignment considerations. 5714 // We also want power of two interleave counts to ensure that the induction 5715 // variable of the vector loop wraps to zero, when tail is folded by masking; 5716 // this currently happens when OptForSize, in which case IC is set to 1 above. 5717 unsigned IC = UINT_MAX; 5718 5719 for (auto& pair : R.MaxLocalUsers) { 5720 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5721 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5722 << " registers of " 5723 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5724 if (VF.isScalar()) { 5725 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5726 TargetNumRegisters = ForceTargetNumScalarRegs; 5727 } else { 5728 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5729 TargetNumRegisters = ForceTargetNumVectorRegs; 5730 } 5731 unsigned MaxLocalUsers = pair.second; 5732 unsigned LoopInvariantRegs = 0; 5733 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5734 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5735 5736 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5737 // Don't count the induction variable as interleaved. 5738 if (EnableIndVarRegisterHeur) { 5739 TmpIC = 5740 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5741 std::max(1U, (MaxLocalUsers - 1))); 5742 } 5743 5744 IC = std::min(IC, TmpIC); 5745 } 5746 5747 // Clamp the interleave ranges to reasonable counts. 
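  // Worked example for the register-based formula above (illustrative numbers
  // only): with 32 registers in a class, 2 loop-invariant values and 6
  // simultaneously live values, the EnableIndVarRegisterHeur variant gives
  // PowerOf2Floor((32 - 2 - 1) / max(1, 6 - 1)) = PowerOf2Floor(5) = 4; the
  // clamping below may reduce this further.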
5748 unsigned MaxInterleaveCount = 5749 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 5750 5751 // Check if the user has overridden the max. 5752 if (VF.isScalar()) { 5753 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5754 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5755 } else { 5756 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5757 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5758 } 5759 5760 // If trip count is known or estimated compile time constant, limit the 5761 // interleave count to be less than the trip count divided by VF, provided it 5762 // is at least 1. 5763 // 5764 // For scalable vectors we can't know if interleaving is beneficial. It may 5765 // not be beneficial for small loops if none of the lanes in the second vector 5766 // iterations is enabled. However, for larger loops, there is likely to be a 5767 // similar benefit as for fixed-width vectors. For now, we choose to leave 5768 // the InterleaveCount as if vscale is '1', although if some information about 5769 // the vector is known (e.g. min vector size), we can make a better decision. 5770 if (BestKnownTC) { 5771 MaxInterleaveCount = 5772 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5773 // Make sure MaxInterleaveCount is greater than 0. 5774 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 5775 } 5776 5777 assert(MaxInterleaveCount > 0 && 5778 "Maximum interleave count must be greater than 0"); 5779 5780 // Clamp the calculated IC to be between the 1 and the max interleave count 5781 // that the target and trip count allows. 5782 if (IC > MaxInterleaveCount) 5783 IC = MaxInterleaveCount; 5784 else 5785 // Make sure IC is greater than 0. 5786 IC = std::max(1u, IC); 5787 5788 assert(IC > 0 && "Interleave count must be greater than 0."); 5789 5790 // Interleave if we vectorized this loop and there is a reduction that could 5791 // benefit from interleaving. 5792 if (VF.isVector() && HasReductions) { 5793 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5794 return IC; 5795 } 5796 5797 // For any scalar loop that either requires runtime checks or predication we 5798 // are better off leaving this to the unroller. Note that if we've already 5799 // vectorized the loop we will have done the runtime check and so interleaving 5800 // won't require further checks. 5801 bool ScalarInterleavingRequiresPredication = 5802 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5803 return Legal->blockNeedsPredication(BB); 5804 })); 5805 bool ScalarInterleavingRequiresRuntimePointerCheck = 5806 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5807 5808 // We want to interleave small loops in order to reduce the loop overhead and 5809 // potentially expose ILP opportunities. 5810 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5811 << "LV: IC is " << IC << '\n' 5812 << "LV: VF is " << VF << '\n'); 5813 const bool AggressivelyInterleaveReductions = 5814 TTI.enableAggressiveInterleaving(HasReductions); 5815 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5816 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5817 // We assume that the cost overhead is 1 and we use the cost model 5818 // to estimate the cost of the loop and interleave until the cost of the 5819 // loop overhead is about 5% of the cost of the loop. 
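    // For instance (illustrative numbers only): with SmallLoopCost = 20 and a
    // LoopCost of 6, PowerOf2Floor(20 / 6) = PowerOf2Floor(3) = 2, so the
    // small-loop interleave count below becomes min(IC, 2).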
5820 unsigned SmallIC = 5821 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5822 5823 // Interleave until store/load ports (estimated by max interleave count) are 5824 // saturated. 5825 unsigned NumStores = Legal->getNumStores(); 5826 unsigned NumLoads = Legal->getNumLoads(); 5827 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5828 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5829 5830 // There is little point in interleaving for reductions containing selects 5831 // and compares when VF=1 since it may just create more overhead than it's 5832 // worth for loops with small trip counts. This is because we still have to 5833 // do the final reduction after the loop. 5834 bool HasSelectCmpReductions = 5835 HasReductions && 5836 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5837 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5838 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 5839 RdxDesc.getRecurrenceKind()); 5840 }); 5841 if (HasSelectCmpReductions) { 5842 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 5843 return 1; 5844 } 5845 5846 // If we have a scalar reduction (vector reductions are already dealt with 5847 // by this point), we can increase the critical path length if the loop 5848 // we're interleaving is inside another loop. For tree-wise reductions 5849 // set the limit to 2, and for ordered reductions it's best to disable 5850 // interleaving entirely. 5851 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5852 bool HasOrderedReductions = 5853 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5854 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5855 return RdxDesc.isOrdered(); 5856 }); 5857 if (HasOrderedReductions) { 5858 LLVM_DEBUG( 5859 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 5860 return 1; 5861 } 5862 5863 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5864 SmallIC = std::min(SmallIC, F); 5865 StoresIC = std::min(StoresIC, F); 5866 LoadsIC = std::min(LoadsIC, F); 5867 } 5868 5869 if (EnableLoadStoreRuntimeInterleave && 5870 std::max(StoresIC, LoadsIC) > SmallIC) { 5871 LLVM_DEBUG( 5872 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5873 return std::max(StoresIC, LoadsIC); 5874 } 5875 5876 // If there are scalar reductions and TTI has enabled aggressive 5877 // interleaving for reductions, we will interleave to expose ILP. 5878 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 5879 AggressivelyInterleaveReductions) { 5880 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5881 // Interleave no less than SmallIC but not as aggressive as the normal IC 5882 // to satisfy the rare situation when resources are too limited. 5883 return std::max(IC / 2, SmallIC); 5884 } else { 5885 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5886 return SmallIC; 5887 } 5888 } 5889 5890 // Interleave if this is a large loop (small loops are already dealt with by 5891 // this point) that could benefit from interleaving. 
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in a topological order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi-map that
  // holds the list of intervals that *end* at a specific location. This
  // multi-map allows us to perform a linear search. We scan the instructions
  // linearly and record each time that a new interval starts, by placing it
  // in a set. If we find this value in the multi-map then we remove it from
  // the set. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
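  // For example (illustrative): if %a and %b are both last used by the same
  // instruction, they share one bucket in TransposeEnds and are removed from
  // OpenIntervals together when the linear scan below reaches that index.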
5970 for (auto &Interval : EndPoint) 5971 TransposeEnds[Interval.second].push_back(Interval.first); 5972 5973 SmallPtrSet<Instruction *, 8> OpenIntervals; 5974 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5975 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5976 5977 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5978 5979 auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned { 5980 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 5981 return 0; 5982 return TTI.getRegUsageForType(VectorType::get(Ty, VF)); 5983 }; 5984 5985 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5986 Instruction *I = IdxToInstr[i]; 5987 5988 // Remove all of the instructions that end at this location. 5989 InstrList &List = TransposeEnds[i]; 5990 for (Instruction *ToRemove : List) 5991 OpenIntervals.erase(ToRemove); 5992 5993 // Ignore instructions that are never used within the loop. 5994 if (!Ends.count(I)) 5995 continue; 5996 5997 // Skip ignored values. 5998 if (ValuesToIgnore.count(I)) 5999 continue; 6000 6001 // For each VF find the maximum usage of registers. 6002 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6003 // Count the number of live intervals. 6004 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6005 6006 if (VFs[j].isScalar()) { 6007 for (auto Inst : OpenIntervals) { 6008 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6009 if (RegUsage.find(ClassID) == RegUsage.end()) 6010 RegUsage[ClassID] = 1; 6011 else 6012 RegUsage[ClassID] += 1; 6013 } 6014 } else { 6015 collectUniformsAndScalars(VFs[j]); 6016 for (auto Inst : OpenIntervals) { 6017 // Skip ignored values for VF > 1. 6018 if (VecValuesToIgnore.count(Inst)) 6019 continue; 6020 if (isScalarAfterVectorization(Inst, VFs[j])) { 6021 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6022 if (RegUsage.find(ClassID) == RegUsage.end()) 6023 RegUsage[ClassID] = 1; 6024 else 6025 RegUsage[ClassID] += 1; 6026 } else { 6027 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6028 if (RegUsage.find(ClassID) == RegUsage.end()) 6029 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6030 else 6031 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6032 } 6033 } 6034 } 6035 6036 for (auto& pair : RegUsage) { 6037 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6038 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6039 else 6040 MaxUsages[j][pair.first] = pair.second; 6041 } 6042 } 6043 6044 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6045 << OpenIntervals.size() << '\n'); 6046 6047 // Add the current instruction to the list of open intervals. 6048 OpenIntervals.insert(I); 6049 } 6050 6051 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6052 SmallMapVector<unsigned, unsigned, 4> Invariant; 6053 6054 for (auto Inst : LoopInvariants) { 6055 unsigned Usage = 6056 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6057 unsigned ClassID = 6058 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6059 if (Invariant.find(ClassID) == Invariant.end()) 6060 Invariant[ClassID] = Usage; 6061 else 6062 Invariant[ClassID] += Usage; 6063 } 6064 6065 LLVM_DEBUG({ 6066 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6067 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6068 << " item\n"; 6069 for (const auto &pair : MaxUsages[i]) { 6070 dbgs() << "LV(REG): RegisterClass: " 6071 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6072 << " registers\n"; 6073 } 6074 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6075 << " item\n"; 6076 for (const auto &pair : Invariant) { 6077 dbgs() << "LV(REG): RegisterClass: " 6078 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6079 << " registers\n"; 6080 } 6081 }); 6082 6083 RU.LoopInvariantRegs = Invariant; 6084 RU.MaxLocalUsers = MaxUsages[i]; 6085 RUs[i] = RU; 6086 } 6087 6088 return RUs; 6089 } 6090 6091 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 6092 ElementCount VF) { 6093 // TODO: Cost model for emulated masked load/store is completely 6094 // broken. This hack guides the cost model to use an artificially 6095 // high enough value to practically disable vectorization with such 6096 // operations, except where previously deployed legality hack allowed 6097 // using very low cost values. This is to avoid regressions coming simply 6098 // from moving "masked load/store" check from legality to cost model. 6099 // Masked Load/Gather emulation was previously never allowed. 6100 // Limited number of Masked Store/Scatter emulation was allowed. 6101 assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction"); 6102 return isa<LoadInst>(I) || 6103 (isa<StoreInst>(I) && 6104 NumPredStores > NumberOfStoresToPredicate); 6105 } 6106 6107 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6108 // If we aren't vectorizing the loop, or if we've already collected the 6109 // instructions to scalarize, there's nothing to do. Collection may already 6110 // have occurred if we have a user-selected VF and are now computing the 6111 // expected cost for interleaving. 6112 if (VF.isScalar() || VF.isZero() || 6113 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6114 return; 6115 6116 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6117 // not profitable to scalarize any instructions, the presence of VF in the 6118 // map will indicate that we've analyzed it already. 6119 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6120 6121 // Find all the instructions that are scalar with predication in the loop and 6122 // determine if it would be better to not if-convert the blocks they are in. 6123 // If so, we also record the instructions to scalarize. 6124 for (BasicBlock *BB : TheLoop->blocks()) { 6125 if (!blockNeedsPredicationForAnyReason(BB)) 6126 continue; 6127 for (Instruction &I : *BB) 6128 if (isScalarWithPredication(&I, VF)) { 6129 ScalarCostsTy ScalarCosts; 6130 // Do not apply discount if scalable, because that would lead to 6131 // invalid scalarization costs. 6132 // Do not apply discount logic if hacked cost is needed 6133 // for emulated masked memrefs. 
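        // Note: computePredInstDiscount (defined below) evaluates the whole
        // single-use chain feeding the predicated instruction; a non-negative
        // result means the scalarized chain is expected to cost no more than
        // its vectorized form, so the scalar costs it collected are recorded.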
6134 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && 6135 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6136 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6137 // Remember that BB will remain after vectorization. 6138 PredicatedBBsAfterVectorization.insert(BB); 6139 } 6140 } 6141 } 6142 6143 int LoopVectorizationCostModel::computePredInstDiscount( 6144 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6145 assert(!isUniformAfterVectorization(PredInst, VF) && 6146 "Instruction marked uniform-after-vectorization will be predicated"); 6147 6148 // Initialize the discount to zero, meaning that the scalar version and the 6149 // vector version cost the same. 6150 InstructionCost Discount = 0; 6151 6152 // Holds instructions to analyze. The instructions we visit are mapped in 6153 // ScalarCosts. Those instructions are the ones that would be scalarized if 6154 // we find that the scalar version costs less. 6155 SmallVector<Instruction *, 8> Worklist; 6156 6157 // Returns true if the given instruction can be scalarized. 6158 auto canBeScalarized = [&](Instruction *I) -> bool { 6159 // We only attempt to scalarize instructions forming a single-use chain 6160 // from the original predicated block that would otherwise be vectorized. 6161 // Although not strictly necessary, we give up on instructions we know will 6162 // already be scalar to avoid traversing chains that are unlikely to be 6163 // beneficial. 6164 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6165 isScalarAfterVectorization(I, VF)) 6166 return false; 6167 6168 // If the instruction is scalar with predication, it will be analyzed 6169 // separately. We ignore it within the context of PredInst. 6170 if (isScalarWithPredication(I, VF)) 6171 return false; 6172 6173 // If any of the instruction's operands are uniform after vectorization, 6174 // the instruction cannot be scalarized. This prevents, for example, a 6175 // masked load from being scalarized. 6176 // 6177 // We assume we will only emit a value for lane zero of an instruction 6178 // marked uniform after vectorization, rather than VF identical values. 6179 // Thus, if we scalarize an instruction that uses a uniform, we would 6180 // create uses of values corresponding to the lanes we aren't emitting code 6181 // for. This behavior can be changed by allowing getScalarValue to clone 6182 // the lane zero values for uniforms rather than asserting. 6183 for (Use &U : I->operands()) 6184 if (auto *J = dyn_cast<Instruction>(U.get())) 6185 if (isUniformAfterVectorization(J, VF)) 6186 return false; 6187 6188 // Otherwise, we can scalarize the instruction. 6189 return true; 6190 }; 6191 6192 // Compute the expected cost discount from scalarizing the entire expression 6193 // feeding the predicated instruction. We currently only consider expressions 6194 // that are single-use instruction chains. 6195 Worklist.push_back(PredInst); 6196 while (!Worklist.empty()) { 6197 Instruction *I = Worklist.pop_back_val(); 6198 6199 // If we've already analyzed the instruction, there's nothing to do. 6200 if (ScalarCosts.find(I) != ScalarCosts.end()) 6201 continue; 6202 6203 // Compute the cost of the vector instruction. Note that this cost already 6204 // includes the scalarization overhead of the predicated instruction. 6205 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6206 6207 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6208 // the instruction as if it wasn't if-converted and instead remained in the 6209 // predicated block. We will scale this cost by block probability after 6210 // computing the scalarization overhead. 6211 InstructionCost ScalarCost = 6212 VF.getFixedValue() * 6213 getInstructionCost(I, ElementCount::getFixed(1)).first; 6214 6215 // Compute the scalarization overhead of needed insertelement instructions 6216 // and phi nodes. 6217 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6218 ScalarCost += TTI.getScalarizationOverhead( 6219 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6220 APInt::getAllOnes(VF.getFixedValue()), true, false); 6221 ScalarCost += 6222 VF.getFixedValue() * 6223 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6224 } 6225 6226 // Compute the scalarization overhead of needed extractelement 6227 // instructions. For each of the instruction's operands, if the operand can 6228 // be scalarized, add it to the worklist; otherwise, account for the 6229 // overhead. 6230 for (Use &U : I->operands()) 6231 if (auto *J = dyn_cast<Instruction>(U.get())) { 6232 assert(VectorType::isValidElementType(J->getType()) && 6233 "Instruction has non-scalar type"); 6234 if (canBeScalarized(J)) 6235 Worklist.push_back(J); 6236 else if (needsExtract(J, VF)) { 6237 ScalarCost += TTI.getScalarizationOverhead( 6238 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6239 APInt::getAllOnes(VF.getFixedValue()), false, true); 6240 } 6241 } 6242 6243 // Scale the total scalar cost by block probability. 6244 ScalarCost /= getReciprocalPredBlockProb(); 6245 6246 // Compute the discount. A non-negative discount means the vector version 6247 // of the instruction costs more, and scalarizing would be beneficial. 6248 Discount += VectorCost - ScalarCost; 6249 ScalarCosts[I] = ScalarCost; 6250 } 6251 6252 return *Discount.getValue(); 6253 } 6254 6255 LoopVectorizationCostModel::VectorizationCostTy 6256 LoopVectorizationCostModel::expectedCost( 6257 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6258 VectorizationCostTy Cost; 6259 6260 // For each block. 6261 for (BasicBlock *BB : TheLoop->blocks()) { 6262 VectorizationCostTy BlockCost; 6263 6264 // For each instruction in the old loop. 6265 for (Instruction &I : BB->instructionsWithoutDebug()) { 6266 // Skip ignored values. 6267 if (ValuesToIgnore.count(&I) || 6268 (VF.isVector() && VecValuesToIgnore.count(&I))) 6269 continue; 6270 6271 VectorizationCostTy C = getInstructionCost(&I, VF); 6272 6273 // Check if we should override the cost. 6274 if (C.first.isValid() && 6275 ForceTargetInstructionCost.getNumOccurrences() > 0) 6276 C.first = InstructionCost(ForceTargetInstructionCost); 6277 6278 // Keep a list of instructions with invalid costs. 6279 if (Invalid && !C.first.isValid()) 6280 Invalid->emplace_back(&I, VF); 6281 6282 BlockCost.first += C.first; 6283 BlockCost.second |= C.second; 6284 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6285 << " for VF " << VF << " For instruction: " << I 6286 << '\n'); 6287 } 6288 6289 // If we are vectorizing a predicated block, it will have been 6290 // if-converted. This means that the block's instructions (aside from 6291 // stores and instructions that may divide by zero) will now be 6292 // unconditionally executed. For the scalar case, we may not always execute 6293 // the predicated block, if it is an if-else block. Thus, scale the block's 6294 // cost by the probability of executing it. 
blockNeedsPredication from 6295 // Legal is used so as to not include all blocks in tail folded loops. 6296 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6297 BlockCost.first /= getReciprocalPredBlockProb(); 6298 6299 Cost.first += BlockCost.first; 6300 Cost.second |= BlockCost.second; 6301 } 6302 6303 return Cost; 6304 } 6305 6306 /// Gets Address Access SCEV after verifying that the access pattern 6307 /// is loop invariant except the induction variable dependence. 6308 /// 6309 /// This SCEV can be sent to the Target in order to estimate the address 6310 /// calculation cost. 6311 static const SCEV *getAddressAccessSCEV( 6312 Value *Ptr, 6313 LoopVectorizationLegality *Legal, 6314 PredicatedScalarEvolution &PSE, 6315 const Loop *TheLoop) { 6316 6317 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6318 if (!Gep) 6319 return nullptr; 6320 6321 // We are looking for a gep with all loop invariant indices except for one 6322 // which should be an induction variable. 6323 auto SE = PSE.getSE(); 6324 unsigned NumOperands = Gep->getNumOperands(); 6325 for (unsigned i = 1; i < NumOperands; ++i) { 6326 Value *Opd = Gep->getOperand(i); 6327 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6328 !Legal->isInductionVariable(Opd)) 6329 return nullptr; 6330 } 6331 6332 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6333 return PSE.getSCEV(Ptr); 6334 } 6335 6336 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6337 return Legal->hasStride(I->getOperand(0)) || 6338 Legal->hasStride(I->getOperand(1)); 6339 } 6340 6341 InstructionCost 6342 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6343 ElementCount VF) { 6344 assert(VF.isVector() && 6345 "Scalarization cost of instruction implies vectorization."); 6346 if (VF.isScalable()) 6347 return InstructionCost::getInvalid(); 6348 6349 Type *ValTy = getLoadStoreType(I); 6350 auto SE = PSE.getSE(); 6351 6352 unsigned AS = getLoadStoreAddressSpace(I); 6353 Value *Ptr = getLoadStorePointerOperand(I); 6354 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6355 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6356 // that it is being called from this specific place. 6357 6358 // Figure out whether the access is strided and get the stride value 6359 // if it's known in compile time 6360 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6361 6362 // Get the cost of the scalar memory instruction and address computation. 6363 InstructionCost Cost = 6364 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6365 6366 // Don't pass *I here, since it is scalar but will actually be part of a 6367 // vectorized loop where the user of it is a vectorized instruction. 6368 const Align Alignment = getLoadStoreAlignment(I); 6369 Cost += VF.getKnownMinValue() * 6370 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6371 AS, TTI::TCK_RecipThroughput); 6372 6373 // Get the overhead of the extractelement and insertelement instructions 6374 // we might create due to scalarization. 6375 Cost += getScalarizationOverhead(I, VF); 6376 6377 // If we have a predicated load/store, it will need extra i1 extracts and 6378 // conditional branches, but may not be executed for each vector lane. Scale 6379 // the cost by the probability of executing the predicated block. 
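  // For example, if getReciprocalPredBlockProb() returns 2 (i.e. the
  // predicated block is assumed to execute on roughly half of the
  // iterations), the scalarization cost computed above is halved before the
  // i1 extract and branch overhead below is added.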
6380 if (isPredicatedInst(I, VF)) { 6381 Cost /= getReciprocalPredBlockProb(); 6382 6383 // Add the cost of an i1 extract and a branch 6384 auto *Vec_i1Ty = 6385 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6386 Cost += TTI.getScalarizationOverhead( 6387 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6388 /*Insert=*/false, /*Extract=*/true); 6389 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6390 6391 if (useEmulatedMaskMemRefHack(I, VF)) 6392 // Artificially setting to a high enough value to practically disable 6393 // vectorization with such operations. 6394 Cost = 3000000; 6395 } 6396 6397 return Cost; 6398 } 6399 6400 InstructionCost 6401 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6402 ElementCount VF) { 6403 Type *ValTy = getLoadStoreType(I); 6404 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6405 Value *Ptr = getLoadStorePointerOperand(I); 6406 unsigned AS = getLoadStoreAddressSpace(I); 6407 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6408 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6409 6410 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6411 "Stride should be 1 or -1 for consecutive memory access"); 6412 const Align Alignment = getLoadStoreAlignment(I); 6413 InstructionCost Cost = 0; 6414 if (Legal->isMaskRequired(I)) 6415 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6416 CostKind); 6417 else 6418 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6419 CostKind, I); 6420 6421 bool Reverse = ConsecutiveStride < 0; 6422 if (Reverse) 6423 Cost += 6424 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6425 return Cost; 6426 } 6427 6428 InstructionCost 6429 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6430 ElementCount VF) { 6431 assert(Legal->isUniformMemOp(*I)); 6432 6433 Type *ValTy = getLoadStoreType(I); 6434 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6435 const Align Alignment = getLoadStoreAlignment(I); 6436 unsigned AS = getLoadStoreAddressSpace(I); 6437 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6438 if (isa<LoadInst>(I)) { 6439 return TTI.getAddressComputationCost(ValTy) + 6440 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6441 CostKind) + 6442 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6443 } 6444 StoreInst *SI = cast<StoreInst>(I); 6445 6446 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6447 return TTI.getAddressComputationCost(ValTy) + 6448 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6449 CostKind) + 6450 (isLoopInvariantStoreValue 6451 ? 
0 6452 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6453 VF.getKnownMinValue() - 1)); 6454 } 6455 6456 InstructionCost 6457 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6458 ElementCount VF) { 6459 Type *ValTy = getLoadStoreType(I); 6460 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6461 const Align Alignment = getLoadStoreAlignment(I); 6462 const Value *Ptr = getLoadStorePointerOperand(I); 6463 6464 return TTI.getAddressComputationCost(VectorTy) + 6465 TTI.getGatherScatterOpCost( 6466 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6467 TargetTransformInfo::TCK_RecipThroughput, I); 6468 } 6469 6470 InstructionCost 6471 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6472 ElementCount VF) { 6473 // TODO: Once we have support for interleaving with scalable vectors 6474 // we can calculate the cost properly here. 6475 if (VF.isScalable()) 6476 return InstructionCost::getInvalid(); 6477 6478 Type *ValTy = getLoadStoreType(I); 6479 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6480 unsigned AS = getLoadStoreAddressSpace(I); 6481 6482 auto Group = getInterleavedAccessGroup(I); 6483 assert(Group && "Fail to get an interleaved access group."); 6484 6485 unsigned InterleaveFactor = Group->getFactor(); 6486 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6487 6488 // Holds the indices of existing members in the interleaved group. 6489 SmallVector<unsigned, 4> Indices; 6490 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6491 if (Group->getMember(IF)) 6492 Indices.push_back(IF); 6493 6494 // Calculate the cost of the whole interleaved group. 6495 bool UseMaskForGaps = 6496 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6497 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6498 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6499 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6500 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6501 6502 if (Group->isReverse()) { 6503 // TODO: Add support for reversed masked interleaved access. 6504 assert(!Legal->isMaskRequired(I) && 6505 "Reverse masked interleaved access not supported."); 6506 Cost += 6507 Group->getNumMembers() * 6508 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6509 } 6510 return Cost; 6511 } 6512 6513 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( 6514 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6515 using namespace llvm::PatternMatch; 6516 // Early exit for no inloop reductions 6517 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6518 return None; 6519 auto *VectorTy = cast<VectorType>(Ty); 6520 6521 // We are looking for a pattern of, and finding the minimal acceptable cost: 6522 // reduce(mul(ext(A), ext(B))) or 6523 // reduce(mul(A, B)) or 6524 // reduce(ext(A)) or 6525 // reduce(A). 6526 // The basic idea is that we walk down the tree to do that, finding the root 6527 // reduction instruction in InLoopReductionImmediateChains. From there we find 6528 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6529 // of the components. If the reduction cost is lower then we return it for the 6530 // reduction instruction and 0 for the other instructions in the pattern. If 6531 // it is not we return an invalid cost specifying the orignal cost method 6532 // should be used. 
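  // Illustrative scalar-loop IR for the first pattern above (value names are
  // examples only):
  //   %a.ext = sext i16 %a to i32
  //   %b.ext = sext i16 %b to i32
  //   %mul   = mul i32 %a.ext, %b.ext
  //   %sum   = add i32 %mul, %sum.phi   ; the in-loop reduction add
  // If the target reports a cheaper extended-add reduction cost for the whole
  // chain, that cost is attributed to the reduction instruction and the
  // mul/ext instructions are costed as 0.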
6533 Instruction *RetI = I; 6534 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 6535 if (!RetI->hasOneUser()) 6536 return None; 6537 RetI = RetI->user_back(); 6538 } 6539 if (match(RetI, m_Mul(m_Value(), m_Value())) && 6540 RetI->user_back()->getOpcode() == Instruction::Add) { 6541 if (!RetI->hasOneUser()) 6542 return None; 6543 RetI = RetI->user_back(); 6544 } 6545 6546 // Test if the found instruction is a reduction, and if not return an invalid 6547 // cost specifying the parent to use the original cost modelling. 6548 if (!InLoopReductionImmediateChains.count(RetI)) 6549 return None; 6550 6551 // Find the reduction this chain is a part of and calculate the basic cost of 6552 // the reduction on its own. 6553 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6554 Instruction *ReductionPhi = LastChain; 6555 while (!isa<PHINode>(ReductionPhi)) 6556 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6557 6558 const RecurrenceDescriptor &RdxDesc = 6559 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6560 6561 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6562 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6563 6564 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6565 // normal fmul instruction to the cost of the fadd reduction. 6566 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6567 BaseCost += 6568 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6569 6570 // If we're using ordered reductions then we can just return the base cost 6571 // here, since getArithmeticReductionCost calculates the full ordered 6572 // reduction cost when FP reassociation is not allowed. 6573 if (useOrderedReductions(RdxDesc)) 6574 return BaseCost; 6575 6576 // Get the operand that was not the reduction chain and match it to one of the 6577 // patterns, returning the better cost if it is found. 6578 Instruction *RedOp = RetI->getOperand(1) == LastChain 6579 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6580 : dyn_cast<Instruction>(RetI->getOperand(1)); 6581 6582 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6583 6584 Instruction *Op0, *Op1; 6585 if (RedOp && 6586 match(RedOp, 6587 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6588 match(Op0, m_ZExtOrSExt(m_Value())) && 6589 Op0->getOpcode() == Op1->getOpcode() && 6590 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6591 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6592 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6593 6594 // Matched reduce(ext(mul(ext(A), ext(B))) 6595 // Note that the extend opcodes need to all match, or if A==B they will have 6596 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6597 // which is equally fine. 
6598 bool IsUnsigned = isa<ZExtInst>(Op0); 6599 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6600 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6601 6602 InstructionCost ExtCost = 6603 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6604 TTI::CastContextHint::None, CostKind, Op0); 6605 InstructionCost MulCost = 6606 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6607 InstructionCost Ext2Cost = 6608 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6609 TTI::CastContextHint::None, CostKind, RedOp); 6610 6611 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6612 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6613 CostKind); 6614 6615 if (RedCost.isValid() && 6616 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6617 return I == RetI ? RedCost : 0; 6618 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6619 !TheLoop->isLoopInvariant(RedOp)) { 6620 // Matched reduce(ext(A)) 6621 bool IsUnsigned = isa<ZExtInst>(RedOp); 6622 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6623 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6624 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6625 CostKind); 6626 6627 InstructionCost ExtCost = 6628 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6629 TTI::CastContextHint::None, CostKind, RedOp); 6630 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6631 return I == RetI ? RedCost : 0; 6632 } else if (RedOp && 6633 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6634 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6635 Op0->getOpcode() == Op1->getOpcode() && 6636 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6637 bool IsUnsigned = isa<ZExtInst>(Op0); 6638 Type *Op0Ty = Op0->getOperand(0)->getType(); 6639 Type *Op1Ty = Op1->getOperand(0)->getType(); 6640 Type *LargestOpTy = 6641 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6642 : Op0Ty; 6643 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6644 6645 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 6646 // different sizes. We take the largest type as the ext to reduce, and add 6647 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6648 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6649 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6650 TTI::CastContextHint::None, CostKind, Op0); 6651 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6652 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6653 TTI::CastContextHint::None, CostKind, Op1); 6654 InstructionCost MulCost = 6655 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6656 6657 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6658 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6659 CostKind); 6660 InstructionCost ExtraExtCost = 0; 6661 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6662 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6663 ExtraExtCost = TTI.getCastInstrCost( 6664 ExtraExtOp->getOpcode(), ExtType, 6665 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6666 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6667 } 6668 6669 if (RedCost.isValid() && 6670 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6671 return I == RetI ? 
RedCost : 0; 6672 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6673 // Matched reduce(mul()) 6674 InstructionCost MulCost = 6675 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6676 6677 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6678 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 6679 CostKind); 6680 6681 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6682 return I == RetI ? RedCost : 0; 6683 } 6684 } 6685 6686 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 6687 } 6688 6689 InstructionCost 6690 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6691 ElementCount VF) { 6692 // Calculate scalar cost only. Vectorization cost should be ready at this 6693 // moment. 6694 if (VF.isScalar()) { 6695 Type *ValTy = getLoadStoreType(I); 6696 const Align Alignment = getLoadStoreAlignment(I); 6697 unsigned AS = getLoadStoreAddressSpace(I); 6698 6699 return TTI.getAddressComputationCost(ValTy) + 6700 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6701 TTI::TCK_RecipThroughput, I); 6702 } 6703 return getWideningCost(I, VF); 6704 } 6705 6706 LoopVectorizationCostModel::VectorizationCostTy 6707 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6708 ElementCount VF) { 6709 // If we know that this instruction will remain uniform, check the cost of 6710 // the scalar version. 6711 if (isUniformAfterVectorization(I, VF)) 6712 VF = ElementCount::getFixed(1); 6713 6714 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6715 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6716 6717 // Forced scalars do not have any scalarization overhead. 6718 auto ForcedScalar = ForcedScalars.find(VF); 6719 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6720 auto InstSet = ForcedScalar->second; 6721 if (InstSet.count(I)) 6722 return VectorizationCostTy( 6723 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6724 VF.getKnownMinValue()), 6725 false); 6726 } 6727 6728 Type *VectorTy; 6729 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6730 6731 bool TypeNotScalarized = false; 6732 if (VF.isVector() && VectorTy->isVectorTy()) { 6733 unsigned NumParts = TTI.getNumberOfParts(VectorTy); 6734 if (NumParts) 6735 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 6736 else 6737 C = InstructionCost::getInvalid(); 6738 } 6739 return VectorizationCostTy(C, TypeNotScalarized); 6740 } 6741 6742 InstructionCost 6743 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6744 ElementCount VF) const { 6745 6746 // There is no mechanism yet to create a scalable scalarization loop, 6747 // so this is currently Invalid. 6748 if (VF.isScalable()) 6749 return InstructionCost::getInvalid(); 6750 6751 if (VF.isScalar()) 6752 return 0; 6753 6754 InstructionCost Cost = 0; 6755 Type *RetTy = ToVectorTy(I->getType(), VF); 6756 if (!RetTy->isVoidTy() && 6757 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6758 Cost += TTI.getScalarizationOverhead( 6759 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 6760 false); 6761 6762 // Some targets keep addresses scalar. 6763 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6764 return Cost; 6765 6766 // Some targets support efficient element stores. 6767 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6768 return Cost; 6769 6770 // Collect operands to consider. 6771 CallInst *CI = dyn_cast<CallInst>(I); 6772 Instruction::op_range Ops = CI ? 
CI->args() : I->operands(); 6773 6774 // Skip operands that do not require extraction/scalarization and do not incur 6775 // any overhead. 6776 SmallVector<Type *> Tys; 6777 for (auto *V : filterExtractingOperands(Ops, VF)) 6778 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 6779 return Cost + TTI.getOperandsScalarizationOverhead( 6780 filterExtractingOperands(Ops, VF), Tys); 6781 } 6782 6783 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6784 if (VF.isScalar()) 6785 return; 6786 NumPredStores = 0; 6787 for (BasicBlock *BB : TheLoop->blocks()) { 6788 // For each instruction in the old loop. 6789 for (Instruction &I : *BB) { 6790 Value *Ptr = getLoadStorePointerOperand(&I); 6791 if (!Ptr) 6792 continue; 6793 6794 // TODO: We should generate better code and update the cost model for 6795 // predicated uniform stores. Today they are treated as any other 6796 // predicated store (see added test cases in 6797 // invariant-store-vectorization.ll). 6798 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 6799 NumPredStores++; 6800 6801 if (Legal->isUniformMemOp(I)) { 6802 // TODO: Avoid replicating loads and stores instead of 6803 // relying on instcombine to remove them. 6804 // Load: Scalar load + broadcast 6805 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6806 InstructionCost Cost; 6807 if (isa<StoreInst>(&I) && VF.isScalable() && 6808 isLegalGatherOrScatter(&I, VF)) { 6809 Cost = getGatherScatterCost(&I, VF); 6810 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 6811 } else { 6812 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 6813 "Cannot yet scalarize uniform stores"); 6814 Cost = getUniformMemOpCost(&I, VF); 6815 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6816 } 6817 continue; 6818 } 6819 6820 // We assume that widening is the best solution when possible. 6821 if (memoryInstructionCanBeWidened(&I, VF)) { 6822 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6823 int ConsecutiveStride = Legal->isConsecutivePtr( 6824 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 6825 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6826 "Expected consecutive stride."); 6827 InstWidening Decision = 6828 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6829 setWideningDecision(&I, VF, Decision, Cost); 6830 continue; 6831 } 6832 6833 // Choose between Interleaving, Gather/Scatter or Scalarization. 6834 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 6835 unsigned NumAccesses = 1; 6836 if (isAccessInterleaved(&I)) { 6837 auto Group = getInterleavedAccessGroup(&I); 6838 assert(Group && "Fail to get an interleaved access group."); 6839 6840 // Make one decision for the whole group. 6841 if (getWideningDecision(&I, VF) != CM_Unknown) 6842 continue; 6843 6844 NumAccesses = Group->getNumMembers(); 6845 if (interleavedAccessCanBeWidened(&I, VF)) 6846 InterleaveCost = getInterleaveGroupCost(&I, VF); 6847 } 6848 6849 InstructionCost GatherScatterCost = 6850 isLegalGatherOrScatter(&I, VF) 6851 ? getGatherScatterCost(&I, VF) * NumAccesses 6852 : InstructionCost::getInvalid(); 6853 6854 InstructionCost ScalarizationCost = 6855 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6856 6857 // Choose better solution for the current VF, 6858 // write down this decision and use it during vectorization. 
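      // The comparisons below prefer interleaving when it is no more expensive
      // than gather/scatter and strictly cheaper than scalarization, then
      // gather/scatter when it is strictly cheaper than scalarization, and
      // fall back to scalarization otherwise. Costs left invalid (e.g. an
      // interleave group that cannot be widened) compare as more expensive
      // than any valid cost, so such options are never chosen.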
6859 InstructionCost Cost;
6860 InstWidening Decision;
6861 if (InterleaveCost <= GatherScatterCost &&
6862 InterleaveCost < ScalarizationCost) {
6863 Decision = CM_Interleave;
6864 Cost = InterleaveCost;
6865 } else if (GatherScatterCost < ScalarizationCost) {
6866 Decision = CM_GatherScatter;
6867 Cost = GatherScatterCost;
6868 } else {
6869 Decision = CM_Scalarize;
6870 Cost = ScalarizationCost;
6871 }
6872 // If the instruction belongs to an interleave group, the whole group
6873 // receives the same decision. The whole group receives the cost, but
6874 // the cost will actually be assigned to one instruction.
6875 if (auto Group = getInterleavedAccessGroup(&I))
6876 setWideningDecision(Group, VF, Decision, Cost);
6877 else
6878 setWideningDecision(&I, VF, Decision, Cost);
6879 }
6880 }
6881
6882 // Make sure that any load of address and any other address computation
6883 // remains scalar unless there is gather/scatter support. This avoids
6884 // inevitable extracts into address registers, and also has the benefit of
6885 // activating LSR more, since that pass can't optimize vectorized
6886 // addresses.
6887 if (TTI.prefersVectorizedAddressing())
6888 return;
6889
6890 // Start with all scalar pointer uses.
6891 SmallPtrSet<Instruction *, 8> AddrDefs;
6892 for (BasicBlock *BB : TheLoop->blocks())
6893 for (Instruction &I : *BB) {
6894 Instruction *PtrDef =
6895 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6896 if (PtrDef && TheLoop->contains(PtrDef) &&
6897 getWideningDecision(&I, VF) != CM_GatherScatter)
6898 AddrDefs.insert(PtrDef);
6899 }
6900
6901 // Add all instructions used to generate the addresses.
6902 SmallVector<Instruction *, 4> Worklist;
6903 append_range(Worklist, AddrDefs);
6904 while (!Worklist.empty()) {
6905 Instruction *I = Worklist.pop_back_val();
6906 for (auto &Op : I->operands())
6907 if (auto *InstOp = dyn_cast<Instruction>(Op))
6908 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6909 AddrDefs.insert(InstOp).second)
6910 Worklist.push_back(InstOp);
6911 }
6912
6913 for (auto *I : AddrDefs) {
6914 if (isa<LoadInst>(I)) {
6915 // Setting the desired widening decision should ideally be handled by
6916 // the cost functions, but since this involves the task of finding out
6917 // if the loaded register is involved in an address computation, it is
6918 // instead changed here when we know this is the case.
6919 InstWidening Decision = getWideningDecision(I, VF);
6920 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6921 // Scalarize a widened load of address.
6922 setWideningDecision(
6923 I, VF, CM_Scalarize,
6924 (VF.getKnownMinValue() *
6925 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6926 else if (auto Group = getInterleavedAccessGroup(I)) {
6927 // Scalarize an interleave group of address loads.
6928 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6929 if (Instruction *Member = Group->getMember(I))
6930 setWideningDecision(
6931 Member, VF, CM_Scalarize,
6932 (VF.getKnownMinValue() *
6933 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6934 }
6935 }
6936 } else
6937 // Make sure I gets scalarized and is given a cost estimate without
6938 // scalarization overhead.
6939 ForcedScalars[VF].insert(I);
6940 }
6941 }
6942
6943 InstructionCost
6944 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6945 Type *&VectorTy) {
6946 Type *RetTy = I->getType();
6947 if (canTruncateToMinimalBitwidth(I, VF))
6948 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6949 auto SE = PSE.getSE();
6950 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6951
6952 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6953 ElementCount VF) -> bool {
6954 if (VF.isScalar())
6955 return true;
6956
6957 auto Scalarized = InstsToScalarize.find(VF);
6958 assert(Scalarized != InstsToScalarize.end() &&
6959 "VF not yet analyzed for scalarization profitability");
6960 return !Scalarized->second.count(I) &&
6961 llvm::all_of(I->users(), [&](User *U) {
6962 auto *UI = cast<Instruction>(U);
6963 return !Scalarized->second.count(UI);
6964 });
6965 };
6966 (void) hasSingleCopyAfterVectorization;
6967
6968 if (isScalarAfterVectorization(I, VF)) {
6969 // With the exception of GEPs and PHIs, after scalarization there should
6970 // only be one copy of the instruction generated in the loop. This is
6971 // because the VF is either 1, or any instructions that need scalarizing
6972 // have already been dealt with by the time we get here. As a result,
6973 // we don't have to multiply the instruction cost by VF.
6974 assert(I->getOpcode() == Instruction::GetElementPtr ||
6975 I->getOpcode() == Instruction::PHI ||
6976 (I->getOpcode() == Instruction::BitCast &&
6977 I->getType()->isPointerTy()) ||
6978 hasSingleCopyAfterVectorization(I, VF));
6979 VectorTy = RetTy;
6980 } else
6981 VectorTy = ToVectorTy(RetTy, VF);
6982
6983 // TODO: We need to estimate the cost of intrinsic calls.
6984 switch (I->getOpcode()) {
6985 case Instruction::GetElementPtr:
6986 // We mark this instruction as zero-cost because the cost of GEPs in
6987 // vectorized code depends on whether the corresponding memory instruction
6988 // is scalarized or not. Therefore, we handle GEPs with the memory
6989 // instruction cost.
6990 return 0;
6991 case Instruction::Br: {
6992 // In cases of scalarized and predicated instructions, there will be VF
6993 // predicated blocks in the vectorized loop. Each branch around these
6994 // blocks also requires an extract of its vector compare i1 element.
6995 bool ScalarPredicatedBB = false;
6996 BranchInst *BI = cast<BranchInst>(I);
6997 if (VF.isVector() && BI->isConditional() &&
6998 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6999 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7000 ScalarPredicatedBB = true;
7001
7002 if (ScalarPredicatedBB) {
7003 // Not possible to scalarize a scalable vector with predicated instructions.
7004 if (VF.isScalable())
7005 return InstructionCost::getInvalid();
7006 // Return cost for branches around scalarized and predicated blocks.
7007 auto *Vec_i1Ty =
7008 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7009 return (
7010 TTI.getScalarizationOverhead(
7011 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
7012 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7013 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7014 // The back-edge branch will remain, as will all scalar branches.
7015 return TTI.getCFInstrCost(Instruction::Br, CostKind);
7016 else
7017 // This branch will be eliminated by if-conversion.
7018 return 0; 7019 // Note: We currently assume zero cost for an unconditional branch inside 7020 // a predicated block since it will become a fall-through, although we 7021 // may decide in the future to call TTI for all branches. 7022 } 7023 case Instruction::PHI: { 7024 auto *Phi = cast<PHINode>(I); 7025 7026 // First-order recurrences are replaced by vector shuffles inside the loop. 7027 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7028 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7029 return TTI.getShuffleCost( 7030 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7031 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7032 7033 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7034 // converted into select instructions. We require N - 1 selects per phi 7035 // node, where N is the number of incoming values. 7036 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7037 return (Phi->getNumIncomingValues() - 1) * 7038 TTI.getCmpSelInstrCost( 7039 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7040 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7041 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7042 7043 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7044 } 7045 case Instruction::UDiv: 7046 case Instruction::SDiv: 7047 case Instruction::URem: 7048 case Instruction::SRem: 7049 // If we have a predicated instruction, it may not be executed for each 7050 // vector lane. Get the scalarization cost and scale this amount by the 7051 // probability of executing the predicated block. If the instruction is not 7052 // predicated, we fall through to the next case. 7053 if (VF.isVector() && isScalarWithPredication(I, VF)) { 7054 InstructionCost Cost = 0; 7055 7056 // These instructions have a non-void type, so account for the phi nodes 7057 // that we will create. This cost is likely to be zero. The phi node 7058 // cost, if any, should be scaled by the block probability because it 7059 // models a copy at the end of each predicated block. 7060 Cost += VF.getKnownMinValue() * 7061 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7062 7063 // The cost of the non-predicated instruction. 7064 Cost += VF.getKnownMinValue() * 7065 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7066 7067 // The cost of insertelement and extractelement instructions needed for 7068 // scalarization. 7069 Cost += getScalarizationOverhead(I, VF); 7070 7071 // Scale the cost by the probability of executing the predicated blocks. 7072 // This assumes the predicated block for each vector lane is equally 7073 // likely. 7074 return Cost / getReciprocalPredBlockProb(); 7075 } 7076 LLVM_FALLTHROUGH; 7077 case Instruction::Add: 7078 case Instruction::FAdd: 7079 case Instruction::Sub: 7080 case Instruction::FSub: 7081 case Instruction::Mul: 7082 case Instruction::FMul: 7083 case Instruction::FDiv: 7084 case Instruction::FRem: 7085 case Instruction::Shl: 7086 case Instruction::LShr: 7087 case Instruction::AShr: 7088 case Instruction::And: 7089 case Instruction::Or: 7090 case Instruction::Xor: { 7091 // Since we will replace the stride by 1 the multiplication should go away. 
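// Illustrative example (not taken from a test): for a loop indexing A[i * Stride]
// that gets versioned on the runtime predicate Stride == 1, the 'mul' feeding the
// address computation folds away once the stride is replaced by 1, so it is
// assigned zero cost below.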
7092 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7093 return 0; 7094 7095 // Detect reduction patterns 7096 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7097 return *RedCost; 7098 7099 // Certain instructions can be cheaper to vectorize if they have a constant 7100 // second vector operand. One example of this are shifts on x86. 7101 Value *Op2 = I->getOperand(1); 7102 TargetTransformInfo::OperandValueProperties Op2VP; 7103 TargetTransformInfo::OperandValueKind Op2VK = 7104 TTI.getOperandInfo(Op2, Op2VP); 7105 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7106 Op2VK = TargetTransformInfo::OK_UniformValue; 7107 7108 SmallVector<const Value *, 4> Operands(I->operand_values()); 7109 return TTI.getArithmeticInstrCost( 7110 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7111 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7112 } 7113 case Instruction::FNeg: { 7114 return TTI.getArithmeticInstrCost( 7115 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7116 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7117 TargetTransformInfo::OP_None, I->getOperand(0), I); 7118 } 7119 case Instruction::Select: { 7120 SelectInst *SI = cast<SelectInst>(I); 7121 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7122 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7123 7124 const Value *Op0, *Op1; 7125 using namespace llvm::PatternMatch; 7126 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7127 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7128 // select x, y, false --> x & y 7129 // select x, true, y --> x | y 7130 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7131 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7132 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7133 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7134 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7135 Op1->getType()->getScalarSizeInBits() == 1); 7136 7137 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7138 return TTI.getArithmeticInstrCost( 7139 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7140 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7141 } 7142 7143 Type *CondTy = SI->getCondition()->getType(); 7144 if (!ScalarCond) 7145 CondTy = VectorType::get(CondTy, VF); 7146 7147 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7148 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7149 Pred = Cmp->getPredicate(); 7150 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7151 CostKind, I); 7152 } 7153 case Instruction::ICmp: 7154 case Instruction::FCmp: { 7155 Type *ValTy = I->getOperand(0)->getType(); 7156 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7157 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7158 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7159 VectorTy = ToVectorTy(ValTy, VF); 7160 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7161 cast<CmpInst>(I)->getPredicate(), CostKind, 7162 I); 7163 } 7164 case Instruction::Store: 7165 case Instruction::Load: { 7166 ElementCount Width = VF; 7167 if (Width.isVector()) { 7168 InstWidening Decision = getWideningDecision(I, Width); 7169 assert(Decision != CM_Unknown && 7170 "CM decision should be taken at this point"); 7171 if (Decision == CM_Scalarize) 7172 Width = ElementCount::getFixed(1); 7173 } 7174 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7175 return getMemoryInstructionCost(I, VF); 7176 } 7177 case Instruction::BitCast: 7178 if (I->getType()->isPointerTy()) 7179 return 0; 7180 LLVM_FALLTHROUGH; 7181 case Instruction::ZExt: 7182 case Instruction::SExt: 7183 case Instruction::FPToUI: 7184 case Instruction::FPToSI: 7185 case Instruction::FPExt: 7186 case Instruction::PtrToInt: 7187 case Instruction::IntToPtr: 7188 case Instruction::SIToFP: 7189 case Instruction::UIToFP: 7190 case Instruction::Trunc: 7191 case Instruction::FPTrunc: { 7192 // Computes the CastContextHint from a Load/Store instruction. 7193 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7194 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7195 "Expected a load or a store!"); 7196 7197 if (VF.isScalar() || !TheLoop->contains(I)) 7198 return TTI::CastContextHint::Normal; 7199 7200 switch (getWideningDecision(I, VF)) { 7201 case LoopVectorizationCostModel::CM_GatherScatter: 7202 return TTI::CastContextHint::GatherScatter; 7203 case LoopVectorizationCostModel::CM_Interleave: 7204 return TTI::CastContextHint::Interleave; 7205 case LoopVectorizationCostModel::CM_Scalarize: 7206 case LoopVectorizationCostModel::CM_Widen: 7207 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7208 : TTI::CastContextHint::Normal; 7209 case LoopVectorizationCostModel::CM_Widen_Reverse: 7210 return TTI::CastContextHint::Reversed; 7211 case LoopVectorizationCostModel::CM_Unknown: 7212 llvm_unreachable("Instr did not go through cost modelling?"); 7213 } 7214 7215 llvm_unreachable("Unhandled case!"); 7216 }; 7217 7218 unsigned Opcode = I->getOpcode(); 7219 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7220 // For Trunc, the context is the only user, which must be a StoreInst. 7221 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7222 if (I->hasOneUse()) 7223 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7224 CCH = ComputeCCH(Store); 7225 } 7226 // For Z/Sext, the context is the operand, which must be a LoadInst. 
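// Illustrative example (assumed IR): for 'zext i8 %v to i32' where %v is a load
// that the cost model decided to widen consecutively, the context hint is Normal
// (or Masked if a mask is required), letting TTI recognize the pattern as an
// extending load; a gathered %v would instead yield GatherScatter.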
7227 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7228 Opcode == Instruction::FPExt) { 7229 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7230 CCH = ComputeCCH(Load); 7231 } 7232 7233 // We optimize the truncation of induction variables having constant 7234 // integer steps. The cost of these truncations is the same as the scalar 7235 // operation. 7236 if (isOptimizableIVTruncate(I, VF)) { 7237 auto *Trunc = cast<TruncInst>(I); 7238 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7239 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7240 } 7241 7242 // Detect reduction patterns 7243 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7244 return *RedCost; 7245 7246 Type *SrcScalarTy = I->getOperand(0)->getType(); 7247 Type *SrcVecTy = 7248 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7249 if (canTruncateToMinimalBitwidth(I, VF)) { 7250 // This cast is going to be shrunk. This may remove the cast or it might 7251 // turn it into slightly different cast. For example, if MinBW == 16, 7252 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7253 // 7254 // Calculate the modified src and dest types. 7255 Type *MinVecTy = VectorTy; 7256 if (Opcode == Instruction::Trunc) { 7257 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7258 VectorTy = 7259 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7260 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7261 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7262 VectorTy = 7263 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7264 } 7265 } 7266 7267 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7268 } 7269 case Instruction::Call: { 7270 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7271 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7272 return *RedCost; 7273 bool NeedToScalarize; 7274 CallInst *CI = cast<CallInst>(I); 7275 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7276 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7277 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7278 return std::min(CallCost, IntrinsicCost); 7279 } 7280 return CallCost; 7281 } 7282 case Instruction::ExtractValue: 7283 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7284 case Instruction::Alloca: 7285 // We cannot easily widen alloca to a scalable alloca, as 7286 // the result would need to be a vector of pointers. 7287 if (VF.isScalable()) 7288 return InstructionCost::getInvalid(); 7289 LLVM_FALLTHROUGH; 7290 default: 7291 // This opcode is unknown. Assume that it is the same as 'mul'. 7292 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7293 } // end of switch. 
7294 } 7295 7296 char LoopVectorize::ID = 0; 7297 7298 static const char lv_name[] = "Loop Vectorization"; 7299 7300 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7301 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7302 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7303 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7304 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7305 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7306 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7307 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7308 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7309 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7310 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7311 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7312 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7313 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7314 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7315 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7316 7317 namespace llvm { 7318 7319 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7320 7321 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7322 bool VectorizeOnlyWhenForced) { 7323 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7324 } 7325 7326 } // end namespace llvm 7327 7328 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7329 // Check if the pointer operand of a load or store instruction is 7330 // consecutive. 7331 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7332 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7333 return false; 7334 } 7335 7336 void LoopVectorizationCostModel::collectValuesToIgnore() { 7337 // Ignore ephemeral values. 7338 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7339 7340 // Find all stores to invariant variables. Since they are going to sink 7341 // outside the loop we do not need calculate cost for them. 7342 for (BasicBlock *BB : TheLoop->blocks()) 7343 for (Instruction &I : *BB) { 7344 StoreInst *SI; 7345 if ((SI = dyn_cast<StoreInst>(&I)) && 7346 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 7347 ValuesToIgnore.insert(&I); 7348 } 7349 7350 // Ignore type-promoting instructions we identified during reduction 7351 // detection. 7352 for (auto &Reduction : Legal->getReductionVars()) { 7353 const RecurrenceDescriptor &RedDes = Reduction.second; 7354 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7355 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7356 } 7357 // Ignore type-casting instructions we identified during induction 7358 // detection. 7359 for (auto &Induction : Legal->getInductionVars()) { 7360 const InductionDescriptor &IndDes = Induction.second; 7361 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7362 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7363 } 7364 } 7365 7366 void LoopVectorizationCostModel::collectInLoopReductions() { 7367 for (auto &Reduction : Legal->getReductionVars()) { 7368 PHINode *Phi = Reduction.first; 7369 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7370 7371 // We don't collect reductions that are type promoted (yet). 7372 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7373 continue; 7374 7375 // If the target would prefer this reduction to happen "in-loop", then we 7376 // want to record it as such. 
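// Illustrative sketch (assumed IR, not emitted by this function): an in-loop
// integer add reduction keeps a scalar accumulator and per vector iteration does
// something like
//   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %chunk)
//   %acc = add i32 %acc, %r
// instead of carrying a <4 x i32> accumulator that is reduced once after the loop.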
7377 unsigned Opcode = RdxDesc.getOpcode(); 7378 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7379 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7380 TargetTransformInfo::ReductionFlags())) 7381 continue; 7382 7383 // Check that we can correctly put the reductions into the loop, by 7384 // finding the chain of operations that leads from the phi to the loop 7385 // exit value. 7386 SmallVector<Instruction *, 4> ReductionOperations = 7387 RdxDesc.getReductionOpChain(Phi, TheLoop); 7388 bool InLoop = !ReductionOperations.empty(); 7389 if (InLoop) { 7390 InLoopReductionChains[Phi] = ReductionOperations; 7391 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7392 Instruction *LastChain = Phi; 7393 for (auto *I : ReductionOperations) { 7394 InLoopReductionImmediateChains[I] = LastChain; 7395 LastChain = I; 7396 } 7397 } 7398 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7399 << " reduction for phi: " << *Phi << "\n"); 7400 } 7401 } 7402 7403 // TODO: we could return a pair of values that specify the max VF and 7404 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7405 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7406 // doesn't have a cost model that can choose which plan to execute if 7407 // more than one is generated. 7408 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7409 LoopVectorizationCostModel &CM) { 7410 unsigned WidestType; 7411 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7412 return WidestVectorRegBits / WidestType; 7413 } 7414 7415 VectorizationFactor 7416 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7417 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7418 ElementCount VF = UserVF; 7419 // Outer loop handling: They may require CFG and instruction level 7420 // transformations before even evaluating whether vectorization is profitable. 7421 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7422 // the vectorization pipeline. 7423 if (!OrigLoop->isInnermost()) { 7424 // If the user doesn't provide a vectorization factor, determine a 7425 // reasonable one. 7426 if (UserVF.isZero()) { 7427 VF = ElementCount::getFixed(determineVPlanVF( 7428 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7429 .getFixedSize(), 7430 CM)); 7431 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7432 7433 // Make sure we have a VF > 1 for stress testing. 7434 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7435 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7436 << "overriding computed VF.\n"); 7437 VF = ElementCount::getFixed(4); 7438 } 7439 } 7440 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7441 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7442 "VF needs to be a power of two"); 7443 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7444 << "VF " << VF << " to build VPlans.\n"); 7445 buildVPlans(VF, VF); 7446 7447 // For VPlan build stress testing, we bail out after VPlan construction. 7448 if (VPlanBuildStressTest) 7449 return VectorizationFactor::Disabled(); 7450 7451 return {VF, 0 /*Cost*/}; 7452 } 7453 7454 LLVM_DEBUG( 7455 dbgs() << "LV: Not vectorizing. 
Inner loops aren't supported in the "
7456 "VPlan-native path.\n");
7457 return VectorizationFactor::Disabled();
7458 }
7459
7460 bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const {
7461 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
7462 return (NumRuntimePointerChecks >
7463 VectorizerParams::RuntimeMemoryCheckThreshold &&
7464 !Hints.allowReordering()) ||
7465 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
7466 }
7467
7468 Optional<VectorizationFactor>
7469 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7470 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7471 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7472 if (!MaxFactors) // Cases that should not be vectorized or interleaved.
7473 return None;
7474
7475 // Invalidate interleave groups if all blocks of the loop will be predicated.
7476 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7477 !useMaskedInterleavedAccesses(*TTI)) {
7478 LLVM_DEBUG(
7479 dbgs()
7480 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7481 "which requires masked-interleaved support.\n");
7482 if (CM.InterleaveInfo.invalidateGroups())
7483 // Invalidating interleave groups also requires invalidating all decisions
7484 // based on them, which includes widening decisions and uniform and scalar
7485 // values.
7486 CM.invalidateCostModelingDecisions();
7487 }
7488
7489 ElementCount MaxUserVF =
7490 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7491 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7492 if (!UserVF.isZero() && UserVFIsLegal) {
7493 assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7494 "VF needs to be a power of two");
7495 // Collect the instructions (and their associated costs) that will be more
7496 // profitable to scalarize.
7497 if (CM.selectUserVectorizationFactor(UserVF)) {
7498 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7499 CM.collectInLoopReductions();
7500 buildVPlansWithVPRecipes(UserVF, UserVF);
7501 LLVM_DEBUG(printPlans(dbgs()));
7502 return {{UserVF, 0}};
7503 } else
7504 reportVectorizationInfo("UserVF ignored because of invalid costs.",
7505 "InvalidCost", ORE, OrigLoop);
7506 }
7507
7508 // Populate the set of Vectorization Factor Candidates.
7509 ElementCountSet VFCandidates;
7510 for (auto VF = ElementCount::getFixed(1);
7511 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7512 VFCandidates.insert(VF);
7513 for (auto VF = ElementCount::getScalable(1);
7514 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7515 VFCandidates.insert(VF);
7516
7517 for (const auto &VF : VFCandidates) {
7518 // Collect Uniform and Scalar instructions after vectorization with VF.
7519 CM.collectUniformsAndScalars(VF);
7520
7521 // Collect the instructions (and their associated costs) that will be more
7522 // profitable to scalarize.
7523 if (VF.isVector())
7524 CM.collectInstsToScalarize(VF);
7525 }
7526
7527 CM.collectInLoopReductions();
7528 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7529 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7530
7531 LLVM_DEBUG(printPlans(dbgs()));
7532 if (!MaxFactors.hasVector())
7533 return VectorizationFactor::Disabled();
7534
7535 // Select the optimal vectorization factor.
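// Illustrative example (assumed maxima): with MaxFactors.FixedVF = 16 and
// MaxFactors.ScalableVF = vscale x 4, VFCandidates built above would be
// {1, 2, 4, 8, 16, vscale x 1, vscale x 2, vscale x 4}, and the call below
// picks the most profitable of these.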
7536 return CM.selectVectorizationFactor(VFCandidates); 7537 } 7538 7539 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7540 assert(count_if(VPlans, 7541 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7542 1 && 7543 "Best VF has not a single VPlan."); 7544 7545 for (const VPlanPtr &Plan : VPlans) { 7546 if (Plan->hasVF(VF)) 7547 return *Plan.get(); 7548 } 7549 llvm_unreachable("No plan found!"); 7550 } 7551 7552 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7553 SmallVector<Metadata *, 4> MDs; 7554 // Reserve first location for self reference to the LoopID metadata node. 7555 MDs.push_back(nullptr); 7556 bool IsUnrollMetadata = false; 7557 MDNode *LoopID = L->getLoopID(); 7558 if (LoopID) { 7559 // First find existing loop unrolling disable metadata. 7560 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7561 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7562 if (MD) { 7563 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7564 IsUnrollMetadata = 7565 S && S->getString().startswith("llvm.loop.unroll.disable"); 7566 } 7567 MDs.push_back(LoopID->getOperand(i)); 7568 } 7569 } 7570 7571 if (!IsUnrollMetadata) { 7572 // Add runtime unroll disable metadata. 7573 LLVMContext &Context = L->getHeader()->getContext(); 7574 SmallVector<Metadata *, 1> DisableOperands; 7575 DisableOperands.push_back( 7576 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7577 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7578 MDs.push_back(DisableNode); 7579 MDNode *NewLoopID = MDNode::get(Context, MDs); 7580 // Set operand 0 to refer to the loop id itself. 7581 NewLoopID->replaceOperandWith(0, NewLoopID); 7582 L->setLoopID(NewLoopID); 7583 } 7584 } 7585 7586 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7587 VPlan &BestVPlan, 7588 InnerLoopVectorizer &ILV, 7589 DominatorTree *DT) { 7590 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7591 << '\n'); 7592 7593 // Perform the actual loop transformation. 7594 7595 // 1. Set up the skeleton for vectorization, including vector pre-header and 7596 // middle block. The vector loop is created during VPlan execution. 7597 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7598 Value *CanonicalIVStartValue; 7599 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7600 ILV.createVectorizedLoopSkeleton(); 7601 ILV.collectPoisonGeneratingRecipes(State); 7602 7603 ILV.printDebugTracesAtStart(); 7604 7605 //===------------------------------------------------===// 7606 // 7607 // Notice: any optimization or new instruction that go 7608 // into the code below should also be implemented in 7609 // the cost-model. 7610 // 7611 //===------------------------------------------------===// 7612 7613 // 2. Copy and widen instructions from the old loop into the new loop. 7614 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), 7615 ILV.getOrCreateVectorTripCount(nullptr), 7616 CanonicalIVStartValue, State); 7617 BestVPlan.execute(&State); 7618 7619 // Keep all loop hints from the original loop on the vector loop (we'll 7620 // replace the vectorizer-specific hints below). 
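// Illustrative example (metadata names as documented for loop transformation
// hints): if the original loop ID carries "llvm.loop.vectorize.followup_vectorized",
// the node it points to becomes the new loop's ID below; otherwise the original
// hints are copied and setAlreadyVectorized() marks the loop with
// "llvm.loop.isvectorized" so it is not vectorized again.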
7621 MDNode *OrigLoopID = OrigLoop->getLoopID();
7622
7623 Optional<MDNode *> VectorizedLoopID =
7624 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7625 LLVMLoopVectorizeFollowupVectorized});
7626
7627 VPBasicBlock *HeaderVPBB =
7628 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7629 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7630 if (VectorizedLoopID.hasValue())
7631 L->setLoopID(VectorizedLoopID.getValue());
7632 else {
7633 // Keep all loop hints from the original loop on the vector loop (we'll
7634 // replace the vectorizer-specific hints below).
7635 if (MDNode *LID = OrigLoop->getLoopID())
7636 L->setLoopID(LID);
7637
7638 LoopVectorizeHints Hints(L, true, *ORE);
7639 Hints.setAlreadyVectorized();
7640 }
7641 // Disable runtime unrolling when vectorizing the epilogue loop.
7642 if (CanonicalIVStartValue)
7643 AddRuntimeUnrollDisableMetaData(L);
7644
7645 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7646 // predication, updating analyses.
7647 ILV.fixVectorizedLoop(State, BestVPlan);
7648
7649 ILV.printDebugTracesAtEnd();
7650 }
7651
7652 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7653 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7654 for (const auto &Plan : VPlans)
7655 if (PrintVPlansInDotFormat)
7656 Plan->printDOT(O);
7657 else
7658 Plan->print(O);
7659 }
7660 #endif
7661
7662 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7663 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7664
7665 // We create new control-flow for the vectorized loop, so the original exit
7666 // conditions will be dead after vectorization if they are only used by the
7667 // terminator.
7668 SmallVector<BasicBlock*> ExitingBlocks;
7669 OrigLoop->getExitingBlocks(ExitingBlocks);
7670 for (auto *BB : ExitingBlocks) {
7671 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7672 if (!Cmp || !Cmp->hasOneUse())
7673 continue;
7674
7675 // TODO: we should introduce a getUniqueExitingBlocks on Loop
7676 if (!DeadInstructions.insert(Cmp).second)
7677 continue;
7678
7679 // The operands of the icmp are often dead truncs, used by IndUpdate.
7680 // TODO: can recurse through operands in general
7681 for (Value *Op : Cmp->operands()) {
7682 if (isa<TruncInst>(Op) && Op->hasOneUse())
7683 DeadInstructions.insert(cast<Instruction>(Op));
7684 }
7685 }
7686
7687 // We create new "steps" for induction variable updates to which the original
7688 // induction variables map. An original update instruction will be dead if
7689 // all its users except the induction variable are dead.
7690 auto *Latch = OrigLoop->getLoopLatch();
7691 for (auto &Induction : Legal->getInductionVars()) {
7692 PHINode *Ind = Induction.first;
7693 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7694
7695 // If the tail is to be folded by masking, the primary induction variable,
7696 // if it exists, isn't dead: it will be used for masking. Don't kill it.
7697 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7698 continue; 7699 7700 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7701 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7702 })) 7703 DeadInstructions.insert(IndUpdate); 7704 } 7705 } 7706 7707 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7708 7709 //===--------------------------------------------------------------------===// 7710 // EpilogueVectorizerMainLoop 7711 //===--------------------------------------------------------------------===// 7712 7713 /// This function is partially responsible for generating the control flow 7714 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7715 std::pair<BasicBlock *, Value *> 7716 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7717 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7718 7719 // Workaround! Compute the trip count of the original loop and cache it 7720 // before we start modifying the CFG. This code has a systemic problem 7721 // wherein it tries to run analysis over partially constructed IR; this is 7722 // wrong, and not simply for SCEV. The trip count of the original loop 7723 // simply happens to be prone to hitting this in practice. In theory, we 7724 // can hit the same issue for any SCEV, or ValueTracking query done during 7725 // mutation. See PR49900. 7726 getOrCreateTripCount(OrigLoop->getLoopPreheader()); 7727 createVectorLoopSkeleton(""); 7728 7729 // Generate the code to check the minimum iteration count of the vector 7730 // epilogue (see below). 7731 EPI.EpilogueIterationCountCheck = 7732 emitIterationCountCheck(LoopScalarPreHeader, true); 7733 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7734 7735 // Generate the code to check any assumptions that we've made for SCEV 7736 // expressions. 7737 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7738 7739 // Generate the code that checks at runtime if arrays overlap. We put the 7740 // checks into a separate block to make the more common case of few elements 7741 // faster. 7742 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7743 7744 // Generate the iteration count check for the main loop, *after* the check 7745 // for the epilogue loop, so that the path-length is shorter for the case 7746 // that goes directly through the vector epilogue. The longer-path length for 7747 // the main loop is compensated for, by the gain from vectorizing the larger 7748 // trip count. Note: the branch will get updated later on when we vectorize 7749 // the epilogue. 7750 EPI.MainLoopIterationCountCheck = 7751 emitIterationCountCheck(LoopScalarPreHeader, false); 7752 7753 // Generate the induction variable. 7754 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7755 7756 // Skip induction resume value creation here because they will be created in 7757 // the second pass. If we created them here, they wouldn't be used anyway, 7758 // because the vplan in the second pass still contains the inductions from the 7759 // original loop. 
7760 7761 return {completeLoopSkeleton(OrigLoopID), nullptr}; 7762 } 7763 7764 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7765 LLVM_DEBUG({ 7766 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7767 << "Main Loop VF:" << EPI.MainLoopVF 7768 << ", Main Loop UF:" << EPI.MainLoopUF 7769 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7770 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7771 }); 7772 } 7773 7774 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7775 DEBUG_WITH_TYPE(VerboseDebug, { 7776 dbgs() << "intermediate fn:\n" 7777 << *OrigLoop->getHeader()->getParent() << "\n"; 7778 }); 7779 } 7780 7781 BasicBlock * 7782 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7783 bool ForEpilogue) { 7784 assert(Bypass && "Expected valid bypass basic block."); 7785 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7786 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7787 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 7788 // Reuse existing vector loop preheader for TC checks. 7789 // Note that new preheader block is generated for vector loop. 7790 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7791 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7792 7793 // Generate code to check if the loop's trip count is less than VF * UF of the 7794 // main vector loop. 7795 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 7796 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7797 7798 Value *CheckMinIters = Builder.CreateICmp( 7799 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7800 "min.iters.check"); 7801 7802 if (!ForEpilogue) 7803 TCCheckBlock->setName("vector.main.loop.iter.check"); 7804 7805 // Create new preheader for vector loop. 7806 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7807 DT, LI, nullptr, "vector.ph"); 7808 7809 if (ForEpilogue) { 7810 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7811 DT->getNode(Bypass)->getIDom()) && 7812 "TC check is expected to dominate Bypass"); 7813 7814 // Update dominator for Bypass & LoopExit. 7815 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7816 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7817 // For loops with multiple exits, there's no edge from the middle block 7818 // to exit blocks (as the epilogue must run) and thus no need to update 7819 // the immediate dominator of the exit blocks. 7820 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7821 7822 LoopBypassBlocks.push_back(TCCheckBlock); 7823 7824 // Save the trip count so we don't have to regenerate it in the 7825 // vec.epilog.iter.check. This is safe to do because the trip count 7826 // generated here dominates the vector epilog iter check. 7827 EPI.TripCount = Count; 7828 } 7829 7830 ReplaceInstWithInst( 7831 TCCheckBlock->getTerminator(), 7832 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7833 7834 return TCCheckBlock; 7835 } 7836 7837 //===--------------------------------------------------------------------===// 7838 // EpilogueVectorizerEpilogueLoop 7839 //===--------------------------------------------------------------------===// 7840 7841 /// This function is partially responsible for generating the control flow 7842 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
7843 std::pair<BasicBlock *, Value *> 7844 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7845 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7846 createVectorLoopSkeleton("vec.epilog."); 7847 7848 // Now, compare the remaining count and if there aren't enough iterations to 7849 // execute the vectorized epilogue skip to the scalar part. 7850 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7851 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7852 LoopVectorPreHeader = 7853 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7854 LI, nullptr, "vec.epilog.ph"); 7855 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, 7856 VecEpilogueIterationCountCheck); 7857 7858 // Adjust the control flow taking the state info from the main loop 7859 // vectorization into account. 7860 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7861 "expected this to be saved from the previous pass."); 7862 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7863 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7864 7865 DT->changeImmediateDominator(LoopVectorPreHeader, 7866 EPI.MainLoopIterationCountCheck); 7867 7868 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7869 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7870 7871 if (EPI.SCEVSafetyCheck) 7872 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7873 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7874 if (EPI.MemSafetyCheck) 7875 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7876 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7877 7878 DT->changeImmediateDominator( 7879 VecEpilogueIterationCountCheck, 7880 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7881 7882 DT->changeImmediateDominator(LoopScalarPreHeader, 7883 EPI.EpilogueIterationCountCheck); 7884 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7885 // If there is an epilogue which must run, there's no edge from the 7886 // middle block to exit blocks and thus no need to update the immediate 7887 // dominator of the exit blocks. 7888 DT->changeImmediateDominator(LoopExitBlock, 7889 EPI.EpilogueIterationCountCheck); 7890 7891 // Keep track of bypass blocks, as they feed start values to the induction 7892 // phis in the scalar loop preheader. 7893 if (EPI.SCEVSafetyCheck) 7894 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7895 if (EPI.MemSafetyCheck) 7896 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7897 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7898 7899 // The vec.epilog.iter.check block may contain Phi nodes from reductions which 7900 // merge control-flow from the latch block and the middle block. Update the 7901 // incoming values here and move the Phi into the preheader. 
7902 SmallVector<PHINode *, 4> PhisInBlock;
7903 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7904 PhisInBlock.push_back(&Phi);
7905
7906 for (PHINode *Phi : PhisInBlock) {
7907 Phi->replaceIncomingBlockWith(
7908 VecEpilogueIterationCountCheck->getSinglePredecessor(),
7909 VecEpilogueIterationCountCheck);
7910 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7911 if (EPI.SCEVSafetyCheck)
7912 Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7913 if (EPI.MemSafetyCheck)
7914 Phi->removeIncomingValue(EPI.MemSafetyCheck);
7915 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7916 }
7917
7918 // Generate a resume induction for the vector epilogue and put it in the
7919 // vector epilogue preheader.
7920 Type *IdxTy = Legal->getWidestInductionType();
7921 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7922 LoopVectorPreHeader->getFirstNonPHI());
7923 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7924 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7925 EPI.MainLoopIterationCountCheck);
7926
7927 // Generate induction resume values. These variables save the new starting
7928 // indexes for the scalar loop. They are used to test if there are any tail
7929 // iterations left once the vector loop has completed.
7930 // Note that when the vectorized epilogue is skipped due to the iteration
7931 // count check, the resume value for the induction variable comes from
7932 // the trip count of the main vector loop, hence passing the AdditionalBypass
7933 // argument.
7934 createInductionResumeValues({VecEpilogueIterationCountCheck,
7935 EPI.VectorTripCount} /* AdditionalBypass */);
7936
7937 return {completeLoopSkeleton(OrigLoopID), EPResumeVal};
7938 }
7939
7940 BasicBlock *
7941 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7942 BasicBlock *Bypass, BasicBlock *Insert) {
7943
7944 assert(EPI.TripCount &&
7945 "Expected trip count to have been saved in the first pass.");
7946 assert(
7947 (!isa<Instruction>(EPI.TripCount) ||
7948 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7949 "saved trip count does not dominate insertion point.");
7950 Value *TC = EPI.TripCount;
7951 IRBuilder<> Builder(Insert->getTerminator());
7952 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7953
7954 // Generate code to check if the loop's trip count is less than VF * UF of the
7955 // vector epilogue loop.
7956 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
7957 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7958 7959 Value *CheckMinIters = 7960 Builder.CreateICmp(P, Count, 7961 createStepForVF(Builder, Count->getType(), 7962 EPI.EpilogueVF, EPI.EpilogueUF), 7963 "min.epilog.iters.check"); 7964 7965 ReplaceInstWithInst( 7966 Insert->getTerminator(), 7967 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7968 7969 LoopBypassBlocks.push_back(Insert); 7970 return Insert; 7971 } 7972 7973 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7974 LLVM_DEBUG({ 7975 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 7976 << "Epilogue Loop VF:" << EPI.EpilogueVF 7977 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7978 }); 7979 } 7980 7981 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 7982 DEBUG_WITH_TYPE(VerboseDebug, { 7983 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 7984 }); 7985 } 7986 7987 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7988 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7989 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 7990 bool PredicateAtRangeStart = Predicate(Range.Start); 7991 7992 for (ElementCount TmpVF = Range.Start * 2; 7993 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 7994 if (Predicate(TmpVF) != PredicateAtRangeStart) { 7995 Range.End = TmpVF; 7996 break; 7997 } 7998 7999 return PredicateAtRangeStart; 8000 } 8001 8002 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8003 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8004 /// of VF's starting at a given VF and extending it as much as possible. Each 8005 /// vectorization decision can potentially shorten this sub-range during 8006 /// buildVPlan(). 8007 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8008 ElementCount MaxVF) { 8009 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8010 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8011 VFRange SubRange = {VF, MaxVFPlusOne}; 8012 VPlans.push_back(buildVPlan(SubRange)); 8013 VF = SubRange.End; 8014 } 8015 } 8016 8017 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8018 VPlanPtr &Plan) { 8019 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8020 8021 // Look for cached value. 8022 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8023 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8024 if (ECEntryIt != EdgeMaskCache.end()) 8025 return ECEntryIt->second; 8026 8027 VPValue *SrcMask = createBlockInMask(Src, Plan); 8028 8029 // The terminator has to be a branch inst! 8030 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8031 assert(BI && "Unexpected terminator found"); 8032 8033 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8034 return EdgeMaskCache[Edge] = SrcMask; 8035 8036 // If source is an exiting block, we know the exit edge is dynamically dead 8037 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8038 // adding uses of an otherwise potentially dead instruction. 
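// (Illustrative: for a loop whose only exiting block is the latch, the exit
// edge is never taken while the vector loop is executing, so the mask of that
// edge simply reuses the source block's mask below instead of AND'ing in the
// branch condition.)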
8039 if (OrigLoop->isLoopExiting(Src)) 8040 return EdgeMaskCache[Edge] = SrcMask; 8041 8042 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8043 assert(EdgeMask && "No Edge Mask found for condition"); 8044 8045 if (BI->getSuccessor(0) != Dst) 8046 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8047 8048 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8049 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8050 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8051 // The select version does not introduce new UB if SrcMask is false and 8052 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8053 VPValue *False = Plan->getOrAddVPValue( 8054 ConstantInt::getFalse(BI->getCondition()->getType())); 8055 EdgeMask = 8056 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8057 } 8058 8059 return EdgeMaskCache[Edge] = EdgeMask; 8060 } 8061 8062 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8063 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8064 8065 // Look for cached value. 8066 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8067 if (BCEntryIt != BlockMaskCache.end()) 8068 return BCEntryIt->second; 8069 8070 // All-one mask is modelled as no-mask following the convention for masked 8071 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8072 VPValue *BlockMask = nullptr; 8073 8074 if (OrigLoop->getHeader() == BB) { 8075 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8076 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8077 8078 // Introduce the early-exit compare IV <= BTC to form header block mask. 8079 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8080 // constructing the desired canonical IV in the header block as its first 8081 // non-phi instructions. 8082 assert(CM.foldTailByMasking() && "must fold the tail"); 8083 VPBasicBlock *HeaderVPBB = 8084 Plan->getVectorLoopRegion()->getEntryBasicBlock(); 8085 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8086 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); 8087 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); 8088 8089 VPBuilder::InsertPointGuard Guard(Builder); 8090 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8091 if (CM.TTI.emitGetActiveLaneMask()) { 8092 VPValue *TC = Plan->getOrCreateTripCount(); 8093 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); 8094 } else { 8095 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8096 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8097 } 8098 return BlockMaskCache[BB] = BlockMask; 8099 } 8100 8101 // This is the block mask. We OR all incoming edges. 8102 for (auto *Predecessor : predecessors(BB)) { 8103 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8104 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8105 return BlockMaskCache[BB] = EdgeMask; 8106 8107 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8108 BlockMask = EdgeMask;
8109 continue;
8110 }
8111
8112 BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8113 }
8114
8115 return BlockMaskCache[BB] = BlockMask;
8116 }
8117
8118 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8119 ArrayRef<VPValue *> Operands,
8120 VFRange &Range,
8121 VPlanPtr &Plan) {
8122 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8123 "Must be called with either a load or store");
8124
8125 auto willWiden = [&](ElementCount VF) -> bool {
8126 LoopVectorizationCostModel::InstWidening Decision =
8127 CM.getWideningDecision(I, VF);
8128 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8129 "CM decision should be taken at this point.");
8130 if (Decision == LoopVectorizationCostModel::CM_Interleave)
8131 return true;
8132 if (CM.isScalarAfterVectorization(I, VF) ||
8133 CM.isProfitableToScalarize(I, VF))
8134 return false;
8135 return Decision != LoopVectorizationCostModel::CM_Scalarize;
8136 };
8137
8138 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8139 return nullptr;
8140
8141 VPValue *Mask = nullptr;
8142 if (Legal->isMaskRequired(I))
8143 Mask = createBlockInMask(I->getParent(), Plan);
8144
8145 // Determine if the pointer operand of the access is either consecutive or
8146 // reverse consecutive.
8147 LoopVectorizationCostModel::InstWidening Decision =
8148 CM.getWideningDecision(I, Range.Start);
8149 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8150 bool Consecutive =
8151 Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8152
8153 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8154 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8155 Consecutive, Reverse);
8156
8157 StoreInst *Store = cast<StoreInst>(I);
8158 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8159 Mask, Consecutive, Reverse);
8160 }
8161
8162 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8163 /// insert a recipe to expand the step for the induction recipe.
8164 static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
8165 PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start,
8166 const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM,
8167 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) {
8168 // Returns true if an instruction \p I should be scalarized instead of
8169 // vectorized for the chosen vectorization factor.
8170 auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { 8171 return CM.isScalarAfterVectorization(I, VF) || 8172 CM.isProfitableToScalarize(I, VF); 8173 }; 8174 8175 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( 8176 [&](ElementCount VF) { 8177 return ShouldScalarizeInstruction(PhiOrTrunc, VF); 8178 }, 8179 Range); 8180 assert(IndDesc.getStartValue() == 8181 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8182 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && 8183 "step must be loop invariant"); 8184 8185 VPValue *Step = 8186 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); 8187 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8188 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI, 8189 !NeedsScalarIVOnly); 8190 } 8191 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8192 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, 8193 !NeedsScalarIVOnly); 8194 } 8195 8196 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( 8197 PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) { 8198 8199 // Check if this is an integer or fp induction. If so, build the recipe that 8200 // produces its scalar and vector values. 8201 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8202 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan, 8203 *PSE.getSE(), *OrigLoop, Range); 8204 8205 // Check if this is pointer induction. If so, build the recipe for it. 8206 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) 8207 return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II, 8208 *PSE.getSE()); 8209 return nullptr; 8210 } 8211 8212 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8213 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) { 8214 // Optimize the special case where the source is a constant integer 8215 // induction variable. Notice that we can only optimize the 'trunc' case 8216 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8217 // (c) other casts depend on pointer size. 8218 8219 // Determine whether \p K is a truncation based on an induction variable that 8220 // can be optimized. 8221 auto isOptimizableIVTruncate = 8222 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8223 return [=](ElementCount VF) -> bool { 8224 return CM.isOptimizableIVTruncate(K, VF); 8225 }; 8226 }; 8227 8228 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8229 isOptimizableIVTruncate(I), Range)) { 8230 8231 auto *Phi = cast<PHINode>(I->getOperand(0)); 8232 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8233 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8234 return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan, 8235 *PSE.getSE(), *OrigLoop, Range); 8236 } 8237 return nullptr; 8238 } 8239 8240 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8241 ArrayRef<VPValue *> Operands, 8242 VPlanPtr &Plan) { 8243 // If all incoming values are equal, the incoming VPValue can be used directly 8244 // instead of creating a new VPBlendRecipe. 8245 VPValue *FirstIncoming = Operands[0]; 8246 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8247 return FirstIncoming == Inc; 8248 })) { 8249 return Operands[0]; 8250 } 8251 8252 unsigned NumIncoming = Phi->getNumIncomingValues(); 8253 // For in-loop reductions, we do not need to create an additional select. 
8254 VPValue *InLoopVal = nullptr; 8255 for (unsigned In = 0; In < NumIncoming; In++) { 8256 PHINode *PhiOp = 8257 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue()); 8258 if (PhiOp && CM.isInLoopReduction(PhiOp)) { 8259 assert(!InLoopVal && "Found more than one in-loop reduction!"); 8260 InLoopVal = Operands[In]; 8261 } 8262 } 8263 8264 assert((!InLoopVal || NumIncoming == 2) && 8265 "Found an in-loop reduction for PHI with unexpected number of " 8266 "incoming values"); 8267 if (InLoopVal) 8268 return Operands[Operands[0] == InLoopVal ? 1 : 0]; 8269 8270 // We know that all PHIs in non-header blocks are converted into selects, so 8271 // we don't have to worry about the insertion order and we can just use the 8272 // builder. At this point we generate the predication tree. There may be 8273 // duplications since this is a simple recursive scan, but future 8274 // optimizations will clean it up. 8275 SmallVector<VPValue *, 2> OperandsWithMask; 8276 8277 for (unsigned In = 0; In < NumIncoming; In++) { 8278 VPValue *EdgeMask = 8279 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8280 assert((EdgeMask || NumIncoming == 1) && 8281 "Multiple predecessors with one having a full mask"); 8282 OperandsWithMask.push_back(Operands[In]); 8283 if (EdgeMask) 8284 OperandsWithMask.push_back(EdgeMask); 8285 } 8286 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8287 } 8288 8289 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8290 ArrayRef<VPValue *> Operands, 8291 VFRange &Range) const { 8292 8293 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8294 [this, CI](ElementCount VF) { 8295 return CM.isScalarWithPredication(CI, VF); 8296 }, 8297 Range); 8298 8299 if (IsPredicated) 8300 return nullptr; 8301 8302 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8303 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8304 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8305 ID == Intrinsic::pseudoprobe || 8306 ID == Intrinsic::experimental_noalias_scope_decl)) 8307 return nullptr; 8308 8309 auto willWiden = [&](ElementCount VF) -> bool { 8310 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8311 // The following case may be scalarized depending on the VF. 8312 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8313 // version of the instruction. 8314 // Is it beneficial to perform intrinsic call compared to lib call? 8315 bool NeedToScalarize = false; 8316 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8317 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; 8318 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8319 return UseVectorIntrinsic || !NeedToScalarize; 8320 }; 8321 8322 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8323 return nullptr; 8324 8325 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8326 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8327 } 8328 8329 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8330 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8331 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8332 // Instruction should be widened, unless it is scalar after vectorization, 8333 // scalarization is profitable or it is predicated. 
8334 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8335 return CM.isScalarAfterVectorization(I, VF) || 8336 CM.isProfitableToScalarize(I, VF) || 8337 CM.isScalarWithPredication(I, VF); 8338 }; 8339 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8340 Range); 8341 } 8342 8343 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8344 ArrayRef<VPValue *> Operands) const { 8345 auto IsVectorizableOpcode = [](unsigned Opcode) { 8346 switch (Opcode) { 8347 case Instruction::Add: 8348 case Instruction::And: 8349 case Instruction::AShr: 8350 case Instruction::BitCast: 8351 case Instruction::FAdd: 8352 case Instruction::FCmp: 8353 case Instruction::FDiv: 8354 case Instruction::FMul: 8355 case Instruction::FNeg: 8356 case Instruction::FPExt: 8357 case Instruction::FPToSI: 8358 case Instruction::FPToUI: 8359 case Instruction::FPTrunc: 8360 case Instruction::FRem: 8361 case Instruction::FSub: 8362 case Instruction::ICmp: 8363 case Instruction::IntToPtr: 8364 case Instruction::LShr: 8365 case Instruction::Mul: 8366 case Instruction::Or: 8367 case Instruction::PtrToInt: 8368 case Instruction::SDiv: 8369 case Instruction::Select: 8370 case Instruction::SExt: 8371 case Instruction::Shl: 8372 case Instruction::SIToFP: 8373 case Instruction::SRem: 8374 case Instruction::Sub: 8375 case Instruction::Trunc: 8376 case Instruction::UDiv: 8377 case Instruction::UIToFP: 8378 case Instruction::URem: 8379 case Instruction::Xor: 8380 case Instruction::ZExt: 8381 case Instruction::Freeze: 8382 return true; 8383 } 8384 return false; 8385 }; 8386 8387 if (!IsVectorizableOpcode(I->getOpcode())) 8388 return nullptr; 8389 8390 // Success: widen this instruction. 8391 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8392 } 8393 8394 void VPRecipeBuilder::fixHeaderPhis() { 8395 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8396 for (VPHeaderPHIRecipe *R : PhisToFix) { 8397 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8398 VPRecipeBase *IncR = 8399 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8400 R->addOperand(IncR->getVPSingleValue()); 8401 } 8402 } 8403 8404 VPBasicBlock *VPRecipeBuilder::handleReplication( 8405 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8406 VPlanPtr &Plan) { 8407 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8408 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8409 Range); 8410 8411 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8412 [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, 8413 Range); 8414 8415 // Even if the instruction is not marked as uniform, there are certain 8416 // intrinsic calls that can be effectively treated as such, so we check for 8417 // them here. Conservatively, we only do this for scalable vectors, since 8418 // for fixed-width VFs we can always fall back on full scalarization. 8419 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8420 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8421 case Intrinsic::assume: 8422 case Intrinsic::lifetime_start: 8423 case Intrinsic::lifetime_end: 8424 // For scalable vectors if one of the operands is variant then we still 8425 // want to mark as uniform, which will generate one instruction for just 8426 // the first lane of the vector. We can't scalarize the call in the same 8427 // way as for fixed-width vectors because we don't know how many lanes 8428 // there are. 
      //
      // The reasons for doing it this way for scalable vectors are:
      //    1. For the assume intrinsic generating the instruction for the first
      //       lane is still better than not generating any at all. For
      //       example, the input may be a splat across all lanes.
      //    2. For the lifetime start/end intrinsics the pointer operand only
      //       does anything useful when the input comes from a stack object,
      //       which suggests it should always be uniform. For non-stack objects
      //       the effect is to poison the object, which still allows us to
      //       remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }

  auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                       IsUniform, IsPredicated);
  setRecipe(I, Recipe);
  Plan->addVPValue(I, Recipe);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (VPValue *Op : Recipe->operands()) {
    auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
    if (!PredR)
      continue;
    auto *RepR =
        cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
    assert(RepR->isPredicated() &&
           "expected Replicate recipe to be predicated");
    RepR->setAlsoPack(false);
  }

  // Finalize the recipe for Instr, first if it is not predicated.
  if (!IsPredicated) {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
    VPBB->appendRecipe(Recipe);
    return VPBB;
  }
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");

  VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
  assert(SingleSucc && "VPBB must have a single successor when handling "
                       "predicated replication.");
  VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
  // Record predicated instructions for above packing optimizations.
  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
  VPBlockUtils::insertBlockAfter(Region, VPBB);
  auto *RegSucc = new VPBasicBlock();
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
  VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
  return RegSucc;
}

VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects.

  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  auto *PHIRecipe = Instr->getType()->isVoidTy()
                        ?
nullptr 8502 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8503 if (PHIRecipe) { 8504 Plan->removeVPValueFor(Instr); 8505 Plan->addVPValue(Instr, PHIRecipe); 8506 } 8507 auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8508 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8509 VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true); 8510 8511 // Note: first set Entry as region entry and then connect successors starting 8512 // from it in order, to propagate the "parent" of each VPBasicBlock. 8513 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry); 8514 VPBlockUtils::connectBlocks(Pred, Exiting); 8515 8516 return Region; 8517 } 8518 8519 VPRecipeOrVPValueTy 8520 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8521 ArrayRef<VPValue *> Operands, 8522 VFRange &Range, VPlanPtr &Plan) { 8523 // First, check for specific widening recipes that deal with inductions, Phi 8524 // nodes, calls and memory operations. 8525 VPRecipeBase *Recipe; 8526 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8527 if (Phi->getParent() != OrigLoop->getHeader()) 8528 return tryToBlend(Phi, Operands, Plan); 8529 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) 8530 return toVPRecipeResult(Recipe); 8531 8532 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8533 assert((Legal->isReductionVariable(Phi) || 8534 Legal->isFirstOrderRecurrence(Phi)) && 8535 "can only widen reductions and first-order recurrences here"); 8536 VPValue *StartV = Operands[0]; 8537 if (Legal->isReductionVariable(Phi)) { 8538 const RecurrenceDescriptor &RdxDesc = 8539 Legal->getReductionVars().find(Phi)->second; 8540 assert(RdxDesc.getRecurrenceStartValue() == 8541 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8542 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8543 CM.isInLoopReduction(Phi), 8544 CM.useOrderedReductions(RdxDesc)); 8545 } else { 8546 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8547 } 8548 8549 // Record the incoming value from the backedge, so we can add the incoming 8550 // value from the backedge after all recipes have been created. 8551 recordRecipeOf(cast<Instruction>( 8552 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8553 PhisToFix.push_back(PhiRecipe); 8554 return toVPRecipeResult(PhiRecipe); 8555 } 8556 8557 if (isa<TruncInst>(Instr) && 8558 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8559 Range, *Plan))) 8560 return toVPRecipeResult(Recipe); 8561 8562 // All widen recipes below deal only with VF > 1. 
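  // Note: getDecisionAndClampRange evaluates the predicate for Range.Start and
  // clamps Range.End so that the same decision holds for every VF left in the
  // range; a recipe built below is therefore valid for the entire clamped
  // range.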
  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          [&](ElementCount VF) { return VF.isScalar(); }, Range))
    return nullptr;

  if (auto *CI = dyn_cast<CallInst>(Instr))
    return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return toVPRecipeResult(new VPWidenGEPRecipe(
        GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return toVPRecipeResult(new VPWidenSelectRecipe(
        *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
  }

  return toVPRecipeResult(tryToWiden(Instr, Operands));
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  // Cannot sink instructions after dead instructions (there won't be any
  // recipes for them). Instead, find the first non-dead previous instruction.
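  // For example, if a recorded sink target is an induction update that is now
  // in DeadInstructions, walk backwards from it to the closest preceding live
  // instruction and use that as the new sink target.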
8617 for (auto &P : Legal->getSinkAfter()) { 8618 Instruction *SinkTarget = P.second; 8619 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 8620 (void)FirstInst; 8621 while (DeadInstructions.contains(SinkTarget)) { 8622 assert( 8623 SinkTarget != FirstInst && 8624 "Must find a live instruction (at least the one feeding the " 8625 "first-order recurrence PHI) before reaching beginning of the block"); 8626 SinkTarget = SinkTarget->getPrevNode(); 8627 assert(SinkTarget != P.first && 8628 "sink source equals target, no sinking required"); 8629 } 8630 P.second = SinkTarget; 8631 } 8632 8633 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8634 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8635 VFRange SubRange = {VF, MaxVFPlusOne}; 8636 VPlans.push_back( 8637 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8638 VF = SubRange.End; 8639 } 8640 } 8641 8642 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a 8643 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a 8644 // BranchOnCount VPInstruction to the latch. 8645 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, 8646 bool HasNUW) { 8647 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8648 auto *StartV = Plan.getOrAddVPValue(StartIdx); 8649 8650 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8651 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8652 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8653 Header->insert(CanonicalIVPHI, Header->begin()); 8654 8655 auto *CanonicalIVIncrement = 8656 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW 8657 : VPInstruction::CanonicalIVIncrement, 8658 {CanonicalIVPHI}, DL); 8659 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8660 8661 VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); 8662 EB->appendRecipe(CanonicalIVIncrement); 8663 8664 auto *BranchOnCount = 8665 new VPInstruction(VPInstruction::BranchOnCount, 8666 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8667 EB->appendRecipe(BranchOnCount); 8668 } 8669 8670 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the 8671 // original exit block. 8672 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, 8673 VPBasicBlock *MiddleVPBB, Loop *OrigLoop, 8674 VPlan &Plan) { 8675 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); 8676 BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); 8677 // Only handle single-exit loops with unique exit blocks for now. 8678 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) 8679 return; 8680 8681 // Introduce VPUsers modeling the exit values. 8682 for (PHINode &ExitPhi : ExitBB->phis()) { 8683 Value *IncomingValue = 8684 ExitPhi.getIncomingValueForBlock(ExitingBB); 8685 VPValue *V = Plan.getOrAddVPValue(IncomingValue, true); 8686 Plan.addLiveOut(&ExitPhi, V); 8687 } 8688 } 8689 8690 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8691 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8692 const MapVector<Instruction *, Instruction *> &SinkAfter) { 8693 8694 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8695 8696 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8697 8698 // --------------------------------------------------------------------------- 8699 // Pre-construction: record ingredients whose recipes we'll need to further 8700 // process after constructing the initial VPlan. 
8701 // --------------------------------------------------------------------------- 8702 8703 // Mark instructions we'll need to sink later and their targets as 8704 // ingredients whose recipe we'll need to record. 8705 for (auto &Entry : SinkAfter) { 8706 RecipeBuilder.recordRecipeOf(Entry.first); 8707 RecipeBuilder.recordRecipeOf(Entry.second); 8708 } 8709 for (auto &Reduction : CM.getInLoopReductionChains()) { 8710 PHINode *Phi = Reduction.first; 8711 RecurKind Kind = 8712 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); 8713 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8714 8715 RecipeBuilder.recordRecipeOf(Phi); 8716 for (auto &R : ReductionOperations) { 8717 RecipeBuilder.recordRecipeOf(R); 8718 // For min/max reductions, where we have a pair of icmp/select, we also 8719 // need to record the ICmp recipe, so it can be removed later. 8720 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 8721 "Only min/max recurrences allowed for inloop reductions"); 8722 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 8723 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8724 } 8725 } 8726 8727 // For each interleave group which is relevant for this (possibly trimmed) 8728 // Range, add it to the set of groups to be later applied to the VPlan and add 8729 // placeholders for its members' Recipes which we'll be replacing with a 8730 // single VPInterleaveRecipe. 8731 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8732 auto applyIG = [IG, this](ElementCount VF) -> bool { 8733 return (VF.isVector() && // Query is illegal for VF == 1 8734 CM.getWideningDecision(IG->getInsertPos(), VF) == 8735 LoopVectorizationCostModel::CM_Interleave); 8736 }; 8737 if (!getDecisionAndClampRange(applyIG, Range)) 8738 continue; 8739 InterleaveGroups.insert(IG); 8740 for (unsigned i = 0; i < IG->getFactor(); i++) 8741 if (Instruction *Member = IG->getMember(i)) 8742 RecipeBuilder.recordRecipeOf(Member); 8743 }; 8744 8745 // --------------------------------------------------------------------------- 8746 // Build initial VPlan: Scan the body of the loop in a topological order to 8747 // visit each basic block after having visited its predecessor basic blocks. 8748 // --------------------------------------------------------------------------- 8749 8750 // Create initial VPlan skeleton, starting with a block for the pre-header, 8751 // followed by a region for the vector loop, followed by the middle block. The 8752 // skeleton vector loop region contains a header and latch block. 8753 VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); 8754 auto Plan = std::make_unique<VPlan>(Preheader); 8755 8756 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); 8757 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 8758 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 8759 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); 8760 VPBlockUtils::insertBlockAfter(TopRegion, Preheader); 8761 VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); 8762 VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); 8763 8764 Instruction *DLInst = 8765 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 8766 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), 8767 DLInst ? 
DLInst->getDebugLoc() : DebugLoc(), 8768 !CM.foldTailByMasking()); 8769 8770 // Scan the body of the loop in a topological order to visit each basic block 8771 // after having visited its predecessor basic blocks. 8772 LoopBlocksDFS DFS(OrigLoop); 8773 DFS.perform(LI); 8774 8775 VPBasicBlock *VPBB = HeaderVPBB; 8776 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 8777 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8778 // Relevant instructions from basic block BB will be grouped into VPRecipe 8779 // ingredients and fill a new VPBasicBlock. 8780 unsigned VPBBsForBB = 0; 8781 if (VPBB != HeaderVPBB) 8782 VPBB->setName(BB->getName()); 8783 Builder.setInsertPoint(VPBB); 8784 8785 // Introduce each ingredient into VPlan. 8786 // TODO: Model and preserve debug intrinsics in VPlan. 8787 for (Instruction &I : BB->instructionsWithoutDebug()) { 8788 Instruction *Instr = &I; 8789 8790 // First filter out irrelevant instructions, to ensure no recipes are 8791 // built for them. 8792 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 8793 continue; 8794 8795 SmallVector<VPValue *, 4> Operands; 8796 auto *Phi = dyn_cast<PHINode>(Instr); 8797 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 8798 Operands.push_back(Plan->getOrAddVPValue( 8799 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 8800 } else { 8801 auto OpRange = Plan->mapToVPValues(Instr->operands()); 8802 Operands = {OpRange.begin(), OpRange.end()}; 8803 } 8804 8805 // Invariant stores inside loop will be deleted and a single store 8806 // with the final reduction value will be added to the exit block 8807 StoreInst *SI; 8808 if ((SI = dyn_cast<StoreInst>(&I)) && 8809 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 8810 continue; 8811 8812 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 8813 Instr, Operands, Range, Plan)) { 8814 // If Instr can be simplified to an existing VPValue, use it. 8815 if (RecipeOrValue.is<VPValue *>()) { 8816 auto *VPV = RecipeOrValue.get<VPValue *>(); 8817 Plan->addVPValue(Instr, VPV); 8818 // If the re-used value is a recipe, register the recipe for the 8819 // instruction, in case the recipe for Instr needs to be recorded. 8820 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 8821 RecipeBuilder.setRecipe(Instr, R); 8822 continue; 8823 } 8824 // Otherwise, add the new recipe. 8825 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 8826 for (auto *Def : Recipe->definedValues()) { 8827 auto *UV = Def->getUnderlyingValue(); 8828 Plan->addVPValue(UV, Def); 8829 } 8830 8831 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 8832 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 8833 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 8834 // of the header block. That can happen for truncates of induction 8835 // variables. Those recipes are moved to the phi section of the header 8836 // block after applying SinkAfter, which relies on the original 8837 // position of the trunc. 8838 assert(isa<TruncInst>(Instr)); 8839 InductionsToMove.push_back( 8840 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 8841 } 8842 RecipeBuilder.setRecipe(Instr, Recipe); 8843 VPBB->appendRecipe(Recipe); 8844 continue; 8845 } 8846 8847 // Otherwise, if all widening options failed, Instruction is to be 8848 // replicated. This may create a successor for VPBB. 
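      // handleReplication may wrap the replicated recipe in a predicated
      // replicate region; the block it returns is then a new successor of VPBB
      // and becomes the insertion point for the remaining ingredients.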
8849 VPBasicBlock *NextVPBB = 8850 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 8851 if (NextVPBB != VPBB) { 8852 VPBB = NextVPBB; 8853 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 8854 : ""); 8855 } 8856 } 8857 8858 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 8859 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 8860 } 8861 8862 HeaderVPBB->setName("vector.body"); 8863 8864 // Fold the last, empty block into its predecessor. 8865 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 8866 assert(VPBB && "expected to fold last (empty) block"); 8867 // After here, VPBB should not be used. 8868 VPBB = nullptr; 8869 8870 addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); 8871 8872 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 8873 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 8874 "entry block must be set to a VPRegionBlock having a non-empty entry " 8875 "VPBasicBlock"); 8876 RecipeBuilder.fixHeaderPhis(); 8877 8878 // --------------------------------------------------------------------------- 8879 // Transform initial VPlan: Apply previously taken decisions, in order, to 8880 // bring the VPlan to its final state. 8881 // --------------------------------------------------------------------------- 8882 8883 // Apply Sink-After legal constraints. 8884 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 8885 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 8886 if (Region && Region->isReplicator()) { 8887 assert(Region->getNumSuccessors() == 1 && 8888 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 8889 assert(R->getParent()->size() == 1 && 8890 "A recipe in an original replicator region must be the only " 8891 "recipe in its block"); 8892 return Region; 8893 } 8894 return nullptr; 8895 }; 8896 for (auto &Entry : SinkAfter) { 8897 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 8898 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 8899 8900 auto *TargetRegion = GetReplicateRegion(Target); 8901 auto *SinkRegion = GetReplicateRegion(Sink); 8902 if (!SinkRegion) { 8903 // If the sink source is not a replicate region, sink the recipe directly. 8904 if (TargetRegion) { 8905 // The target is in a replication region, make sure to move Sink to 8906 // the block after it, not into the replication region itself. 8907 VPBasicBlock *NextBlock = 8908 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 8909 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8910 } else 8911 Sink->moveAfter(Target); 8912 continue; 8913 } 8914 8915 // The sink source is in a replicate region. Unhook the region from the CFG. 8916 auto *SinkPred = SinkRegion->getSinglePredecessor(); 8917 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 8918 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 8919 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 8920 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 8921 8922 if (TargetRegion) { 8923 // The target recipe is also in a replicate region, move the sink region 8924 // after the target region. 
8925 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 8926 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 8927 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 8928 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 8929 } else { 8930 // The sink source is in a replicate region, we need to move the whole 8931 // replicate region, which should only contain a single recipe in the 8932 // main block. 8933 auto *SplitBlock = 8934 Target->getParent()->splitAt(std::next(Target->getIterator())); 8935 8936 auto *SplitPred = SplitBlock->getSinglePredecessor(); 8937 8938 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 8939 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 8940 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 8941 } 8942 } 8943 8944 VPlanTransforms::removeRedundantCanonicalIVs(*Plan); 8945 VPlanTransforms::removeRedundantInductionCasts(*Plan); 8946 8947 // Now that sink-after is done, move induction recipes for optimized truncates 8948 // to the phi section of the header block. 8949 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 8950 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 8951 8952 // Adjust the recipes for any inloop reductions. 8953 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan, 8954 RecipeBuilder, Range.Start); 8955 8956 // Introduce a recipe to combine the incoming and previous values of a 8957 // first-order recurrence. 8958 for (VPRecipeBase &R : 8959 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 8960 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 8961 if (!RecurPhi) 8962 continue; 8963 8964 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 8965 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 8966 auto *Region = GetReplicateRegion(PrevRecipe); 8967 if (Region) 8968 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 8969 if (Region || PrevRecipe->isPhi()) 8970 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 8971 else 8972 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 8973 8974 auto *RecurSplice = cast<VPInstruction>( 8975 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 8976 {RecurPhi, RecurPhi->getBackedgeValue()})); 8977 8978 RecurPhi->replaceAllUsesWith(RecurSplice); 8979 // Set the first operand of RecurSplice to RecurPhi again, after replacing 8980 // all users. 8981 RecurSplice->setOperand(0, RecurPhi); 8982 } 8983 8984 // Interleave memory: for each Interleave Group we marked earlier as relevant 8985 // for this VPlan, replace the Recipes widening its memory instructions with a 8986 // single VPInterleaveRecipe at its insertion point. 
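  // For example, an interleave group of factor 2 over loads of a[2*i] and
  // a[2*i+1] has the two VPWidenMemoryInstructionRecipes replaced by a single
  // VPInterleaveRecipe that performs one wide load and de-interleaves it into
  // the two member values.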
8987 for (auto IG : InterleaveGroups) { 8988 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8989 RecipeBuilder.getRecipe(IG->getInsertPos())); 8990 SmallVector<VPValue *, 4> StoredValues; 8991 for (unsigned i = 0; i < IG->getFactor(); ++i) 8992 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 8993 auto *StoreR = 8994 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 8995 StoredValues.push_back(StoreR->getStoredValue()); 8996 } 8997 8998 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8999 Recipe->getMask()); 9000 VPIG->insertBefore(Recipe); 9001 unsigned J = 0; 9002 for (unsigned i = 0; i < IG->getFactor(); ++i) 9003 if (Instruction *Member = IG->getMember(i)) { 9004 if (!Member->getType()->isVoidTy()) { 9005 VPValue *OriginalV = Plan->getVPValue(Member); 9006 Plan->removeVPValueFor(Member); 9007 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9008 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9009 J++; 9010 } 9011 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9012 } 9013 } 9014 9015 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9016 // in ways that accessing values using original IR values is incorrect. 9017 Plan->disableValue2VPValue(); 9018 9019 VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); 9020 VPlanTransforms::sinkScalarOperands(*Plan); 9021 VPlanTransforms::mergeReplicateRegions(*Plan); 9022 VPlanTransforms::removeDeadRecipes(*Plan); 9023 VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); 9024 9025 std::string PlanName; 9026 raw_string_ostream RSO(PlanName); 9027 ElementCount VF = Range.Start; 9028 Plan->addVF(VF); 9029 RSO << "Initial VPlan for VF={" << VF; 9030 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9031 Plan->addVF(VF); 9032 RSO << "," << VF; 9033 } 9034 RSO << "},UF>=1"; 9035 RSO.flush(); 9036 Plan->setName(PlanName); 9037 9038 // Fold Exit block into its predecessor if possible. 9039 // TODO: Fold block earlier once all VPlan transforms properly maintain a 9040 // VPBasicBlock as exit. 9041 VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting()); 9042 9043 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 9044 return Plan; 9045 } 9046 9047 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9048 // Outer loop handling: They may require CFG and instruction level 9049 // transformations before even evaluating whether vectorization is profitable. 9050 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9051 // the vectorization pipeline. 9052 assert(!OrigLoop->isInnermost()); 9053 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9054 9055 // Create new empty VPlan 9056 auto Plan = std::make_unique<VPlan>(); 9057 9058 // Build hierarchical CFG 9059 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9060 HCFGBuilder.buildHierarchicalCFG(); 9061 9062 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9063 VF *= 2) 9064 Plan->addVF(VF); 9065 9066 SmallPtrSet<Instruction *, 1> DeadInstructions; 9067 VPlanTransforms::VPInstructionsToVPRecipes( 9068 OrigLoop, Plan, 9069 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 9070 DeadInstructions, *PSE.getSE()); 9071 9072 // Remove the existing terminator of the exiting block of the top-most region. 9073 // A BranchOnCount will be added instead when adding the canonical IV recipes. 
  auto *Term =
      Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
  Term->eraseFromParent();

  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
                        true);
  return Plan;
}

// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi needs to be
// converted to reductions, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and live-out recipes when folding the tail.
void LoopVectorizationPlanner::adjustRecipesForReductions(
    VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
    ElementCount MinVF) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    const RecurrenceDescriptor &RdxDesc =
        Legal->getReductionVars().find(Phi)->second;
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
      continue;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
             "Only min/max recurrences allowed for inloop reductions");
      // Recognize a call to the llvm.fmuladd intrinsic.
      bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
      assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
             "Expected instruction to be a call to the llvm.fmuladd intrinsic");
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSelectSC");
        FirstOpId = 1;
      } else {
        assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
                (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
               "Expected to replace a VPWidenSC");
        FirstOpId = 0;
      }
      unsigned VecOpId =
          R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
      VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));

      auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
                         ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
                         : nullptr;

      if (IsFMulAdd) {
        // If the instruction is a call to the llvm.fmuladd intrinsic then we
        // need to create an fmul recipe to use as the vector operand for the
        // fadd reduction.
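        // E.g. acc = call @llvm.fmuladd(a, b, acc) becomes a widened fmul of a
        // and b feeding an in-loop fadd reduction of the accumulator.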
9139 VPInstruction *FMulRecipe = new VPInstruction( 9140 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9141 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9142 WidenRecipe->getParent()->insert(FMulRecipe, 9143 WidenRecipe->getIterator()); 9144 VecOp = FMulRecipe; 9145 } 9146 VPReductionRecipe *RedRecipe = 9147 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9148 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9149 Plan->removeVPValueFor(R); 9150 Plan->addVPValue(R, RedRecipe); 9151 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9152 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9153 WidenRecipe->eraseFromParent(); 9154 9155 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9156 VPRecipeBase *CompareRecipe = 9157 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9158 assert(isa<VPWidenRecipe>(CompareRecipe) && 9159 "Expected to replace a VPWidenSC"); 9160 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9161 "Expected no remaining users"); 9162 CompareRecipe->eraseFromParent(); 9163 } 9164 Chain = R; 9165 } 9166 } 9167 9168 // If tail is folded by masking, introduce selects between the phi 9169 // and the live-out instruction of each reduction, at the beginning of the 9170 // dedicated latch block. 9171 if (CM.foldTailByMasking()) { 9172 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9173 for (VPRecipeBase &R : 9174 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9175 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9176 if (!PhiR || PhiR->isInLoop()) 9177 continue; 9178 VPValue *Cond = 9179 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9180 VPValue *Red = PhiR->getBackedgeValue(); 9181 assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB && 9182 "reduction recipe must be defined before latch"); 9183 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9184 } 9185 } 9186 } 9187 9188 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9189 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9190 VPSlotTracker &SlotTracker) const { 9191 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9192 IG->getInsertPos()->printAsOperand(O, false); 9193 O << ", "; 9194 getAddr()->printAsOperand(O, SlotTracker); 9195 VPValue *Mask = getMask(); 9196 if (Mask) { 9197 O << ", "; 9198 Mask->printAsOperand(O, SlotTracker); 9199 } 9200 9201 unsigned OpIdx = 0; 9202 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9203 if (!IG->getMember(i)) 9204 continue; 9205 if (getNumStoreOperands() > 0) { 9206 O << "\n" << Indent << " store "; 9207 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9208 O << " to index " << i; 9209 } else { 9210 O << "\n" << Indent << " "; 9211 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9212 O << " = load from index " << i; 9213 } 9214 ++OpIdx; 9215 } 9216 } 9217 #endif 9218 9219 void VPWidenCallRecipe::execute(VPTransformState &State) { 9220 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9221 *this, State); 9222 } 9223 9224 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9225 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9226 State.ILV->setDebugLocFromInst(&I); 9227 9228 // The condition can be loop invariant but still defined inside the 9229 // loop. This means that we can't just use the original 'cond' value. 9230 // We have to take the 'vectorized' value and pick the first lane. 
9231 // Instcombine will make this a no-op. 9232 auto *InvarCond = 9233 InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; 9234 9235 for (unsigned Part = 0; Part < State.UF; ++Part) { 9236 Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part); 9237 Value *Op0 = State.get(getOperand(1), Part); 9238 Value *Op1 = State.get(getOperand(2), Part); 9239 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 9240 State.set(this, Sel, Part); 9241 State.ILV->addMetadata(Sel, &I); 9242 } 9243 } 9244 9245 void VPWidenRecipe::execute(VPTransformState &State) { 9246 auto &I = *cast<Instruction>(getUnderlyingValue()); 9247 auto &Builder = State.Builder; 9248 switch (I.getOpcode()) { 9249 case Instruction::Call: 9250 case Instruction::Br: 9251 case Instruction::PHI: 9252 case Instruction::GetElementPtr: 9253 case Instruction::Select: 9254 llvm_unreachable("This instruction is handled by a different recipe."); 9255 case Instruction::UDiv: 9256 case Instruction::SDiv: 9257 case Instruction::SRem: 9258 case Instruction::URem: 9259 case Instruction::Add: 9260 case Instruction::FAdd: 9261 case Instruction::Sub: 9262 case Instruction::FSub: 9263 case Instruction::FNeg: 9264 case Instruction::Mul: 9265 case Instruction::FMul: 9266 case Instruction::FDiv: 9267 case Instruction::FRem: 9268 case Instruction::Shl: 9269 case Instruction::LShr: 9270 case Instruction::AShr: 9271 case Instruction::And: 9272 case Instruction::Or: 9273 case Instruction::Xor: { 9274 // Just widen unops and binops. 9275 State.ILV->setDebugLocFromInst(&I); 9276 9277 for (unsigned Part = 0; Part < State.UF; ++Part) { 9278 SmallVector<Value *, 2> Ops; 9279 for (VPValue *VPOp : operands()) 9280 Ops.push_back(State.get(VPOp, Part)); 9281 9282 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 9283 9284 if (auto *VecOp = dyn_cast<Instruction>(V)) { 9285 VecOp->copyIRFlags(&I); 9286 9287 // If the instruction is vectorized and was in a basic block that needed 9288 // predication, we can't propagate poison-generating flags (nuw/nsw, 9289 // exact, etc.). The control flow has been linearized and the 9290 // instruction is no longer guarded by the predicate, which could make 9291 // the flag properties to no longer hold. 9292 if (State.MayGeneratePoisonRecipes.contains(this)) 9293 VecOp->dropPoisonGeneratingFlags(); 9294 } 9295 9296 // Use this vector value for all users of the original instruction. 9297 State.set(this, V, Part); 9298 State.ILV->addMetadata(V, &I); 9299 } 9300 9301 break; 9302 } 9303 case Instruction::Freeze: { 9304 State.ILV->setDebugLocFromInst(&I); 9305 9306 for (unsigned Part = 0; Part < State.UF; ++Part) { 9307 Value *Op = State.get(getOperand(0), Part); 9308 9309 Value *Freeze = Builder.CreateFreeze(Op); 9310 State.set(this, Freeze, Part); 9311 } 9312 break; 9313 } 9314 case Instruction::ICmp: 9315 case Instruction::FCmp: { 9316 // Widen compares. Generate vector compares. 9317 bool FCmp = (I.getOpcode() == Instruction::FCmp); 9318 auto *Cmp = cast<CmpInst>(&I); 9319 State.ILV->setDebugLocFromInst(Cmp); 9320 for (unsigned Part = 0; Part < State.UF; ++Part) { 9321 Value *A = State.get(getOperand(0), Part); 9322 Value *B = State.get(getOperand(1), Part); 9323 Value *C = nullptr; 9324 if (FCmp) { 9325 // Propagate fast math flags. 
9326 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9327 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9328 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9329 } else { 9330 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9331 } 9332 State.set(this, C, Part); 9333 State.ILV->addMetadata(C, &I); 9334 } 9335 9336 break; 9337 } 9338 9339 case Instruction::ZExt: 9340 case Instruction::SExt: 9341 case Instruction::FPToUI: 9342 case Instruction::FPToSI: 9343 case Instruction::FPExt: 9344 case Instruction::PtrToInt: 9345 case Instruction::IntToPtr: 9346 case Instruction::SIToFP: 9347 case Instruction::UIToFP: 9348 case Instruction::Trunc: 9349 case Instruction::FPTrunc: 9350 case Instruction::BitCast: { 9351 auto *CI = cast<CastInst>(&I); 9352 State.ILV->setDebugLocFromInst(CI); 9353 9354 /// Vectorize casts. 9355 Type *DestTy = (State.VF.isScalar()) 9356 ? CI->getType() 9357 : VectorType::get(CI->getType(), State.VF); 9358 9359 for (unsigned Part = 0; Part < State.UF; ++Part) { 9360 Value *A = State.get(getOperand(0), Part); 9361 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9362 State.set(this, Cast, Part); 9363 State.ILV->addMetadata(Cast, &I); 9364 } 9365 break; 9366 } 9367 default: 9368 // This instruction is not vectorized by simple widening. 9369 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9370 llvm_unreachable("Unhandled instruction!"); 9371 } // end of switch. 9372 } 9373 9374 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9375 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9376 // Construct a vector GEP by widening the operands of the scalar GEP as 9377 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9378 // results in a vector of pointers when at least one operand of the GEP 9379 // is vector-typed. Thus, to keep the representation compact, we only use 9380 // vector-typed operands for loop-varying values. 9381 9382 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9383 // If we are vectorizing, but the GEP has only loop-invariant operands, 9384 // the GEP we build (by only using vector-typed operands for 9385 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9386 // produce a vector of pointers, we need to either arbitrarily pick an 9387 // operand to broadcast, or broadcast a clone of the original GEP. 9388 // Here, we broadcast a clone of the original. 9389 // 9390 // TODO: If at some point we decide to scalarize instructions having 9391 // loop-invariant operands, this special case will no longer be 9392 // required. We would add the scalarization decision to 9393 // collectLoopScalars() and teach getVectorValue() to broadcast 9394 // the lane-zero scalar value. 9395 auto *Clone = State.Builder.Insert(GEP->clone()); 9396 for (unsigned Part = 0; Part < State.UF; ++Part) { 9397 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9398 State.set(this, EntryPart, Part); 9399 State.ILV->addMetadata(EntryPart, GEP); 9400 } 9401 } else { 9402 // If the GEP has at least one loop-varying operand, we are sure to 9403 // produce a vector of pointers. But if we are only unrolling, we want 9404 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9405 // produce with the code below will be scalar (if VF == 1) or vector 9406 // (otherwise). Note that for the unroll-only case, we still maintain 9407 // values in the vector mapping with initVector, as we do for other 9408 // instructions. 
9409 for (unsigned Part = 0; Part < State.UF; ++Part) { 9410 // The pointer operand of the new GEP. If it's loop-invariant, we 9411 // won't broadcast it. 9412 auto *Ptr = IsPtrLoopInvariant 9413 ? State.get(getOperand(0), VPIteration(0, 0)) 9414 : State.get(getOperand(0), Part); 9415 9416 // Collect all the indices for the new GEP. If any index is 9417 // loop-invariant, we won't broadcast it. 9418 SmallVector<Value *, 4> Indices; 9419 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 9420 VPValue *Operand = getOperand(I); 9421 if (IsIndexLoopInvariant[I - 1]) 9422 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 9423 else 9424 Indices.push_back(State.get(Operand, Part)); 9425 } 9426 9427 // If the GEP instruction is vectorized and was in a basic block that 9428 // needed predication, we can't propagate the poison-generating 'inbounds' 9429 // flag. The control flow has been linearized and the GEP is no longer 9430 // guarded by the predicate, which could make the 'inbounds' properties to 9431 // no longer hold. 9432 bool IsInBounds = 9433 GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; 9434 9435 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 9436 // but it should be a vector, otherwise. 9437 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, 9438 Indices, "", IsInBounds); 9439 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 9440 "NewGEP is not a pointer vector"); 9441 State.set(this, NewGEP, Part); 9442 State.ILV->addMetadata(NewGEP, GEP); 9443 } 9444 } 9445 } 9446 9447 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9448 assert(!State.Instance && "Int or FP induction being replicated."); 9449 9450 Value *Start = getStartValue()->getLiveInIRValue(); 9451 const InductionDescriptor &ID = getInductionDescriptor(); 9452 TruncInst *Trunc = getTruncInst(); 9453 IRBuilderBase &Builder = State.Builder; 9454 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 9455 assert(State.VF.isVector() && "must have vector VF"); 9456 9457 // The value from the original loop to which we are mapping the new induction 9458 // variable. 9459 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 9460 9461 // Fast-math-flags propagate from the original induction instruction. 9462 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9463 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 9464 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 9465 9466 // Now do the actual transformations, and start with fetching the step value. 
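  // The widened IV is a vector phi seeded in the preheader with
  // splat(Start) + <0, 1, ..., VF-1> * Step; each unrolled part then adds
  // splat(VF * Step) ("step.add") on top of the previous part.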
9467 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9468 9469 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 9470 "Expected either an induction phi-node or a truncate of it!"); 9471 9472 // Construct the initial value of the vector IV in the vector loop preheader 9473 auto CurrIP = Builder.saveIP(); 9474 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9475 Builder.SetInsertPoint(VectorPH->getTerminator()); 9476 if (isa<TruncInst>(EntryVal)) { 9477 assert(Start->getType()->isIntegerTy() && 9478 "Truncation requires an integer type"); 9479 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 9480 Step = Builder.CreateTrunc(Step, TruncType); 9481 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 9482 } 9483 9484 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 9485 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 9486 Value *SteppedStart = getStepVector( 9487 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); 9488 9489 // We create vector phi nodes for both integer and floating-point induction 9490 // variables. Here, we determine the kind of arithmetic we will perform. 9491 Instruction::BinaryOps AddOp; 9492 Instruction::BinaryOps MulOp; 9493 if (Step->getType()->isIntegerTy()) { 9494 AddOp = Instruction::Add; 9495 MulOp = Instruction::Mul; 9496 } else { 9497 AddOp = ID.getInductionOpcode(); 9498 MulOp = Instruction::FMul; 9499 } 9500 9501 // Multiply the vectorization factor by the step using integer or 9502 // floating-point arithmetic as appropriate. 9503 Type *StepType = Step->getType(); 9504 Value *RuntimeVF; 9505 if (Step->getType()->isFloatingPointTy()) 9506 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 9507 else 9508 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 9509 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 9510 9511 // Create a vector splat to use in the induction update. 9512 // 9513 // FIXME: If the step is non-constant, we create the vector splat with 9514 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 9515 // handle a constant vector splat. 9516 Value *SplatVF = isa<Constant>(Mul) 9517 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 9518 : Builder.CreateVectorSplat(State.VF, Mul); 9519 Builder.restoreIP(CurrIP); 9520 9521 // We may need to add the step a number of times, depending on the unroll 9522 // factor. The last of those goes into the PHI. 9523 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 9524 &*State.CFG.PrevBB->getFirstInsertionPt()); 9525 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 9526 Instruction *LastInduction = VecInd; 9527 for (unsigned Part = 0; Part < State.UF; ++Part) { 9528 State.set(this, LastInduction, Part); 9529 9530 if (isa<TruncInst>(EntryVal)) 9531 State.ILV->addMetadata(LastInduction, EntryVal); 9532 9533 LastInduction = cast<Instruction>( 9534 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 9535 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 9536 } 9537 9538 LastInduction->setName("vec.ind.next"); 9539 VecInd->addIncoming(SteppedStart, VectorPH); 9540 // Add induction update using an incorrect block temporarily. The phi node 9541 // will be fixed after VPlan execution. Note that at this point the latch 9542 // block cannot be used, as it does not exist yet. 9543 // TODO: Model increment value in VPlan, by turning the recipe into a 9544 // multi-def and a subclass of VPHeaderPHIRecipe. 
9545 VecInd->addIncoming(LastInduction, VectorPH); 9546 } 9547 9548 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 9549 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 9550 "Not a pointer induction according to InductionDescriptor!"); 9551 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 9552 "Unexpected type."); 9553 9554 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 9555 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 9556 9557 if (onlyScalarsGenerated(State.VF)) { 9558 // This is the normalized GEP that starts counting at zero. 9559 Value *PtrInd = State.Builder.CreateSExtOrTrunc( 9560 CanonicalIV, IndDesc.getStep()->getType()); 9561 // Determine the number of scalars we need to generate for each unroll 9562 // iteration. If the instruction is uniform, we only need to generate the 9563 // first lane. Otherwise, we generate all VF values. 9564 bool IsUniform = vputils::onlyFirstLaneUsed(this); 9565 assert((IsUniform || !State.VF.isScalable()) && 9566 "Cannot scalarize a scalable VF"); 9567 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 9568 9569 for (unsigned Part = 0; Part < State.UF; ++Part) { 9570 Value *PartStart = 9571 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); 9572 9573 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 9574 Value *Idx = State.Builder.CreateAdd( 9575 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 9576 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); 9577 9578 Value *Step = CreateStepValue(IndDesc.getStep(), SE, 9579 State.CFG.PrevBB->getTerminator()); 9580 Value *SclrGep = emitTransformedIndex( 9581 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); 9582 SclrGep->setName("next.gep"); 9583 State.set(this, SclrGep, VPIteration(Part, Lane)); 9584 } 9585 } 9586 return; 9587 } 9588 9589 assert(isa<SCEVConstant>(IndDesc.getStep()) && 9590 "Induction step not a SCEV constant!"); 9591 Type *PhiType = IndDesc.getStep()->getType(); 9592 9593 // Build a pointer phi 9594 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 9595 Type *ScStValueType = ScalarStartValue->getType(); 9596 PHINode *NewPointerPhi = 9597 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 9598 9599 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9600 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 9601 9602 // A pointer induction, performed by using a gep 9603 const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout(); 9604 Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); 9605 9606 const SCEV *ScalarStep = IndDesc.getStep(); 9607 SCEVExpander Exp(SE, DL, "induction"); 9608 Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 9609 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 9610 Value *NumUnrolledElems = 9611 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 9612 Value *InductionGEP = GetElementPtrInst::Create( 9613 IndDesc.getElementType(), NewPointerPhi, 9614 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 9615 InductionLoc); 9616 // Add induction update using an incorrect block temporarily. The phi node 9617 // will be fixed after VPlan execution. Note that at this point the latch 9618 // block cannot be used, as it does not exist yet. 9619 // TODO: Model increment value in VPlan, by turning the recipe into a 9620 // multi-def and a subclass of VPHeaderPHIRecipe. 
9621 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 9622 9623 // Create UF many actual address geps that use the pointer 9624 // phi as base and a vectorized version of the step value 9625 // (<step*0, ..., step*N>) as offset. 9626 for (unsigned Part = 0; Part < State.UF; ++Part) { 9627 Type *VecPhiType = VectorType::get(PhiType, State.VF); 9628 Value *StartOffsetScalar = 9629 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 9630 Value *StartOffset = 9631 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 9632 // Create a vector of consecutive numbers from zero to VF. 9633 StartOffset = State.Builder.CreateAdd( 9634 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 9635 9636 Value *GEP = State.Builder.CreateGEP( 9637 IndDesc.getElementType(), NewPointerPhi, 9638 State.Builder.CreateMul( 9639 StartOffset, 9640 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), 9641 "vector.gep")); 9642 State.set(this, GEP, Part); 9643 } 9644 } 9645 9646 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 9647 assert(!State.Instance && "VPScalarIVStepsRecipe being replicated."); 9648 9649 // Fast-math-flags propagate from the original induction instruction. 9650 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9651 if (IndDesc.getInductionBinOp() && 9652 isa<FPMathOperator>(IndDesc.getInductionBinOp())) 9653 State.Builder.setFastMathFlags( 9654 IndDesc.getInductionBinOp()->getFastMathFlags()); 9655 9656 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9657 auto CreateScalarIV = [&](Value *&Step) -> Value * { 9658 Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0)); 9659 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); 9660 if (!isCanonical() || CanonicalIV->getType() != Ty) { 9661 ScalarIV = 9662 Ty->isIntegerTy() 9663 ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty) 9664 : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty); 9665 ScalarIV = emitTransformedIndex(State.Builder, ScalarIV, 9666 getStartValue()->getLiveInIRValue(), Step, 9667 IndDesc); 9668 ScalarIV->setName("offset.idx"); 9669 } 9670 if (TruncToTy) { 9671 assert(Step->getType()->isIntegerTy() && 9672 "Truncation requires an integer step"); 9673 ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy); 9674 Step = State.Builder.CreateTrunc(Step, TruncToTy); 9675 } 9676 return ScalarIV; 9677 }; 9678 9679 Value *ScalarIV = CreateScalarIV(Step); 9680 if (State.VF.isVector()) { 9681 buildScalarSteps(ScalarIV, Step, IndDesc, this, State); 9682 return; 9683 } 9684 9685 for (unsigned Part = 0; Part < State.UF; ++Part) { 9686 assert(!State.VF.isScalable() && "scalable vectors not yet supported."); 9687 Value *EntryPart; 9688 if (Step->getType()->isFloatingPointTy()) { 9689 Value *StartIdx = 9690 getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part); 9691 // Floating-point operations inherit FMF via the builder's flags. 
9692 Value *MulOp = State.Builder.CreateFMul(StartIdx, Step); 9693 EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(), 9694 ScalarIV, MulOp); 9695 } else { 9696 Value *StartIdx = 9697 getRuntimeVF(State.Builder, Step->getType(), State.VF * Part); 9698 EntryPart = State.Builder.CreateAdd( 9699 ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction"); 9700 } 9701 State.set(this, EntryPart, Part); 9702 } 9703 } 9704 9705 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9706 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9707 State); 9708 } 9709 9710 void VPBlendRecipe::execute(VPTransformState &State) { 9711 State.ILV->setDebugLocFromInst(Phi); 9712 // We know that all PHIs in non-header blocks are converted into 9713 // selects, so we don't have to worry about the insertion order and we 9714 // can just use the builder. 9715 // At this point we generate the predication tree. There may be 9716 // duplications since this is a simple recursive scan, but future 9717 // optimizations will clean it up. 9718 9719 unsigned NumIncoming = getNumIncomingValues(); 9720 9721 // Generate a sequence of selects of the form: 9722 // SELECT(Mask3, In3, 9723 // SELECT(Mask2, In2, 9724 // SELECT(Mask1, In1, 9725 // In0))) 9726 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9727 // are essentially undef are taken from In0. 9728 InnerLoopVectorizer::VectorParts Entry(State.UF); 9729 for (unsigned In = 0; In < NumIncoming; ++In) { 9730 for (unsigned Part = 0; Part < State.UF; ++Part) { 9731 // We might have single edge PHIs (blocks) - use an identity 9732 // 'select' for the first PHI operand. 9733 Value *In0 = State.get(getIncomingValue(In), Part); 9734 if (In == 0) 9735 Entry[Part] = In0; // Initialize with the first incoming value. 9736 else { 9737 // Select between the current value and the previous incoming edge 9738 // based on the incoming mask. 9739 Value *Cond = State.get(getMask(In), Part); 9740 Entry[Part] = 9741 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9742 } 9743 } 9744 } 9745 for (unsigned Part = 0; Part < State.UF; ++Part) 9746 State.set(this, Entry[Part], Part); 9747 } 9748 9749 void VPInterleaveRecipe::execute(VPTransformState &State) { 9750 assert(!State.Instance && "Interleave group being replicated."); 9751 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9752 getStoredValues(), getMask()); 9753 } 9754 9755 void VPReductionRecipe::execute(VPTransformState &State) { 9756 assert(!State.Instance && "Reduction being replicated."); 9757 Value *PrevInChain = State.get(getChainOp(), 0); 9758 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9759 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9760 // Propagate the fast-math flags carried by the underlying instruction. 
9761 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9762 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9763 for (unsigned Part = 0; Part < State.UF; ++Part) { 9764 Value *NewVecOp = State.get(getVecOp(), Part); 9765 if (VPValue *Cond = getCondOp()) { 9766 Value *NewCond = State.get(Cond, Part); 9767 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9768 Value *Iden = RdxDesc->getRecurrenceIdentity( 9769 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9770 Value *IdenVec = 9771 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9772 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9773 NewVecOp = Select; 9774 } 9775 Value *NewRed; 9776 Value *NextInChain; 9777 if (IsOrdered) { 9778 if (State.VF.isVector()) 9779 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9780 PrevInChain); 9781 else 9782 NewRed = State.Builder.CreateBinOp( 9783 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9784 NewVecOp); 9785 PrevInChain = NewRed; 9786 } else { 9787 PrevInChain = State.get(getChainOp(), Part); 9788 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9789 } 9790 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9791 NextInChain = 9792 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9793 NewRed, PrevInChain); 9794 } else if (IsOrdered) 9795 NextInChain = NewRed; 9796 else 9797 NextInChain = State.Builder.CreateBinOp( 9798 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9799 PrevInChain); 9800 State.set(this, NextInChain, Part); 9801 } 9802 } 9803 9804 void VPReplicateRecipe::execute(VPTransformState &State) { 9805 if (State.Instance) { // Generate a single instance. 9806 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9807 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance, 9808 IsPredicated, State); 9809 // Insert the scalar instance, packing it into a vector. 9810 if (AlsoPack && State.VF.isVector()) { 9811 // If we're constructing lane 0, initialize to start from poison. 9812 if (State.Instance->Lane.isFirstLane()) { 9813 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 9814 Value *Poison = PoisonValue::get( 9815 VectorType::get(getUnderlyingValue()->getType(), State.VF)); 9816 State.set(this, Poison, State.Instance->Part); 9817 } 9818 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State); 9819 } 9820 return; 9821 } 9822 9823 // Generate scalar instances for all VF lanes of all UF parts, unless the 9824 // instruction is uniform, in which case generate only the first lane for each 9825 // of the UF parts. 9826 unsigned EndLane = IsUniform ?
1 : State.VF.getKnownMinValue(); 9827 assert((!State.VF.isScalable() || IsUniform) && 9828 "Can't scalarize a scalable vector"); 9829 for (unsigned Part = 0; Part < State.UF; ++Part) 9830 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9831 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, 9832 VPIteration(Part, Lane), IsPredicated, 9833 State); 9834 } 9835 9836 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 9837 assert(State.Instance && "Branch on Mask works only on single instance."); 9838 9839 unsigned Part = State.Instance->Part; 9840 unsigned Lane = State.Instance->Lane.getKnownLane(); 9841 9842 Value *ConditionBit = nullptr; 9843 VPValue *BlockInMask = getMask(); 9844 if (BlockInMask) { 9845 ConditionBit = State.get(BlockInMask, Part); 9846 if (ConditionBit->getType()->isVectorTy()) 9847 ConditionBit = State.Builder.CreateExtractElement( 9848 ConditionBit, State.Builder.getInt32(Lane)); 9849 } else // Block in mask is all-one. 9850 ConditionBit = State.Builder.getTrue(); 9851 9852 // Replace the temporary unreachable terminator with a new conditional branch, 9853 // whose two destinations will be set later when they are created. 9854 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 9855 assert(isa<UnreachableInst>(CurrentTerminator) && 9856 "Expected to replace unreachable terminator with conditional branch."); 9857 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 9858 CondBr->setSuccessor(0, nullptr); 9859 ReplaceInstWithInst(CurrentTerminator, CondBr); 9860 } 9861 9862 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 9863 assert(State.Instance && "Predicated instruction PHI works per instance."); 9864 Instruction *ScalarPredInst = 9865 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 9866 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 9867 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 9868 assert(PredicatingBB && "Predicated block has no single predecessor."); 9869 assert(isa<VPReplicateRecipe>(getOperand(0)) && 9870 "operand must be VPReplicateRecipe"); 9871 9872 // By current pack/unpack logic we need to generate only a single phi node: if 9873 // a vector value for the predicated instruction exists at this point it means 9874 // the instruction has vector users only, and a phi for the vector value is 9875 // needed. In this case the recipe of the predicated instruction is marked to 9876 // also do that packing, thereby "hoisting" the insert-element sequence. 9877 // Otherwise, a phi node for the scalar value is needed. 9878 unsigned Part = State.Instance->Part; 9879 if (State.hasVectorValue(getOperand(0), Part)) { 9880 Value *VectorValue = State.get(getOperand(0), Part); 9881 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9882 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9883 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9884 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9885 if (State.hasVectorValue(this, Part)) 9886 State.reset(this, VPhi, Part); 9887 else 9888 State.set(this, VPhi, Part); 9889 // NOTE: Currently we need to update the value of the operand, so the next 9890 // predicated iteration inserts its generated value in the correct vector. 
9891 State.reset(getOperand(0), VPhi, Part); 9892 } else { 9893 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9894 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9895 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9896 PredicatingBB); 9897 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9898 if (State.hasScalarValue(this, *State.Instance)) 9899 State.reset(this, Phi, *State.Instance); 9900 else 9901 State.set(this, Phi, *State.Instance); 9902 // NOTE: Currently we need to update the value of the operand, so the next 9903 // predicated iteration inserts its generated value in the correct vector. 9904 State.reset(getOperand(0), Phi, *State.Instance); 9905 } 9906 } 9907 9908 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9909 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9910 9911 // Attempt to issue a wide load. 9912 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9913 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9914 9915 assert((LI || SI) && "Invalid Load/Store instruction"); 9916 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9917 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9918 9919 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9920 9921 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9922 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9923 bool CreateGatherScatter = !Consecutive; 9924 9925 auto &Builder = State.Builder; 9926 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9927 bool isMaskRequired = getMask(); 9928 if (isMaskRequired) 9929 for (unsigned Part = 0; Part < State.UF; ++Part) 9930 BlockInMaskParts[Part] = State.get(getMask(), Part); 9931 9932 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9933 // Calculate the pointer for the specific unroll-part. 9934 GetElementPtrInst *PartPtr = nullptr; 9935 9936 bool InBounds = false; 9937 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9938 InBounds = gep->isInBounds(); 9939 if (Reverse) { 9940 // If the address is consecutive but reversed, then the 9941 // wide store needs to start at the last vector element. 9942 // RunTimeVF = VScale * VF.getKnownMinValue() 9943 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9944 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 9945 // NumElt = -Part * RunTimeVF 9946 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 9947 // LastLane = 1 - RunTimeVF 9948 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 9949 PartPtr = 9950 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 9951 PartPtr->setIsInBounds(InBounds); 9952 PartPtr = cast<GetElementPtrInst>( 9953 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 9954 PartPtr->setIsInBounds(InBounds); 9955 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
9956 BlockInMaskParts[Part] = 9957 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9958 } else { 9959 Value *Increment = 9960 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 9961 PartPtr = cast<GetElementPtrInst>( 9962 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 9963 PartPtr->setIsInBounds(InBounds); 9964 } 9965 9966 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 9967 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 9968 }; 9969 9970 // Handle Stores: 9971 if (SI) { 9972 State.ILV->setDebugLocFromInst(SI); 9973 9974 for (unsigned Part = 0; Part < State.UF; ++Part) { 9975 Instruction *NewSI = nullptr; 9976 Value *StoredVal = State.get(StoredValue, Part); 9977 if (CreateGatherScatter) { 9978 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9979 Value *VectorGep = State.get(getAddr(), Part); 9980 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9981 MaskPart); 9982 } else { 9983 if (Reverse) { 9984 // If we store to reverse consecutive memory locations, then we need 9985 // to reverse the order of elements in the stored value. 9986 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9987 // We don't want to update the value in the map as it might be used in 9988 // another expression. So don't call resetVectorValue(StoredVal). 9989 } 9990 auto *VecPtr = 9991 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9992 if (isMaskRequired) 9993 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9994 BlockInMaskParts[Part]); 9995 else 9996 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9997 } 9998 State.ILV->addMetadata(NewSI, SI); 9999 } 10000 return; 10001 } 10002 10003 // Handle loads. 10004 assert(LI && "Must have a load instruction"); 10005 State.ILV->setDebugLocFromInst(LI); 10006 for (unsigned Part = 0; Part < State.UF; ++Part) { 10007 Value *NewLI; 10008 if (CreateGatherScatter) { 10009 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 10010 Value *VectorGep = State.get(getAddr(), Part); 10011 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 10012 nullptr, "wide.masked.gather"); 10013 State.ILV->addMetadata(NewLI, LI); 10014 } else { 10015 auto *VecPtr = 10016 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10017 if (isMaskRequired) 10018 NewLI = Builder.CreateMaskedLoad( 10019 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 10020 PoisonValue::get(DataTy), "wide.masked.load"); 10021 else 10022 NewLI = 10023 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 10024 10025 // Add metadata to the load, but setVectorValue to the reverse shuffle. 10026 State.ILV->addMetadata(NewLI, LI); 10027 if (Reverse) 10028 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 10029 } 10030 10031 State.set(getVPSingleValue(), NewLI, Part); 10032 } 10033 } 10034 10035 // Determine how to lower the scalar epilogue, which depends on 1) optimising 10036 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 10037 // predication, and 4) a TTI hook that analyses whether the loop is suitable 10038 // for predication. 
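//
// The four checks below are applied in that order, and the first one that
// matches decides the lowering. For example (illustrative, not exhaustive):
// a function carrying the optsize attribute yields
// CM_ScalarEpilogueNotAllowedOptSize regardless of any hints, while an
// explicit PreferPredicateTy::PredicateOrDontVectorize request yields
// CM_ScalarEpilogueNotAllowedUsePredicate before the loop hints or the TTI
// hook are ever consulted.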
10039 static ScalarEpilogueLowering getScalarEpilogueLowering( 10040 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 10041 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10042 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 10043 LoopVectorizationLegality &LVL) { 10044 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10045 // don't look at hints or options, and don't request a scalar epilogue. 10046 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10047 // LoopAccessInfo (due to code dependency and not being able to reliably get 10048 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10049 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10050 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10051 // back to the old way and vectorize with versioning when forced. See D81345.) 10052 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10053 PGSOQueryType::IRPass) && 10054 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10055 return CM_ScalarEpilogueNotAllowedOptSize; 10056 10057 // 2) If set, obey the directives 10058 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10059 switch (PreferPredicateOverEpilogue) { 10060 case PreferPredicateTy::ScalarEpilogue: 10061 return CM_ScalarEpilogueAllowed; 10062 case PreferPredicateTy::PredicateElseScalarEpilogue: 10063 return CM_ScalarEpilogueNotNeededUsePredicate; 10064 case PreferPredicateTy::PredicateOrDontVectorize: 10065 return CM_ScalarEpilogueNotAllowedUsePredicate; 10066 }; 10067 } 10068 10069 // 3) If set, obey the hints 10070 switch (Hints.getPredicate()) { 10071 case LoopVectorizeHints::FK_Enabled: 10072 return CM_ScalarEpilogueNotNeededUsePredicate; 10073 case LoopVectorizeHints::FK_Disabled: 10074 return CM_ScalarEpilogueAllowed; 10075 }; 10076 10077 // 4) if the TTI hook indicates this is profitable, request predication. 10078 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10079 LVL.getLAI())) 10080 return CM_ScalarEpilogueNotNeededUsePredicate; 10081 10082 return CM_ScalarEpilogueAllowed; 10083 } 10084 10085 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10086 // If Values have been set for this Def return the one relevant for \p Part. 10087 if (hasVectorValue(Def, Part)) 10088 return Data.PerPartOutput[Def][Part]; 10089 10090 if (!hasScalarValue(Def, {Part, 0})) { 10091 Value *IRV = Def->getLiveInIRValue(); 10092 Value *B = ILV->getBroadcastInstrs(IRV); 10093 set(Def, B, Part); 10094 return B; 10095 } 10096 10097 Value *ScalarValue = get(Def, {Part, 0}); 10098 // If we aren't vectorizing, we can just copy the scalar map values over 10099 // to the vector map. 10100 if (VF.isScalar()) { 10101 set(Def, ScalarValue, Part); 10102 return ScalarValue; 10103 } 10104 10105 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10106 bool IsUniform = RepR && RepR->isUniform(); 10107 10108 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 10109 // Check if there is a scalar value for the selected lane. 10110 if (!hasScalarValue(Def, {Part, LastLane})) { 10111 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 
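// (For example, a VPScalarIVStepsRecipe whose users only ever read lane 0
// generates just that single lane; the code below then broadcasts lane 0
// rather than packing per-lane values.)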
10112 assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) || 10113 isa<VPScalarIVStepsRecipe>(Def->getDef())) && 10114 "unexpected recipe found to be invariant"); 10115 IsUniform = true; 10116 LastLane = 0; 10117 } 10118 10119 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10120 // Set the insert point after the last scalarized instruction or after the 10121 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10122 // will directly follow the scalar definitions. 10123 auto OldIP = Builder.saveIP(); 10124 auto NewIP = 10125 isa<PHINode>(LastInst) 10126 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10127 : std::next(BasicBlock::iterator(LastInst)); 10128 Builder.SetInsertPoint(&*NewIP); 10129 10130 // However, if we are vectorizing, we need to construct the vector values. 10131 // If the value is known to be uniform after vectorization, we can just 10132 // broadcast the scalar value corresponding to lane zero for each unroll 10133 // iteration. Otherwise, we construct the vector values using 10134 // insertelement instructions. Since the resulting vectors are stored in 10135 // State, we will only generate the insertelements once. 10136 Value *VectorValue = nullptr; 10137 if (IsUniform) { 10138 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10139 set(Def, VectorValue, Part); 10140 } else { 10141 // Initialize packing with insertelements to start from undef. 10142 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10143 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10144 set(Def, Undef, Part); 10145 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10146 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10147 VectorValue = get(Def, Part); 10148 } 10149 Builder.restoreIP(OldIP); 10150 return VectorValue; 10151 } 10152 10153 // Process the loop in the VPlan-native vectorization path. This path builds 10154 // VPlan upfront in the vectorization pipeline, which allows to apply 10155 // VPlan-to-VPlan transformations from the very beginning without modifying the 10156 // input LLVM IR. 10157 static bool processLoopInVPlanNativePath( 10158 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10159 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10160 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10161 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10162 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10163 LoopVectorizationRequirements &Requirements) { 10164 10165 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10166 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10167 return false; 10168 } 10169 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10170 Function *F = L->getHeader()->getParent(); 10171 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10172 10173 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10174 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10175 10176 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10177 &Hints, IAI); 10178 // Use the planner for outer loop vectorization. 10179 // TODO: CM is not used at this point inside the planner. Turn CM into an 10180 // optional argument if we don't need it in the future. 10181 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10182 Requirements, ORE); 10183 10184 // Get user vectorization factor. 
10185 ElementCount UserVF = Hints.getWidth(); 10186 10187 CM.collectElementTypesForWidening(); 10188 10189 // Plan how to best vectorize, return the best VF and its cost. 10190 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10191 10192 // If we are stress testing VPlan builds, do not attempt to generate vector 10193 // code. Masked vector code generation support will follow soon. 10194 // Also, do not attempt to vectorize if no vector code will be produced. 10195 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) 10196 return false; 10197 10198 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10199 10200 { 10201 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10202 F->getParent()->getDataLayout()); 10203 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10204 &CM, BFI, PSI, Checks); 10205 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10206 << L->getHeader()->getParent()->getName() << "\"\n"); 10207 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); 10208 } 10209 10210 // Mark the loop as already vectorized to avoid vectorizing again. 10211 Hints.setAlreadyVectorized(); 10212 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10213 return true; 10214 } 10215 10216 // Emit a remark if there are stores to floats that required a floating point 10217 // extension. If the vectorized loop was generated with floating point there 10218 // will be a performance penalty from the conversion overhead and the change in 10219 // the vector width. 10220 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10221 SmallVector<Instruction *, 4> Worklist; 10222 for (BasicBlock *BB : L->getBlocks()) { 10223 for (Instruction &Inst : *BB) { 10224 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10225 if (S->getValueOperand()->getType()->isFloatTy()) 10226 Worklist.push_back(S); 10227 } 10228 } 10229 } 10230 10231 // Traverse the floating point stores upwards searching, for floating point 10232 // conversions. 10233 SmallPtrSet<const Instruction *, 4> Visited; 10234 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10235 while (!Worklist.empty()) { 10236 auto *I = Worklist.pop_back_val(); 10237 if (!L->contains(I)) 10238 continue; 10239 if (!Visited.insert(I).second) 10240 continue; 10241 10242 // Emit a remark if the floating point store required a floating 10243 // point conversion. 10244 // TODO: More work could be done to identify the root cause such as a 10245 // constant or a function return type and point the user to it. 10246 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10247 ORE->emit([&]() { 10248 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10249 I->getDebugLoc(), L->getHeader()) 10250 << "floating point conversion changes vector width. " 10251 << "Mixed floating point precision requires an up/down " 10252 << "cast that will negatively impact performance."; 10253 }); 10254 10255 for (Use &Op : I->operands()) 10256 if (auto *OpI = dyn_cast<Instruction>(Op)) 10257 Worklist.push_back(OpI); 10258 } 10259 } 10260 10261 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10262 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10263 !EnableLoopInterleaving), 10264 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10265 !EnableLoopVectorization) {} 10266 10267 bool LoopVectorizePass::processLoop(Loop *L) { 10268 assert((EnableVPlanNativePath || L->isInnermost()) && 10269 "VPlan-native path is not enabled. 
Only process inner loops."); 10270 10271 #ifndef NDEBUG 10272 const std::string DebugLocStr = getDebugLocString(L); 10273 #endif /* NDEBUG */ 10274 10275 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 10276 << L->getHeader()->getParent()->getName() << "' from " 10277 << DebugLocStr << "\n"); 10278 10279 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10280 10281 LLVM_DEBUG( 10282 dbgs() << "LV: Loop hints:" 10283 << " force=" 10284 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10285 ? "disabled" 10286 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10287 ? "enabled" 10288 : "?")) 10289 << " width=" << Hints.getWidth() 10290 << " interleave=" << Hints.getInterleave() << "\n"); 10291 10292 // Function containing loop 10293 Function *F = L->getHeader()->getParent(); 10294 10295 // Looking at the diagnostic output is the only way to determine if a loop 10296 // was vectorized (other than looking at the IR or machine code), so it 10297 // is important to generate an optimization remark for each loop. Most of 10298 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10299 // generated as OptimizationRemark and OptimizationRemarkMissed are 10300 // less verbose reporting vectorized loops and unvectorized loops that may 10301 // benefit from vectorization, respectively. 10302 10303 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10304 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10305 return false; 10306 } 10307 10308 PredicatedScalarEvolution PSE(*SE, *L); 10309 10310 // Check if it is legal to vectorize the loop. 10311 LoopVectorizationRequirements Requirements; 10312 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10313 &Requirements, &Hints, DB, AC, BFI, PSI); 10314 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10315 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10316 Hints.emitRemarkWithHints(); 10317 return false; 10318 } 10319 10320 // Check the function attributes and profiles to find out if this function 10321 // should be optimized for size. 10322 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10323 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10324 10325 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10326 // here. They may require CFG and instruction level transformations before 10327 // even evaluating whether vectorization is profitable. Since we cannot modify 10328 // the incoming IR, we need to build VPlan upfront in the vectorization 10329 // pipeline. 10330 if (!L->isInnermost()) 10331 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10332 ORE, BFI, PSI, Hints, Requirements); 10333 10334 assert(L->isInnermost() && "Inner loop expected."); 10335 10336 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10337 // count by optimizing for size, to minimize overheads. 10338 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10339 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10340 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. 
" 10341 << "This loop is worth vectorizing only if no scalar " 10342 << "iteration overheads are incurred."); 10343 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10344 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10345 else { 10346 LLVM_DEBUG(dbgs() << "\n"); 10347 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10348 } 10349 } 10350 10351 // Check the function attributes to see if implicit floats are allowed. 10352 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10353 // an integer loop and the vector instructions selected are purely integer 10354 // vector instructions? 10355 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10356 reportVectorizationFailure( 10357 "Can't vectorize when the NoImplicitFloat attribute is used", 10358 "loop not vectorized due to NoImplicitFloat attribute", 10359 "NoImplicitFloat", ORE, L); 10360 Hints.emitRemarkWithHints(); 10361 return false; 10362 } 10363 10364 // Check if the target supports potentially unsafe FP vectorization. 10365 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10366 // for the target we're vectorizing for, to make sure none of the 10367 // additional fp-math flags can help. 10368 if (Hints.isPotentiallyUnsafe() && 10369 TTI->isFPVectorizationPotentiallyUnsafe()) { 10370 reportVectorizationFailure( 10371 "Potentially unsafe FP op prevents vectorization", 10372 "loop not vectorized due to unsafe FP support.", 10373 "UnsafeFP", ORE, L); 10374 Hints.emitRemarkWithHints(); 10375 return false; 10376 } 10377 10378 bool AllowOrderedReductions; 10379 // If the flag is set, use that instead and override the TTI behaviour. 10380 if (ForceOrderedReductions.getNumOccurrences() > 0) 10381 AllowOrderedReductions = ForceOrderedReductions; 10382 else 10383 AllowOrderedReductions = TTI->enableOrderedReductions(); 10384 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10385 ORE->emit([&]() { 10386 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10387 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10388 ExactFPMathInst->getDebugLoc(), 10389 ExactFPMathInst->getParent()) 10390 << "loop not vectorized: cannot prove it is safe to reorder " 10391 "floating-point operations"; 10392 }); 10393 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10394 "reorder floating-point operations\n"); 10395 Hints.emitRemarkWithHints(); 10396 return false; 10397 } 10398 10399 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10400 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10401 10402 // If an override option has been passed in for interleaved accesses, use it. 10403 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10404 UseInterleaved = EnableInterleavedMemAccesses; 10405 10406 // Analyze interleaved memory accesses. 10407 if (UseInterleaved) { 10408 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10409 } 10410 10411 // Use the cost model. 10412 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10413 F, &Hints, IAI); 10414 CM.collectValuesToIgnore(); 10415 CM.collectElementTypesForWidening(); 10416 10417 // Use the planner for vectorization. 10418 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 10419 Requirements, ORE); 10420 10421 // Get user vectorization factor and interleave count. 
10422 ElementCount UserVF = Hints.getWidth(); 10423 unsigned UserIC = Hints.getInterleave(); 10424 10425 // Plan how to best vectorize, return the best VF and its cost. 10426 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10427 10428 VectorizationFactor VF = VectorizationFactor::Disabled(); 10429 unsigned IC = 1; 10430 10431 if (MaybeVF) { 10432 if (LVP.requiresTooManyRuntimeChecks()) { 10433 ORE->emit([&]() { 10434 return OptimizationRemarkAnalysisAliasing( 10435 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), 10436 L->getHeader()) 10437 << "loop not vectorized: cannot prove it is safe to reorder " 10438 "memory operations"; 10439 }); 10440 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 10441 Hints.emitRemarkWithHints(); 10442 return false; 10443 } 10444 VF = *MaybeVF; 10445 // Select the interleave count. 10446 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); 10447 } 10448 10449 // Identify the diagnostic messages that should be produced. 10450 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10451 bool VectorizeLoop = true, InterleaveLoop = true; 10452 if (VF.Width.isScalar()) { 10453 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10454 VecDiagMsg = std::make_pair( 10455 "VectorizationNotBeneficial", 10456 "the cost-model indicates that vectorization is not beneficial"); 10457 VectorizeLoop = false; 10458 } 10459 10460 if (!MaybeVF && UserIC > 1) { 10461 // Tell the user interleaving was avoided up-front, despite being explicitly 10462 // requested. 10463 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 10464 "interleaving should be avoided up front\n"); 10465 IntDiagMsg = std::make_pair( 10466 "InterleavingAvoided", 10467 "Ignoring UserIC, because interleaving was avoided up front"); 10468 InterleaveLoop = false; 10469 } else if (IC == 1 && UserIC <= 1) { 10470 // Tell the user interleaving is not beneficial. 10471 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 10472 IntDiagMsg = std::make_pair( 10473 "InterleavingNotBeneficial", 10474 "the cost-model indicates that interleaving is not beneficial"); 10475 InterleaveLoop = false; 10476 if (UserIC == 1) { 10477 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; 10478 IntDiagMsg.second += 10479 " and is explicitly disabled or interleave count is set to 1"; 10480 } 10481 } else if (IC > 1 && UserIC == 1) { 10482 // Tell the user interleaving is beneficial, but it is explicitly disabled. 10483 LLVM_DEBUG( 10484 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); 10485 IntDiagMsg = std::make_pair( 10486 "InterleavingBeneficialButDisabled", 10487 "the cost-model indicates that interleaving is beneficial " 10488 "but is explicitly disabled or interleave count is set to 1"); 10489 InterleaveLoop = false; 10490 } 10491 10492 // Override IC if user provided an interleave count. 10493 IC = UserIC > 0 ? UserIC : IC; 10494 10495 // Emit diagnostic messages, if any. 10496 const char *VAPassName = Hints.vectorizeAnalysisPassName(); 10497 if (!VectorizeLoop && !InterleaveLoop) { 10498 // Do not vectorize or interleave the loop.
10499 ORE->emit([&]() { 10500 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10501 L->getStartLoc(), L->getHeader()) 10502 << VecDiagMsg.second; 10503 }); 10504 ORE->emit([&]() { 10505 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10506 L->getStartLoc(), L->getHeader()) 10507 << IntDiagMsg.second; 10508 }); 10509 return false; 10510 } else if (!VectorizeLoop && InterleaveLoop) { 10511 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10512 ORE->emit([&]() { 10513 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10514 L->getStartLoc(), L->getHeader()) 10515 << VecDiagMsg.second; 10516 }); 10517 } else if (VectorizeLoop && !InterleaveLoop) { 10518 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10519 << ") in " << DebugLocStr << '\n'); 10520 ORE->emit([&]() { 10521 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10522 L->getStartLoc(), L->getHeader()) 10523 << IntDiagMsg.second; 10524 }); 10525 } else if (VectorizeLoop && InterleaveLoop) { 10526 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10527 << ") in " << DebugLocStr << '\n'); 10528 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10529 } 10530 10531 bool DisableRuntimeUnroll = false; 10532 MDNode *OrigLoopID = L->getLoopID(); 10533 { 10534 // Optimistically generate runtime checks. Drop them if they turn out to not 10535 // be profitable. Limit the scope of Checks, so the cleanup happens 10536 // immediately after vector codegeneration is done. 10537 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10538 F->getParent()->getDataLayout()); 10539 if (!VF.Width.isScalar() || IC > 1) 10540 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, IC); 10541 10542 using namespace ore; 10543 if (!VectorizeLoop) { 10544 assert(IC > 1 && "interleave count should not be 1 or 0"); 10545 // If we decided that it is not legal to vectorize the loop, then 10546 // interleave it. 10547 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10548 &CM, BFI, PSI, Checks); 10549 10550 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10551 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); 10552 10553 ORE->emit([&]() { 10554 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10555 L->getHeader()) 10556 << "interleaved loop (interleaved count: " 10557 << NV("InterleaveCount", IC) << ")"; 10558 }); 10559 } else { 10560 // If we decided that it is *legal* to vectorize the loop, then do it. 10561 10562 // Consider vectorizing the epilogue too if it's profitable. 10563 VectorizationFactor EpilogueVF = 10564 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10565 if (EpilogueVF.Width.isVector()) { 10566 10567 // The first pass vectorizes the main loop and creates a scalar epilogue 10568 // to be vectorized by executing the plan (potentially with a different 10569 // factor) again shortly afterwards. 10570 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10571 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10572 EPI, &LVL, &CM, BFI, PSI, Checks); 10573 10574 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10575 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10576 DT); 10577 ++LoopsVectorized; 10578 10579 // Second pass vectorizes the epilogue and adjusts the control flow 10580 // edges from the first pass. 
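// For example (illustrative values): if the main loop was vectorized with
// VF=8 and IC=2 and the epilogue with VF=4, EPI starts out as
// {MainLoopVF=8, MainLoopUF=2, EpilogueVF=4, EpilogueUF=1}; the assignments
// below re-purpose the "main" fields so the epilogue vectorizer sees VF=4,
// UF=1 as the factors it should emit code for.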
10581 EPI.MainLoopVF = EPI.EpilogueVF; 10582 EPI.MainLoopUF = EPI.EpilogueUF; 10583 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10584 ORE, EPI, &LVL, &CM, BFI, PSI, 10585 Checks); 10586 10587 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10588 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); 10589 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); 10590 Header->setName("vec.epilog.vector.body"); 10591 10592 // Ensure that the start values for any VPReductionPHIRecipes are 10593 // updated before vectorising the epilogue loop. 10594 for (VPRecipeBase &R : Header->phis()) { 10595 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10596 if (auto *Resume = MainILV.getReductionResumeValue( 10597 ReductionPhi->getRecurrenceDescriptor())) { 10598 VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume); 10599 ReductionPhi->setOperand(0, StartVal); 10600 } 10601 } 10602 } 10603 10604 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10605 DT); 10606 ++LoopsEpilogueVectorized; 10607 10608 if (!MainILV.areSafetyChecksAdded()) 10609 DisableRuntimeUnroll = true; 10610 } else { 10611 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10612 &LVL, &CM, BFI, PSI, Checks); 10613 10614 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10615 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10616 ++LoopsVectorized; 10617 10618 // Add metadata to disable runtime unrolling a scalar loop when there 10619 // are no runtime checks about strides and memory. A scalar loop that is 10620 // rarely used is not worth unrolling. 10621 if (!LB.areSafetyChecksAdded()) 10622 DisableRuntimeUnroll = true; 10623 } 10624 // Report the vectorization decision. 10625 ORE->emit([&]() { 10626 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10627 L->getHeader()) 10628 << "vectorized loop (vectorization width: " 10629 << NV("VectorizationFactor", VF.Width) 10630 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10631 }); 10632 } 10633 10634 if (ORE->allowExtraAnalysis(LV_NAME)) 10635 checkMixedPrecision(L, ORE); 10636 } 10637 10638 Optional<MDNode *> RemainderLoopID = 10639 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10640 LLVMLoopVectorizeFollowupEpilogue}); 10641 if (RemainderLoopID.hasValue()) { 10642 L->setLoopID(RemainderLoopID.getValue()); 10643 } else { 10644 if (DisableRuntimeUnroll) 10645 AddRuntimeUnrollDisableMetaData(L); 10646 10647 // Mark the loop as already vectorized to avoid vectorizing again. 10648 Hints.setAlreadyVectorized(); 10649 } 10650 10651 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10652 return true; 10653 } 10654 10655 LoopVectorizeResult LoopVectorizePass::runImpl( 10656 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10657 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10658 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10659 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10660 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10661 SE = &SE_; 10662 LI = &LI_; 10663 TTI = &TTI_; 10664 DT = &DT_; 10665 BFI = &BFI_; 10666 TLI = TLI_; 10667 AA = &AA_; 10668 AC = &AC_; 10669 GetLAA = &GetLAA_; 10670 DB = &DB_; 10671 ORE = &ORE_; 10672 PSI = PSI_; 10673 10674 // Don't attempt if 10675 // 1. the target claims to have no vector registers, and 10676 // 2. interleaving won't help ILP. 
10677 // 10678 // The second condition is necessary because, even if the target has no 10679 // vector registers, loop vectorization may still enable scalar 10680 // interleaving. 10681 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10682 TTI->getMaxInterleaveFactor(1) < 2) 10683 return LoopVectorizeResult(false, false); 10684 10685 bool Changed = false, CFGChanged = false; 10686 10687 // The vectorizer requires loops to be in simplified form. 10688 // Since simplification may add new inner loops, it has to run before the 10689 // legality and profitability checks. This means running the loop vectorizer 10690 // will simplify all loops, regardless of whether anything end up being 10691 // vectorized. 10692 for (auto &L : *LI) 10693 Changed |= CFGChanged |= 10694 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10695 10696 // Build up a worklist of inner-loops to vectorize. This is necessary as 10697 // the act of vectorizing or partially unrolling a loop creates new loops 10698 // and can invalidate iterators across the loops. 10699 SmallVector<Loop *, 8> Worklist; 10700 10701 for (Loop *L : *LI) 10702 collectSupportedLoops(*L, LI, ORE, Worklist); 10703 10704 LoopsAnalyzed += Worklist.size(); 10705 10706 // Now walk the identified inner loops. 10707 while (!Worklist.empty()) { 10708 Loop *L = Worklist.pop_back_val(); 10709 10710 // For the inner loops we actually process, form LCSSA to simplify the 10711 // transform. 10712 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10713 10714 Changed |= CFGChanged |= processLoop(L); 10715 } 10716 10717 // Process each loop nest in the function. 10718 return LoopVectorizeResult(Changed, CFGChanged); 10719 } 10720 10721 PreservedAnalyses LoopVectorizePass::run(Function &F, 10722 FunctionAnalysisManager &AM) { 10723 auto &LI = AM.getResult<LoopAnalysis>(F); 10724 // There are no loops in the function. Return before computing other expensive 10725 // analyses. 10726 if (LI.empty()) 10727 return PreservedAnalyses::all(); 10728 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 10729 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 10730 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 10731 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 10732 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 10733 auto &AA = AM.getResult<AAManager>(F); 10734 auto &AC = AM.getResult<AssumptionAnalysis>(F); 10735 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 10736 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10737 10738 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 10739 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 10740 [&](Loop &L) -> const LoopAccessInfo & { 10741 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 10742 TLI, TTI, nullptr, nullptr, nullptr}; 10743 return LAM.getResult<LoopAccessAnalysis>(L, AR); 10744 }; 10745 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 10746 ProfileSummaryInfo *PSI = 10747 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 10748 LoopVectorizeResult Result = 10749 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 10750 if (!Result.MadeAnyChange) 10751 return PreservedAnalyses::all(); 10752 PreservedAnalyses PA; 10753 10754 // We currently do not preserve loopinfo/dominator analyses with outer loop 10755 // vectorization. Until this is addressed, mark these analyses as preserved 10756 // only for non-VPlan-native path. 
10757 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 10758 if (!EnableVPlanNativePath) { 10759 PA.preserve<LoopAnalysis>(); 10760 PA.preserve<DominatorTreeAnalysis>(); 10761 } 10762 10763 if (Result.MadeCFGChange) { 10764 // Making CFG changes likely means a loop got vectorized. Indicate that 10765 // extra simplification passes should be run. 10766 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only 10767 // be run if runtime checks have been added. 10768 AM.getResult<ShouldRunExtraVectorPasses>(F); 10769 PA.preserve<ShouldRunExtraVectorPasses>(); 10770 } else { 10771 PA.preserveSet<CFGAnalyses>(); 10772 } 10773 return PA; 10774 } 10775 10776 void LoopVectorizePass::printPipeline( 10777 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { 10778 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( 10779 OS, MapClassName2PassName); 10780 10781 OS << "<"; 10782 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;"; 10783 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;"; 10784 OS << ">"; 10785 } 10786
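// Illustrative example (assuming the pass is registered under the textual
// name "loop-vectorize"): with default-constructed LoopVectorizeOptions and
// the interleaving/vectorization cl::opts left enabled, printPipeline above
// renders the pass roughly as
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>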