1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops 10 // and generates target-independent LLVM-IR. 11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs 12 // of instructions in order to estimate the profitability of vectorization. 13 // 14 // The loop vectorizer combines consecutive loop iterations into a single 15 // 'wide' iteration. After this transformation the index is incremented 16 // by the SIMD vector width, and not by one. 17 // 18 // This pass has three parts: 19 // 1. The main loop pass that drives the different parts. 20 // 2. LoopVectorizationLegality - A unit that checks for the legality 21 // of the vectorization. 22 // 3. InnerLoopVectorizer - A unit that performs the actual 23 // widening of instructions. 24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability 25 // of vectorization. It decides on the optimal vector width, which 26 // can be one, if vectorization is not profitable. 27 // 28 // There is a development effort going on to migrate loop vectorizer to the 29 // VPlan infrastructure and to introduce outer loop vectorization support (see 30 // docs/Proposal/VectorizationPlan.rst and 31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this 32 // purpose, we temporarily introduced the VPlan-native vectorization path: an 33 // alternative vectorization path that is natively implemented on top of the 34 // VPlan infrastructure. See EnableVPlanNativePath for enabling. 35 // 36 //===----------------------------------------------------------------------===// 37 // 38 // The reduction-variable vectorization is based on the paper: 39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. 40 // 41 // Variable uniformity checks are inspired by: 42 // Karrenberg, R. and Hack, S. Whole Function Vectorization. 43 // 44 // The interleaved access vectorization is based on the paper: 45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved 46 // Data for SIMD 47 // 48 // Other ideas/concepts are from: 49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. 50 // 51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of 52 // Vectorizing Compilers. 
53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanHCFGBuilder.h" 61 #include "VPlanTransforms.h" 62 #include "llvm/ADT/APInt.h" 63 #include "llvm/ADT/ArrayRef.h" 64 #include "llvm/ADT/DenseMap.h" 65 #include "llvm/ADT/DenseMapInfo.h" 66 #include "llvm/ADT/Hashing.h" 67 #include "llvm/ADT/MapVector.h" 68 #include "llvm/ADT/None.h" 69 #include "llvm/ADT/Optional.h" 70 #include "llvm/ADT/STLExtras.h" 71 #include "llvm/ADT/SmallPtrSet.h" 72 #include "llvm/ADT/SmallSet.h" 73 #include "llvm/ADT/SmallVector.h" 74 #include "llvm/ADT/Statistic.h" 75 #include "llvm/ADT/StringRef.h" 76 #include "llvm/ADT/Twine.h" 77 #include "llvm/ADT/iterator_range.h" 78 #include "llvm/Analysis/AssumptionCache.h" 79 #include "llvm/Analysis/BasicAliasAnalysis.h" 80 #include "llvm/Analysis/BlockFrequencyInfo.h" 81 #include "llvm/Analysis/CFG.h" 82 #include "llvm/Analysis/CodeMetrics.h" 83 #include "llvm/Analysis/DemandedBits.h" 84 #include "llvm/Analysis/GlobalsModRef.h" 85 #include "llvm/Analysis/LoopAccessAnalysis.h" 86 #include "llvm/Analysis/LoopAnalysisManager.h" 87 #include "llvm/Analysis/LoopInfo.h" 88 #include "llvm/Analysis/LoopIterator.h" 89 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 90 #include "llvm/Analysis/ProfileSummaryInfo.h" 91 #include "llvm/Analysis/ScalarEvolution.h" 92 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 93 #include "llvm/Analysis/TargetLibraryInfo.h" 94 #include "llvm/Analysis/TargetTransformInfo.h" 95 #include "llvm/Analysis/VectorUtils.h" 96 #include "llvm/IR/Attributes.h" 97 #include "llvm/IR/BasicBlock.h" 98 #include "llvm/IR/CFG.h" 99 #include "llvm/IR/Constant.h" 100 #include "llvm/IR/Constants.h" 101 #include "llvm/IR/DataLayout.h" 102 #include "llvm/IR/DebugInfoMetadata.h" 103 #include "llvm/IR/DebugLoc.h" 104 #include "llvm/IR/DerivedTypes.h" 105 #include "llvm/IR/DiagnosticInfo.h" 106 #include "llvm/IR/Dominators.h" 107 #include "llvm/IR/Function.h" 108 #include "llvm/IR/IRBuilder.h" 109 #include "llvm/IR/InstrTypes.h" 110 #include "llvm/IR/Instruction.h" 111 #include "llvm/IR/Instructions.h" 112 #include "llvm/IR/IntrinsicInst.h" 113 #include "llvm/IR/Intrinsics.h" 114 #include "llvm/IR/Metadata.h" 115 #include "llvm/IR/Module.h" 116 #include "llvm/IR/Operator.h" 117 #include "llvm/IR/PatternMatch.h" 118 #include "llvm/IR/Type.h" 119 #include "llvm/IR/Use.h" 120 #include "llvm/IR/User.h" 121 #include "llvm/IR/Value.h" 122 #include "llvm/IR/ValueHandle.h" 123 #include "llvm/IR/Verifier.h" 124 #include "llvm/InitializePasses.h" 125 #include "llvm/Pass.h" 126 #include "llvm/Support/Casting.h" 127 #include "llvm/Support/CommandLine.h" 128 #include "llvm/Support/Compiler.h" 129 #include "llvm/Support/Debug.h" 130 #include "llvm/Support/ErrorHandling.h" 131 #include "llvm/Support/InstructionCost.h" 132 #include "llvm/Support/MathExtras.h" 133 #include "llvm/Support/raw_ostream.h" 134 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 135 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 136 #include "llvm/Transforms/Utils/LoopSimplify.h" 137 #include "llvm/Transforms/Utils/LoopUtils.h" 138 #include "llvm/Transforms/Utils/LoopVersioning.h" 139 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 140 #include "llvm/Transforms/Utils/SizeOpts.h" 141 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 142 #include <algorithm> 143 #include 
<cassert> 144 #include <cstdint> 145 #include <functional> 146 #include <iterator> 147 #include <limits> 148 #include <map> 149 #include <memory> 150 #include <string> 151 #include <tuple> 152 #include <utility> 153 154 using namespace llvm; 155 156 #define LV_NAME "loop-vectorize" 157 #define DEBUG_TYPE LV_NAME 158 159 #ifndef NDEBUG 160 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 161 #endif 162 163 /// @{ 164 /// Metadata attribute names 165 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 166 const char LLVMLoopVectorizeFollowupVectorized[] = 167 "llvm.loop.vectorize.followup_vectorized"; 168 const char LLVMLoopVectorizeFollowupEpilogue[] = 169 "llvm.loop.vectorize.followup_epilogue"; 170 /// @} 171 172 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 173 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 174 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 175 176 static cl::opt<bool> EnableEpilogueVectorization( 177 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 178 cl::desc("Enable vectorization of epilogue loops.")); 179 180 static cl::opt<unsigned> EpilogueVectorizationForceVF( 181 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 182 cl::desc("When epilogue vectorization is enabled, and a value greater than " 183 "1 is specified, forces the given VF for all applicable epilogue " 184 "loops.")); 185 186 static cl::opt<unsigned> EpilogueVectorizationMinVF( 187 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 188 cl::desc("Only loops with vectorization factor equal to or larger than " 189 "the specified value are considered for epilogue vectorization.")); 190 191 /// Loops with a known constant trip count below this number are vectorized only 192 /// if no scalar iteration overheads are incurred. 193 static cl::opt<unsigned> TinyTripCountVectorThreshold( 194 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 195 cl::desc("Loops with a constant trip count that is smaller than this " 196 "value are vectorized only if no scalar iteration overheads " 197 "are incurred.")); 198 199 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 200 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 201 cl::desc("The maximum allowed number of runtime memory checks with a " 202 "vectorize(enable) pragma.")); 203 204 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 205 // that predication is preferred, and this lists all options. I.e., the 206 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 207 // and predicate the instructions accordingly. 
If tail-folding fails, there are 208 // different fallback strategies depending on these values: 209 namespace PreferPredicateTy { 210 enum Option { 211 ScalarEpilogue = 0, 212 PredicateElseScalarEpilogue, 213 PredicateOrDontVectorize 214 }; 215 } // namespace PreferPredicateTy 216 217 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( 218 "prefer-predicate-over-epilogue", 219 cl::init(PreferPredicateTy::ScalarEpilogue), 220 cl::Hidden, 221 cl::desc("Tail-folding and predication preferences over creating a scalar " 222 "epilogue loop."), 223 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, 224 "scalar-epilogue", 225 "Don't tail-predicate loops, create scalar epilogue"), 226 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, 227 "predicate-else-scalar-epilogue", 228 "prefer tail-folding, create scalar epilogue if tail " 229 "folding fails."), 230 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, 231 "predicate-dont-vectorize", 232 "prefers tail-folding, don't attempt vectorization if " 233 "tail-folding fails."))); 234 235 static cl::opt<bool> MaximizeBandwidth( 236 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, 237 cl::desc("Maximize bandwidth when selecting vectorization factor which " 238 "will be determined by the smallest type in loop.")); 239 240 static cl::opt<bool> EnableInterleavedMemAccesses( 241 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, 242 cl::desc("Enable vectorization on interleaved memory accesses in a loop")); 243 244 /// An interleave-group may need masking if it resides in a block that needs 245 /// predication, or in order to mask away gaps. 246 static cl::opt<bool> EnableMaskedInterleavedMemAccesses( 247 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, 248 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); 249 250 static cl::opt<unsigned> TinyTripCountInterleaveThreshold( 251 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, 252 cl::desc("We don't interleave loops with a estimated constant trip count " 253 "below this number")); 254 255 static cl::opt<unsigned> ForceTargetNumScalarRegs( 256 "force-target-num-scalar-regs", cl::init(0), cl::Hidden, 257 cl::desc("A flag that overrides the target's number of scalar registers.")); 258 259 static cl::opt<unsigned> ForceTargetNumVectorRegs( 260 "force-target-num-vector-regs", cl::init(0), cl::Hidden, 261 cl::desc("A flag that overrides the target's number of vector registers.")); 262 263 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( 264 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, 265 cl::desc("A flag that overrides the target's max interleave factor for " 266 "scalar loops.")); 267 268 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( 269 "force-target-max-vector-interleave", cl::init(0), cl::Hidden, 270 cl::desc("A flag that overrides the target's max interleave factor for " 271 "vectorized loops.")); 272 273 static cl::opt<unsigned> ForceTargetInstructionCost( 274 "force-target-instruction-cost", cl::init(0), cl::Hidden, 275 cl::desc("A flag that overrides the target's expected cost for " 276 "an instruction to a single constant value. 
Mostly " 277 "useful for getting consistent testing.")); 278 279 static cl::opt<bool> ForceTargetSupportsScalableVectors( 280 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 281 cl::desc( 282 "Pretend that scalable vectors are supported, even if the target does " 283 "not support them. This flag should only be used for testing.")); 284 285 static cl::opt<unsigned> SmallLoopCost( 286 "small-loop-cost", cl::init(20), cl::Hidden, 287 cl::desc( 288 "The cost of a loop that is considered 'small' by the interleaver.")); 289 290 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 291 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 292 cl::desc("Enable the use of the block frequency analysis to access PGO " 293 "heuristics minimizing code growth in cold regions and being more " 294 "aggressive in hot regions.")); 295 296 // Runtime interleave loops for load/store throughput. 297 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 298 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 299 cl::desc( 300 "Enable runtime interleaving until load/store ports are saturated")); 301 302 /// Interleave small loops with scalar reductions. 303 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 304 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 305 cl::desc("Enable interleaving for loops with small iteration counts that " 306 "contain scalar reductions to expose ILP.")); 307 308 /// The number of stores in a loop that are allowed to need predication. 309 static cl::opt<unsigned> NumberOfStoresToPredicate( 310 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 311 cl::desc("Max number of stores to be predicated behind an if.")); 312 313 static cl::opt<bool> EnableIndVarRegisterHeur( 314 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 315 cl::desc("Count the induction variable only once when interleaving")); 316 317 static cl::opt<bool> EnableCondStoresVectorization( 318 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 319 cl::desc("Enable if predication of stores during vectorization.")); 320 321 static cl::opt<unsigned> MaxNestedScalarReductionIC( 322 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 323 cl::desc("The maximum interleave count to use when interleaving a scalar " 324 "reduction in a nested loop.")); 325 326 static cl::opt<bool> 327 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 328 cl::Hidden, 329 cl::desc("Prefer in-loop vector reductions, " 330 "overriding the targets preference.")); 331 332 static cl::opt<bool> ForceOrderedReductions( 333 "force-ordered-reductions", cl::init(false), cl::Hidden, 334 cl::desc("Enable the vectorisation of loops with in-order (strict) " 335 "FP reductions")); 336 337 static cl::opt<bool> PreferPredicatedReductionSelect( 338 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 339 cl::desc( 340 "Prefer predicating a reduction operation over an after loop select.")); 341 342 cl::opt<bool> EnableVPlanNativePath( 343 "enable-vplan-native-path", cl::init(false), cl::Hidden, 344 cl::desc("Enable VPlan-native vectorization path with " 345 "support for outer loop vectorization.")); 346 347 // This flag enables the stress testing of the VPlan H-CFG construction in the 348 // VPlan-native vectorization path. It must be used in conjuction with 349 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the 350 // verification of the H-CFGs built. 
351 static cl::opt<bool> VPlanBuildStressTest( 352 "vplan-build-stress-test", cl::init(false), cl::Hidden, 353 cl::desc( 354 "Build VPlan for every supported loop nest in the function and bail " 355 "out right after the build (stress test the VPlan H-CFG construction " 356 "in the VPlan-native vectorization path).")); 357 358 cl::opt<bool> llvm::EnableLoopInterleaving( 359 "interleave-loops", cl::init(true), cl::Hidden, 360 cl::desc("Enable loop interleaving in Loop vectorization passes")); 361 cl::opt<bool> llvm::EnableLoopVectorization( 362 "vectorize-loops", cl::init(true), cl::Hidden, 363 cl::desc("Run the Loop vectorization passes")); 364 365 cl::opt<bool> PrintVPlansInDotFormat( 366 "vplan-print-in-dot-format", cl::init(false), cl::Hidden, 367 cl::desc("Use dot format instead of plain text when dumping VPlans")); 368 369 /// A helper function that returns true if the given type is irregular. The 370 /// type is irregular if its allocated size doesn't equal the store size of an 371 /// element of the corresponding vector type. 372 static bool hasIrregularType(Type *Ty, const DataLayout &DL) { 373 // Determine if an array of N elements of type Ty is "bitcast compatible" 374 // with a <N x Ty> vector. 375 // This is only true if there is no padding between the array elements. 376 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); 377 } 378 379 /// A helper function that returns the reciprocal of the block probability of 380 /// predicated blocks. If we return X, we are assuming the predicated block 381 /// will execute once for every X iterations of the loop header. 382 /// 383 /// TODO: We should use actual block probability here, if available. Currently, 384 /// we always assume predicated blocks have a 50% chance of executing. 385 static unsigned getReciprocalPredBlockProb() { return 2; } 386 387 /// A helper function that returns an integer or floating-point constant with 388 /// value C. 389 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { 390 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) 391 : ConstantFP::get(Ty, C); 392 } 393 394 /// Returns "best known" trip count for the specified loop \p L as defined by 395 /// the following procedure: 396 /// 1) Returns exact trip count if it is known. 397 /// 2) Returns expected trip count according to profile data if any. 398 /// 3) Returns upper bound estimate if it is known. 399 /// 4) Returns None if all of the above failed. 400 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { 401 // Check if exact trip count is known. 402 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) 403 return ExpectedTC; 404 405 // Check if there is an expected trip count available from profile data. 406 if (LoopVectorizeWithBlockFrequency) 407 if (auto EstimatedTC = getLoopEstimatedTripCount(L)) 408 return EstimatedTC; 409 410 // Check if upper bound estimate is known. 411 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) 412 return ExpectedTC; 413 414 return None; 415 } 416 417 // Forward declare GeneratedRTChecks. 418 class GeneratedRTChecks; 419 420 namespace llvm { 421 422 AnalysisKey ShouldRunExtraVectorPasses::Key; 423 424 /// InnerLoopVectorizer vectorizes loops which contain only one basic 425 /// block to a specified vectorization factor (VF). 426 /// This class performs the widening of scalars into vectors, or multiple 427 /// scalars. 
This class also implements the following features: 428 /// * It inserts an epilogue loop for handling loops that don't have iteration 429 /// counts that are known to be a multiple of the vectorization factor. 430 /// * It handles the code generation for reduction variables. 431 /// * Scalarization (implementation using scalars) of un-vectorizable 432 /// instructions. 433 /// InnerLoopVectorizer does not perform any vectorization-legality 434 /// checks, and relies on the caller to check for the different legality 435 /// aspects. The InnerLoopVectorizer relies on the 436 /// LoopVectorizationLegality class to provide information about the induction 437 /// and reduction variables that were found to a given vectorization factor. 438 class InnerLoopVectorizer { 439 public: 440 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 441 LoopInfo *LI, DominatorTree *DT, 442 const TargetLibraryInfo *TLI, 443 const TargetTransformInfo *TTI, AssumptionCache *AC, 444 OptimizationRemarkEmitter *ORE, ElementCount VecWidth, 445 unsigned UnrollFactor, LoopVectorizationLegality *LVL, 446 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 447 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) 448 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), 449 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), 450 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), 451 PSI(PSI), RTChecks(RTChecks) { 452 // Query this against the original loop and save it here because the profile 453 // of the original loop header may change as the transformation happens. 454 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( 455 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); 456 } 457 458 virtual ~InnerLoopVectorizer() = default; 459 460 /// Create a new empty loop that will contain vectorized instructions later 461 /// on, while the old loop will be used as the scalar remainder. Control flow 462 /// is generated around the vectorized (and scalar epilogue) loops consisting 463 /// of various checks and bypasses. Return the pre-header block of the new 464 /// loop and the start value for the canonical induction, if it is != 0. The 465 /// latter is the case when vectorizing the epilogue loop. In the case of 466 /// epilogue vectorization, this function is overriden to handle the more 467 /// complex control flow around the loops. 468 virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(); 469 470 /// Widen a single call instruction within the innermost loop. 471 void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, 472 VPTransformState &State); 473 474 /// Fix the vectorized code, taking care of header phi's, live-outs, and more. 475 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan); 476 477 // Return true if any runtime check is added. 478 bool areSafetyChecksAdded() { return AddedSafetyChecks; } 479 480 /// A type for vectorized values in the new loop. Each value from the 481 /// original loop, when vectorized, is represented by UF vector values in the 482 /// new unrolled loop, where UF is the unroll factor. 483 using VectorParts = SmallVector<Value *, 2>; 484 485 /// A helper function to scalarize a single Instruction in the innermost loop. 486 /// Generates a sequence of scalar instances for each lane between \p MinLane 487 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 488 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p 489 /// Instr's operands. 
490 void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe, 491 const VPIteration &Instance, bool IfPredicateInstr, 492 VPTransformState &State); 493 494 /// Construct the vector value of a scalarized value \p V one lane at a time. 495 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, 496 VPTransformState &State); 497 498 /// Try to vectorize interleaved access group \p Group with the base address 499 /// given in \p Addr, optionally masking the vector operations if \p 500 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 501 /// values in the vectorized loop. 502 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 503 ArrayRef<VPValue *> VPDefs, 504 VPTransformState &State, VPValue *Addr, 505 ArrayRef<VPValue *> StoredValues, 506 VPValue *BlockInMask = nullptr); 507 508 /// Set the debug location in the builder \p Ptr using the debug location in 509 /// \p V. If \p Ptr is None then it uses the class member's Builder. 510 void setDebugLocFromInst(const Value *V); 511 512 /// Fix the non-induction PHIs in \p Plan. 513 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State); 514 515 /// Returns true if the reordering of FP operations is not allowed, but we are 516 /// able to vectorize with strict in-order reductions for the given RdxDesc. 517 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); 518 519 /// Create a broadcast instruction. This method generates a broadcast 520 /// instruction (shuffle) for loop invariant values and for the induction 521 /// value. If this is the induction variable then we extend it to N, N+1, ... 522 /// this is needed because each iteration in the loop corresponds to a SIMD 523 /// element. 524 virtual Value *getBroadcastInstrs(Value *V); 525 526 /// Add metadata from one instruction to another. 527 /// 528 /// This includes both the original MDs from \p From and additional ones (\see 529 /// addNewMetadata). Use this for *newly created* instructions in the vector 530 /// loop. 531 void addMetadata(Instruction *To, Instruction *From); 532 533 /// Similar to the previous function but it adds the metadata to a 534 /// vector of instructions. 535 void addMetadata(ArrayRef<Value *> To, Instruction *From); 536 537 // Returns the resume value (bc.merge.rdx) for a reduction as 538 // generated by fixReduction. 539 PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc); 540 541 protected: 542 friend class LoopVectorizationPlanner; 543 544 /// A small list of PHINodes. 545 using PhiVector = SmallVector<PHINode *, 4>; 546 547 /// A type for scalarized values in the new loop. Each value from the 548 /// original loop, when scalarized, is represented by UF x VF scalar values 549 /// in the new unrolled loop, where UF is the unroll factor and VF is the 550 /// vectorization factor. 551 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 552 553 /// Set up the values of the IVs correctly when exiting the vector loop. 554 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 555 Value *VectorTripCount, Value *EndValue, 556 BasicBlock *MiddleBlock, BasicBlock *VectorHeader, 557 VPlan &Plan); 558 559 /// Handle all cross-iteration phis in the header. 560 void fixCrossIterationPHIs(VPTransformState &State); 561 562 /// Create the exit value of first order recurrences in the middle block and 563 /// update their users. 
564 void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, 565 VPTransformState &State); 566 567 /// Create code for the loop exit value of the reduction. 568 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); 569 570 /// Clear NSW/NUW flags from reduction instructions if necessary. 571 void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, 572 VPTransformState &State); 573 574 /// Iteratively sink the scalarized operands of a predicated instruction into 575 /// the block that was created for it. 576 void sinkScalarOperands(Instruction *PredInst); 577 578 /// Shrinks vector element sizes to the smallest bitwidth they can be legally 579 /// represented as. 580 void truncateToMinimalBitwidths(VPTransformState &State); 581 582 /// Returns (and creates if needed) the original loop trip count. 583 Value *getOrCreateTripCount(BasicBlock *InsertBlock); 584 585 /// Returns (and creates if needed) the trip count of the widened loop. 586 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); 587 588 /// Returns a bitcasted value to the requested vector type. 589 /// Also handles bitcasts of vector<float> <-> vector<pointer> types. 590 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, 591 const DataLayout &DL); 592 593 /// Emit a bypass check to see if the vector trip count is zero, including if 594 /// it overflows. 595 void emitIterationCountCheck(BasicBlock *Bypass); 596 597 /// Emit a bypass check to see if all of the SCEV assumptions we've 598 /// had to make are correct. Returns the block containing the checks or 599 /// nullptr if no checks have been added. 600 BasicBlock *emitSCEVChecks(BasicBlock *Bypass); 601 602 /// Emit bypass checks to check any memory assumptions we may have made. 603 /// Returns the block containing the checks or nullptr if no checks have been 604 /// added. 605 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass); 606 607 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 608 /// vector loop preheader, middle block and scalar preheader. 609 void createVectorLoopSkeleton(StringRef Prefix); 610 611 /// Create new phi nodes for the induction variables to resume iteration count 612 /// in the scalar epilogue, from where the vectorized loop left off. 613 /// In cases where the loop skeleton is more complicated (eg. epilogue 614 /// vectorization) and the resume values can come from an additional bypass 615 /// block, the \p AdditionalBypass pair provides information about the bypass 616 /// block and the end value on the edge from bypass to this loop. 617 void createInductionResumeValues( 618 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); 619 620 /// Complete the loop skeleton by adding debug MDs, creating appropriate 621 /// conditional branches in the middle block, preparing the builder and 622 /// running the verifier. Return the preheader of the completed vector loop. 623 BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID); 624 625 /// Add additional metadata to \p To that was not present on \p Orig. 626 /// 627 /// Currently this is used to add the noalias annotations based on the 628 /// inserted memchecks. Use this for instructions that are *cloned* into the 629 /// vector loop. 630 void addNewMetadata(Instruction *To, const Instruction *Orig); 631 632 /// Collect poison-generating recipes that may generate a poison value that is 633 /// used after vectorization, even when their operands are not poison. 
Those 634 /// recipes meet the following conditions: 635 /// * Contribute to the address computation of a recipe generating a widen 636 /// memory load/store (VPWidenMemoryInstructionRecipe or 637 /// VPInterleaveRecipe). 638 /// * Such a widen memory load/store has at least one underlying Instruction 639 /// that is in a basic block that needs predication and after vectorization 640 /// the generated instruction won't be predicated. 641 void collectPoisonGeneratingRecipes(VPTransformState &State); 642 643 /// Allow subclasses to override and print debug traces before/after vplan 644 /// execution, when trace information is requested. 645 virtual void printDebugTracesAtStart(){}; 646 virtual void printDebugTracesAtEnd(){}; 647 648 /// The original loop. 649 Loop *OrigLoop; 650 651 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies 652 /// dynamic knowledge to simplify SCEV expressions and converts them to a 653 /// more usable form. 654 PredicatedScalarEvolution &PSE; 655 656 /// Loop Info. 657 LoopInfo *LI; 658 659 /// Dominator Tree. 660 DominatorTree *DT; 661 662 /// Alias Analysis. 663 AAResults *AA; 664 665 /// Target Library Info. 666 const TargetLibraryInfo *TLI; 667 668 /// Target Transform Info. 669 const TargetTransformInfo *TTI; 670 671 /// Assumption Cache. 672 AssumptionCache *AC; 673 674 /// Interface to emit optimization remarks. 675 OptimizationRemarkEmitter *ORE; 676 677 /// LoopVersioning. It's only set up (non-null) if memchecks were 678 /// used. 679 /// 680 /// This is currently only used to add no-alias metadata based on the 681 /// memchecks. The actually versioning is performed manually. 682 std::unique_ptr<LoopVersioning> LVer; 683 684 /// The vectorization SIMD factor to use. Each vector will have this many 685 /// vector elements. 686 ElementCount VF; 687 688 /// The vectorization unroll factor to use. Each scalar is vectorized to this 689 /// many different vector instructions. 690 unsigned UF; 691 692 /// The builder that we use 693 IRBuilder<> Builder; 694 695 // --- Vectorization state --- 696 697 /// The vector-loop preheader. 698 BasicBlock *LoopVectorPreHeader; 699 700 /// The scalar-loop preheader. 701 BasicBlock *LoopScalarPreHeader; 702 703 /// Middle Block between the vector and the scalar. 704 BasicBlock *LoopMiddleBlock; 705 706 /// The unique ExitBlock of the scalar loop if one exists. Note that 707 /// there can be multiple exiting edges reaching this block. 708 BasicBlock *LoopExitBlock; 709 710 /// The scalar loop body. 711 BasicBlock *LoopScalarBody; 712 713 /// A list of all bypass blocks. The first block is the entry of the loop. 714 SmallVector<BasicBlock *, 4> LoopBypassBlocks; 715 716 /// Store instructions that were predicated. 717 SmallVector<Instruction *, 4> PredicatedInstructions; 718 719 /// Trip count of the original loop. 720 Value *TripCount = nullptr; 721 722 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) 723 Value *VectorTripCount = nullptr; 724 725 /// The legality analysis. 726 LoopVectorizationLegality *Legal; 727 728 /// The profitablity analysis. 729 LoopVectorizationCostModel *Cost; 730 731 // Record whether runtime checks are added. 732 bool AddedSafetyChecks = false; 733 734 // Holds the end values for each induction variable. We save the end values 735 // so we can later fix-up the external users of the induction variables. 736 DenseMap<PHINode *, Value *> IVEndValues; 737 738 /// BFI and PSI are used to check for profile guided size optimizations. 
739 BlockFrequencyInfo *BFI; 740 ProfileSummaryInfo *PSI; 741 742 // Whether this loop should be optimized for size based on profile guided size 743 // optimizatios. 744 bool OptForSizeBasedOnProfile; 745 746 /// Structure to hold information about generated runtime checks, responsible 747 /// for cleaning the checks, if vectorization turns out unprofitable. 748 GeneratedRTChecks &RTChecks; 749 750 // Holds the resume values for reductions in the loops, used to set the 751 // correct start value of reduction PHIs when vectorizing the epilogue. 752 SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4> 753 ReductionResumeValues; 754 }; 755 756 class InnerLoopUnroller : public InnerLoopVectorizer { 757 public: 758 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 759 LoopInfo *LI, DominatorTree *DT, 760 const TargetLibraryInfo *TLI, 761 const TargetTransformInfo *TTI, AssumptionCache *AC, 762 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, 763 LoopVectorizationLegality *LVL, 764 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 765 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) 766 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 767 ElementCount::getFixed(1), UnrollFactor, LVL, CM, 768 BFI, PSI, Check) {} 769 770 private: 771 Value *getBroadcastInstrs(Value *V) override; 772 }; 773 774 /// Encapsulate information regarding vectorization of a loop and its epilogue. 775 /// This information is meant to be updated and used across two stages of 776 /// epilogue vectorization. 777 struct EpilogueLoopVectorizationInfo { 778 ElementCount MainLoopVF = ElementCount::getFixed(0); 779 unsigned MainLoopUF = 0; 780 ElementCount EpilogueVF = ElementCount::getFixed(0); 781 unsigned EpilogueUF = 0; 782 BasicBlock *MainLoopIterationCountCheck = nullptr; 783 BasicBlock *EpilogueIterationCountCheck = nullptr; 784 BasicBlock *SCEVSafetyCheck = nullptr; 785 BasicBlock *MemSafetyCheck = nullptr; 786 Value *TripCount = nullptr; 787 Value *VectorTripCount = nullptr; 788 789 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, 790 ElementCount EVF, unsigned EUF) 791 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) { 792 assert(EUF == 1 && 793 "A high UF for the epilogue loop is likely not beneficial."); 794 } 795 }; 796 797 /// An extension of the inner loop vectorizer that creates a skeleton for a 798 /// vectorized loop that has its epilogue (residual) also vectorized. 799 /// The idea is to run the vplan on a given loop twice, firstly to setup the 800 /// skeleton and vectorize the main loop, and secondly to complete the skeleton 801 /// from the first step and vectorize the epilogue. This is achieved by 802 /// deriving two concrete strategy classes from this base class and invoking 803 /// them in succession from the loop vectorizer planner. 
804 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { 805 public: 806 InnerLoopAndEpilogueVectorizer( 807 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 808 DominatorTree *DT, const TargetLibraryInfo *TLI, 809 const TargetTransformInfo *TTI, AssumptionCache *AC, 810 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 811 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 812 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 813 GeneratedRTChecks &Checks) 814 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 815 EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, 816 Checks), 817 EPI(EPI) {} 818 819 // Override this function to handle the more complex control flow around the 820 // three loops. 821 std::pair<BasicBlock *, Value *> 822 createVectorizedLoopSkeleton() final override { 823 return createEpilogueVectorizedLoopSkeleton(); 824 } 825 826 /// The interface for creating a vectorized skeleton using one of two 827 /// different strategies, each corresponding to one execution of the vplan 828 /// as described above. 829 virtual std::pair<BasicBlock *, Value *> 830 createEpilogueVectorizedLoopSkeleton() = 0; 831 832 /// Holds and updates state information required to vectorize the main loop 833 /// and its epilogue in two separate passes. This setup helps us avoid 834 /// regenerating and recomputing runtime safety checks. It also helps us to 835 /// shorten the iteration-count-check path length for the cases where the 836 /// iteration count of the loop is so small that the main vector loop is 837 /// completely skipped. 838 EpilogueLoopVectorizationInfo &EPI; 839 }; 840 841 /// A specialized derived class of inner loop vectorizer that performs 842 /// vectorization of *main* loops in the process of vectorizing loops and their 843 /// epilogues. 844 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { 845 public: 846 EpilogueVectorizerMainLoop( 847 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 848 DominatorTree *DT, const TargetLibraryInfo *TLI, 849 const TargetTransformInfo *TTI, AssumptionCache *AC, 850 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 851 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 852 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 853 GeneratedRTChecks &Check) 854 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 855 EPI, LVL, CM, BFI, PSI, Check) {} 856 /// Implements the interface for creating a vectorized skeleton using the 857 /// *main loop* strategy (ie the first pass of vplan execution). 858 std::pair<BasicBlock *, Value *> 859 createEpilogueVectorizedLoopSkeleton() final override; 860 861 protected: 862 /// Emits an iteration count bypass check once for the main loop (when \p 863 /// ForEpilogue is false) and once for the epilogue loop (when \p 864 /// ForEpilogue is true). 865 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue); 866 void printDebugTracesAtStart() override; 867 void printDebugTracesAtEnd() override; 868 }; 869 870 // A specialized derived class of inner loop vectorizer that performs 871 // vectorization of *epilogue* loops in the process of vectorizing loops and 872 // their epilogues. 
873 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { 874 public: 875 EpilogueVectorizerEpilogueLoop( 876 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 877 DominatorTree *DT, const TargetLibraryInfo *TLI, 878 const TargetTransformInfo *TTI, AssumptionCache *AC, 879 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 880 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 881 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 882 GeneratedRTChecks &Checks) 883 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 884 EPI, LVL, CM, BFI, PSI, Checks) { 885 TripCount = EPI.TripCount; 886 } 887 /// Implements the interface for creating a vectorized skeleton using the 888 /// *epilogue loop* strategy (ie the second pass of vplan execution). 889 std::pair<BasicBlock *, Value *> 890 createEpilogueVectorizedLoopSkeleton() final override; 891 892 protected: 893 /// Emits an iteration count bypass check after the main vector loop has 894 /// finished to see if there are any iterations left to execute by either 895 /// the vector epilogue or the scalar epilogue. 896 BasicBlock *emitMinimumVectorEpilogueIterCountCheck( 897 BasicBlock *Bypass, 898 BasicBlock *Insert); 899 void printDebugTracesAtStart() override; 900 void printDebugTracesAtEnd() override; 901 }; 902 } // end namespace llvm 903 904 /// Look for a meaningful debug location on the instruction or it's 905 /// operands. 906 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { 907 if (!I) 908 return I; 909 910 DebugLoc Empty; 911 if (I->getDebugLoc() != Empty) 912 return I; 913 914 for (Use &Op : I->operands()) { 915 if (Instruction *OpInst = dyn_cast<Instruction>(Op)) 916 if (OpInst->getDebugLoc() != Empty) 917 return OpInst; 918 } 919 920 return I; 921 } 922 923 void InnerLoopVectorizer::setDebugLocFromInst( 924 const Value *V) { 925 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) { 926 const DILocation *DIL = Inst->getDebugLoc(); 927 928 // When a FSDiscriminator is enabled, we don't need to add the multiply 929 // factors to the discriminators. 930 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && 931 !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) { 932 // FIXME: For scalable vectors, assume vscale=1. 933 auto NewDIL = 934 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); 935 if (NewDIL) 936 Builder.SetCurrentDebugLocation(*NewDIL); 937 else 938 LLVM_DEBUG(dbgs() 939 << "Failed to create new discriminator: " 940 << DIL->getFilename() << " Line: " << DIL->getLine()); 941 } else 942 Builder.SetCurrentDebugLocation(DIL); 943 } else 944 Builder.SetCurrentDebugLocation(DebugLoc()); 945 } 946 947 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I 948 /// is passed, the message relates to that particular instruction. 949 #ifndef NDEBUG 950 static void debugVectorizationMessage(const StringRef Prefix, 951 const StringRef DebugMsg, 952 Instruction *I) { 953 dbgs() << "LV: " << Prefix << DebugMsg; 954 if (I != nullptr) 955 dbgs() << " " << *I; 956 else 957 dbgs() << '.'; 958 dbgs() << '\n'; 959 } 960 #endif 961 962 /// Create an analysis remark that explains why vectorization failed 963 /// 964 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 965 /// RemarkName is the identifier for the remark. If \p I is passed it is an 966 /// instruction that prevents vectorization. 
Otherwise \p TheLoop is used for 967 /// the location of the remark. \return the remark object that can be 968 /// streamed to. 969 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, 970 StringRef RemarkName, Loop *TheLoop, Instruction *I) { 971 Value *CodeRegion = TheLoop->getHeader(); 972 DebugLoc DL = TheLoop->getStartLoc(); 973 974 if (I) { 975 CodeRegion = I->getParent(); 976 // If there is no debug location attached to the instruction, revert back to 977 // using the loop's. 978 if (I->getDebugLoc()) 979 DL = I->getDebugLoc(); 980 } 981 982 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); 983 } 984 985 namespace llvm { 986 987 /// Return a value for Step multiplied by VF. 988 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, 989 int64_t Step) { 990 assert(Ty->isIntegerTy() && "Expected an integer step"); 991 Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue()); 992 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; 993 } 994 995 /// Return the runtime value for VF. 996 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { 997 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); 998 return VF.isScalable() ? B.CreateVScale(EC) : EC; 999 } 1000 1001 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, 1002 ElementCount VF) { 1003 assert(FTy->isFloatingPointTy() && "Expected floating point type!"); 1004 Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits()); 1005 Value *RuntimeVF = getRuntimeVF(B, IntTy, VF); 1006 return B.CreateUIToFP(RuntimeVF, FTy); 1007 } 1008 1009 void reportVectorizationFailure(const StringRef DebugMsg, 1010 const StringRef OREMsg, const StringRef ORETag, 1011 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1012 Instruction *I) { 1013 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); 1014 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1015 ORE->emit( 1016 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1017 << "loop not vectorized: " << OREMsg); 1018 } 1019 1020 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, 1021 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1022 Instruction *I) { 1023 LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); 1024 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1025 ORE->emit( 1026 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1027 << Msg); 1028 } 1029 1030 } // end namespace llvm 1031 1032 #ifndef NDEBUG 1033 /// \return string containing a file name and a line # for the given loop. 1034 static std::string getDebugLocString(const Loop *L) { 1035 std::string Result; 1036 if (L) { 1037 raw_string_ostream OS(Result); 1038 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 1039 LoopDbgLoc.print(OS); 1040 else 1041 // Just print the module name. 1042 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 1043 OS.flush(); 1044 } 1045 return Result; 1046 } 1047 #endif 1048 1049 void InnerLoopVectorizer::addNewMetadata(Instruction *To, 1050 const Instruction *Orig) { 1051 // If the loop was versioned with memchecks, add the corresponding no-alias 1052 // metadata. 
1053 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) 1054 LVer->annotateInstWithNoAlias(To, Orig); 1055 } 1056 1057 void InnerLoopVectorizer::collectPoisonGeneratingRecipes( 1058 VPTransformState &State) { 1059 1060 // Collect recipes in the backward slice of `Root` that may generate a poison 1061 // value that is used after vectorization. 1062 SmallPtrSet<VPRecipeBase *, 16> Visited; 1063 auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { 1064 SmallVector<VPRecipeBase *, 16> Worklist; 1065 Worklist.push_back(Root); 1066 1067 // Traverse the backward slice of Root through its use-def chain. 1068 while (!Worklist.empty()) { 1069 VPRecipeBase *CurRec = Worklist.back(); 1070 Worklist.pop_back(); 1071 1072 if (!Visited.insert(CurRec).second) 1073 continue; 1074 1075 // Prune search if we find another recipe generating a widen memory 1076 // instruction. Widen memory instructions involved in address computation 1077 // will lead to gather/scatter instructions, which don't need to be 1078 // handled. 1079 if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || 1080 isa<VPInterleaveRecipe>(CurRec) || 1081 isa<VPScalarIVStepsRecipe>(CurRec) || 1082 isa<VPCanonicalIVPHIRecipe>(CurRec)) 1083 continue; 1084 1085 // This recipe contributes to the address computation of a widen 1086 // load/store. Collect recipe if its underlying instruction has 1087 // poison-generating flags. 1088 Instruction *Instr = CurRec->getUnderlyingInstr(); 1089 if (Instr && Instr->hasPoisonGeneratingFlags()) 1090 State.MayGeneratePoisonRecipes.insert(CurRec); 1091 1092 // Add new definitions to the worklist. 1093 for (VPValue *operand : CurRec->operands()) 1094 if (VPDef *OpDef = operand->getDef()) 1095 Worklist.push_back(cast<VPRecipeBase>(OpDef)); 1096 } 1097 }); 1098 1099 // Traverse all the recipes in the VPlan and collect the poison-generating 1100 // recipes in the backward slice starting at the address of a VPWidenRecipe or 1101 // VPInterleaveRecipe. 1102 auto Iter = depth_first( 1103 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry())); 1104 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 1105 for (VPRecipeBase &Recipe : *VPBB) { 1106 if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) { 1107 Instruction &UnderlyingInstr = WidenRec->getIngredient(); 1108 VPDef *AddrDef = WidenRec->getAddr()->getDef(); 1109 if (AddrDef && WidenRec->isConsecutive() && 1110 Legal->blockNeedsPredication(UnderlyingInstr.getParent())) 1111 collectPoisonGeneratingInstrsInBackwardSlice( 1112 cast<VPRecipeBase>(AddrDef)); 1113 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) { 1114 VPDef *AddrDef = InterleaveRec->getAddr()->getDef(); 1115 if (AddrDef) { 1116 // Check if any member of the interleave group needs predication. 
1117 const InterleaveGroup<Instruction> *InterGroup = 1118 InterleaveRec->getInterleaveGroup(); 1119 bool NeedPredication = false; 1120 for (int I = 0, NumMembers = InterGroup->getNumMembers(); 1121 I < NumMembers; ++I) { 1122 Instruction *Member = InterGroup->getMember(I); 1123 if (Member) 1124 NeedPredication |= 1125 Legal->blockNeedsPredication(Member->getParent()); 1126 } 1127 1128 if (NeedPredication) 1129 collectPoisonGeneratingInstrsInBackwardSlice( 1130 cast<VPRecipeBase>(AddrDef)); 1131 } 1132 } 1133 } 1134 } 1135 } 1136 1137 void InnerLoopVectorizer::addMetadata(Instruction *To, 1138 Instruction *From) { 1139 propagateMetadata(To, From); 1140 addNewMetadata(To, From); 1141 } 1142 1143 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, 1144 Instruction *From) { 1145 for (Value *V : To) { 1146 if (Instruction *I = dyn_cast<Instruction>(V)) 1147 addMetadata(I, From); 1148 } 1149 } 1150 1151 PHINode *InnerLoopVectorizer::getReductionResumeValue( 1152 const RecurrenceDescriptor &RdxDesc) { 1153 auto It = ReductionResumeValues.find(&RdxDesc); 1154 assert(It != ReductionResumeValues.end() && 1155 "Expected to find a resume value for the reduction."); 1156 return It->second; 1157 } 1158 1159 namespace llvm { 1160 1161 // Loop vectorization cost-model hints how the scalar epilogue loop should be 1162 // lowered. 1163 enum ScalarEpilogueLowering { 1164 1165 // The default: allowing scalar epilogues. 1166 CM_ScalarEpilogueAllowed, 1167 1168 // Vectorization with OptForSize: don't allow epilogues. 1169 CM_ScalarEpilogueNotAllowedOptSize, 1170 1171 // A special case of vectorisation with OptForSize: loops with a very small 1172 // trip count are considered for vectorization under OptForSize, thereby 1173 // making sure the cost of their loop body is dominant, free of runtime 1174 // guards and scalar iteration overheads. 1175 CM_ScalarEpilogueNotAllowedLowTripLoop, 1176 1177 // Loop hint predicate indicating an epilogue is undesired. 1178 CM_ScalarEpilogueNotNeededUsePredicate, 1179 1180 // Directive indicating we must either tail fold or not vectorize 1181 CM_ScalarEpilogueNotAllowedUsePredicate 1182 }; 1183 1184 /// ElementCountComparator creates a total ordering for ElementCount 1185 /// for the purposes of using it in a set structure. 1186 struct ElementCountComparator { 1187 bool operator()(const ElementCount &LHS, const ElementCount &RHS) const { 1188 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < 1189 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); 1190 } 1191 }; 1192 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>; 1193 1194 /// LoopVectorizationCostModel - estimates the expected speedups due to 1195 /// vectorization. 1196 /// In many cases vectorization is not profitable. This can happen because of 1197 /// a number of reasons. In this class we mainly attempt to predict the 1198 /// expected speedup/slowdowns due to the supported instruction set. We use the 1199 /// TargetTransformInfo to query the different backends for the cost of 1200 /// different operations. 
1201 class LoopVectorizationCostModel { 1202 public: 1203 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 1204 PredicatedScalarEvolution &PSE, LoopInfo *LI, 1205 LoopVectorizationLegality *Legal, 1206 const TargetTransformInfo &TTI, 1207 const TargetLibraryInfo *TLI, DemandedBits *DB, 1208 AssumptionCache *AC, 1209 OptimizationRemarkEmitter *ORE, const Function *F, 1210 const LoopVectorizeHints *Hints, 1211 InterleavedAccessInfo &IAI) 1212 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 1213 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 1214 Hints(Hints), InterleaveInfo(IAI) {} 1215 1216 /// \return An upper bound for the vectorization factors (both fixed and 1217 /// scalable). If the factors are 0, vectorization and interleaving should be 1218 /// avoided up front. 1219 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC); 1220 1221 /// \return True if runtime checks are required for vectorization, and false 1222 /// otherwise. 1223 bool runtimeChecksRequired(); 1224 1225 /// \return The most profitable vectorization factor and the cost of that VF. 1226 /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO 1227 /// then this vectorization factor will be selected if vectorization is 1228 /// possible. 1229 VectorizationFactor 1230 selectVectorizationFactor(const ElementCountSet &CandidateVFs); 1231 1232 VectorizationFactor 1233 selectEpilogueVectorizationFactor(const ElementCount MaxVF, 1234 const LoopVectorizationPlanner &LVP); 1235 1236 /// Setup cost-based decisions for user vectorization factor. 1237 /// \return true if the UserVF is a feasible VF to be chosen. 1238 bool selectUserVectorizationFactor(ElementCount UserVF) { 1239 collectUniformsAndScalars(UserVF); 1240 collectInstsToScalarize(UserVF); 1241 return expectedCost(UserVF).first.isValid(); 1242 } 1243 1244 /// \return The size (in bits) of the smallest and widest types in the code 1245 /// that needs to be vectorized. We ignore values that remain scalar such as 1246 /// 64 bit loop indices. 1247 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1248 1249 /// \return The desired interleave count. 1250 /// If interleave count has been specified by metadata it will be returned. 1251 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1252 /// are the selected vectorization factor and the cost of the selected VF. 1253 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); 1254 1255 /// Memory access instruction may be vectorized in more than one way. 1256 /// Form of instruction after vectorization depends on cost. 1257 /// This function takes cost-based decisions for Load/Store instructions 1258 /// and collects them in a map. This decisions map is used for building 1259 /// the lists of loop-uniform and loop-scalar instructions. 1260 /// The calculated cost is saved with widening decision in order to 1261 /// avoid redundant calculations. 1262 void setCostBasedWideningDecision(ElementCount VF); 1263 1264 /// A struct that represents some properties of the register usage 1265 /// of a loop. 1266 struct RegisterUsage { 1267 /// Holds the number of loop invariant values that are used in the loop. 1268 /// The key is ClassID of target-provided register class. 1269 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1270 /// Holds the maximum number of concurrent live intervals in the loop. 1271 /// The key is ClassID of target-provided register class. 
1272 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1273 }; 1274 1275 /// \return Returns information about the register usages of the loop for the 1276 /// given vectorization factors. 1277 SmallVector<RegisterUsage, 8> 1278 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1279 1280 /// Collect values we want to ignore in the cost model. 1281 void collectValuesToIgnore(); 1282 1283 /// Collect all element types in the loop for which widening is needed. 1284 void collectElementTypesForWidening(); 1285 1286 /// Split reductions into those that happen in the loop, and those that happen 1287 /// outside. In loop reductions are collected into InLoopReductionChains. 1288 void collectInLoopReductions(); 1289 1290 /// Returns true if we should use strict in-order reductions for the given 1291 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1292 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1293 /// of FP operations. 1294 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const { 1295 return !Hints->allowReordering() && RdxDesc.isOrdered(); 1296 } 1297 1298 /// \returns The smallest bitwidth each instruction can be represented with. 1299 /// The vector equivalents of these instructions should be truncated to this 1300 /// type. 1301 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1302 return MinBWs; 1303 } 1304 1305 /// \returns True if it is more profitable to scalarize instruction \p I for 1306 /// vectorization factor \p VF. 1307 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1308 assert(VF.isVector() && 1309 "Profitable to scalarize relevant only for VF > 1."); 1310 1311 // Cost model is not run in the VPlan-native path - return conservative 1312 // result until this changes. 1313 if (EnableVPlanNativePath) 1314 return false; 1315 1316 auto Scalars = InstsToScalarize.find(VF); 1317 assert(Scalars != InstsToScalarize.end() && 1318 "VF not yet analyzed for scalarization profitability"); 1319 return Scalars->second.find(I) != Scalars->second.end(); 1320 } 1321 1322 /// Returns true if \p I is known to be uniform after vectorization. 1323 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1324 if (VF.isScalar()) 1325 return true; 1326 1327 // Cost model is not run in the VPlan-native path - return conservative 1328 // result until this changes. 1329 if (EnableVPlanNativePath) 1330 return false; 1331 1332 auto UniformsPerVF = Uniforms.find(VF); 1333 assert(UniformsPerVF != Uniforms.end() && 1334 "VF not yet analyzed for uniformity"); 1335 return UniformsPerVF->second.count(I); 1336 } 1337 1338 /// Returns true if \p I is known to be scalar after vectorization. 1339 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1340 if (VF.isScalar()) 1341 return true; 1342 1343 // Cost model is not run in the VPlan-native path - return conservative 1344 // result until this changes. 1345 if (EnableVPlanNativePath) 1346 return false; 1347 1348 auto ScalarsPerVF = Scalars.find(VF); 1349 assert(ScalarsPerVF != Scalars.end() && 1350 "Scalar values are not calculated for VF"); 1351 return ScalarsPerVF->second.count(I); 1352 } 1353 1354 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1355 /// for vectorization factor \p VF. 
1356 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1357 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1358 !isProfitableToScalarize(I, VF) && 1359 !isScalarAfterVectorization(I, VF); 1360 } 1361 1362 /// Decision that was taken during cost calculation for memory instruction. 1363 enum InstWidening { 1364 CM_Unknown, 1365 CM_Widen, // For consecutive accesses with stride +1. 1366 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1367 CM_Interleave, 1368 CM_GatherScatter, 1369 CM_Scalarize 1370 }; 1371 1372 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1373 /// instruction \p I and vector width \p VF. 1374 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, 1375 InstructionCost Cost) { 1376 assert(VF.isVector() && "Expected VF >=2"); 1377 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1378 } 1379 1380 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1381 /// interleaving group \p Grp and vector width \p VF. 1382 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, 1383 ElementCount VF, InstWidening W, 1384 InstructionCost Cost) { 1385 assert(VF.isVector() && "Expected VF >=2"); 1386 // Broadcast this decision to all instructions inside the group. The cost 1387 // will be assigned to one instruction only (the insert position). 1388 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1389 if (auto *I = Grp->getMember(i)) { 1390 if (Grp->getInsertPos() == I) 1391 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1392 else 1393 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1394 } 1395 } 1396 } 1397 1398 /// Return the cost model decision for the given instruction \p I and vector 1399 /// width \p VF. Return CM_Unknown if this instruction did not pass 1400 /// through the cost modeling. 1401 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1402 assert(VF.isVector() && "Expected VF to be a vector VF"); 1403 // Cost model is not run in the VPlan-native path - return conservative 1404 // result until this changes. 1405 if (EnableVPlanNativePath) 1406 return CM_GatherScatter; 1407 1408 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1409 auto Itr = WideningDecisions.find(InstOnVF); 1410 if (Itr == WideningDecisions.end()) 1411 return CM_Unknown; 1412 return Itr->second.first; 1413 } 1414 1415 /// Return the vectorization cost for the given instruction \p I and vector 1416 /// width \p VF. 1417 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1418 assert(VF.isVector() && "Expected VF >=2"); 1419 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1420 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1421 "The cost is not calculated"); 1422 return WideningDecisions[InstOnVF].second; 1423 } 1424 1425 /// Return True if instruction \p I is an optimizable truncate whose operand 1426 /// is an induction variable. Such a truncate will be removed by adding a new 1427 /// induction variable with the destination type. 1428 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1429 // If the instruction is not a truncate, return false. 1430 auto *Trunc = dyn_cast<TruncInst>(I); 1431 if (!Trunc) 1432 return false; 1433 1434 // Get the source and destination types of the truncate.
1435 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1436 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1437 1438 // If the truncate is free for the given types, return false. Replacing a 1439 // free truncate with an induction variable would add an induction variable 1440 // update instruction to each iteration of the loop. We exclude from this 1441 // check the primary induction variable since it will need an update 1442 // instruction regardless. 1443 Value *Op = Trunc->getOperand(0); 1444 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1445 return false; 1446 1447 // If the truncated value is not an induction variable, return false. 1448 return Legal->isInductionPhi(Op); 1449 } 1450 1451 /// Collects the instructions to scalarize for each predicated instruction in 1452 /// the loop. 1453 void collectInstsToScalarize(ElementCount VF); 1454 1455 /// Collect Uniform and Scalar values for the given \p VF. 1456 /// The sets depend on CM decision for Load/Store instructions 1457 /// that may be vectorized as interleave, gather-scatter or scalarized. 1458 void collectUniformsAndScalars(ElementCount VF) { 1459 // Do the analysis once. 1460 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1461 return; 1462 setCostBasedWideningDecision(VF); 1463 collectLoopUniforms(VF); 1464 collectLoopScalars(VF); 1465 } 1466 1467 /// Returns true if the target machine supports masked store operation 1468 /// for the given \p DataType and kind of access to \p Ptr. 1469 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1470 return Legal->isConsecutivePtr(DataType, Ptr) && 1471 TTI.isLegalMaskedStore(DataType, Alignment); 1472 } 1473 1474 /// Returns true if the target machine supports masked load operation 1475 /// for the given \p DataType and kind of access to \p Ptr. 1476 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1477 return Legal->isConsecutivePtr(DataType, Ptr) && 1478 TTI.isLegalMaskedLoad(DataType, Alignment); 1479 } 1480 1481 /// Returns true if the target machine can represent \p V as a masked gather 1482 /// or scatter operation. 1483 bool isLegalGatherOrScatter(Value *V, 1484 ElementCount VF = ElementCount::getFixed(1)) { 1485 bool LI = isa<LoadInst>(V); 1486 bool SI = isa<StoreInst>(V); 1487 if (!LI && !SI) 1488 return false; 1489 auto *Ty = getLoadStoreType(V); 1490 Align Align = getLoadStoreAlignment(V); 1491 if (VF.isVector()) 1492 Ty = VectorType::get(Ty, VF); 1493 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1494 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1495 } 1496 1497 /// Returns true if the target machine supports all of the reduction 1498 /// variables found for the given VF. 1499 bool canVectorizeReductions(ElementCount VF) const { 1500 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1501 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1502 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1503 })); 1504 } 1505 1506 /// Returns true if \p I is an instruction that will be scalarized with 1507 /// predication when vectorizing \p I with vectorization factor \p VF. Such 1508 /// instructions include conditional stores and instructions that may divide 1509 /// by zero. 1510 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1511 1512 // Returns true if \p I is an instruction that will be predicated either 1513 // through scalar predication or masked load/store or masked gather/scatter. 
1514 // \p VF is the vectorization factor that will be used to vectorize \p I. 1515 // Superset of instructions that return true for isScalarWithPredication. 1516 bool isPredicatedInst(Instruction *I, ElementCount VF, 1517 bool IsKnownUniform = false) { 1518 // When we know the load is uniform and the original scalar loop was not 1519 // predicated we don't need to mark it as a predicated instruction. Any 1520 // vectorised blocks created when tail-folding are something artificial we 1521 // have introduced and we know there is always at least one active lane. 1522 // That's why we call Legal->blockNeedsPredication here because it doesn't 1523 // query tail-folding. 1524 if (IsKnownUniform && isa<LoadInst>(I) && 1525 !Legal->blockNeedsPredication(I->getParent())) 1526 return false; 1527 if (!blockNeedsPredicationForAnyReason(I->getParent())) 1528 return false; 1529 // Loads and stores that need some form of masked operation are predicated 1530 // instructions. 1531 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1532 return Legal->isMaskRequired(I); 1533 return isScalarWithPredication(I, VF); 1534 } 1535 1536 /// Returns true if \p I is a memory instruction with consecutive memory 1537 /// access that can be widened. 1538 bool 1539 memoryInstructionCanBeWidened(Instruction *I, 1540 ElementCount VF = ElementCount::getFixed(1)); 1541 1542 /// Returns true if \p I is a memory instruction in an interleaved-group 1543 /// of memory accesses that can be vectorized with wide vector loads/stores 1544 /// and shuffles. 1545 bool 1546 interleavedAccessCanBeWidened(Instruction *I, 1547 ElementCount VF = ElementCount::getFixed(1)); 1548 1549 /// Check if \p Instr belongs to any interleaved access group. 1550 bool isAccessInterleaved(Instruction *Instr) { 1551 return InterleaveInfo.isInterleaved(Instr); 1552 } 1553 1554 /// Get the interleaved access group that \p Instr belongs to. 1555 const InterleaveGroup<Instruction> * 1556 getInterleavedAccessGroup(Instruction *Instr) { 1557 return InterleaveInfo.getInterleaveGroup(Instr); 1558 } 1559 1560 /// Returns true if we're required to use a scalar epilogue for at least 1561 /// the final iteration of the original loop. 1562 bool requiresScalarEpilogue(ElementCount VF) const { 1563 if (!isScalarEpilogueAllowed()) 1564 return false; 1565 // If we might exit from anywhere but the latch, must run the exiting 1566 // iteration in scalar form. 1567 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1568 return true; 1569 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); 1570 } 1571 1572 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1573 /// loop hint annotation. 1574 bool isScalarEpilogueAllowed() const { 1575 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1576 } 1577 1578 /// Returns true if all loop blocks should be masked to fold tail loop. 1579 bool foldTailByMasking() const { return FoldTailByMasking; } 1580 1581 /// Returns true if the instructions in this block requires predication 1582 /// for any reason, e.g. because tail folding now requires a predicate 1583 /// or because the block in the original loop was predicated. 1584 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1585 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1586 } 1587 1588 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1589 /// nodes to the chain of instructions representing the reductions. Uses a 1590 /// MapVector to ensure deterministic iteration order. 
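/// For illustration (a hypothetical reduction, not taken from this file): for
///   %sum = phi i32 [ 0, %preheader ], [ %add2, %loop ]
///   %add1 = add i32 %sum, %a
///   %add2 = add i32 %add1, %b
/// the map would associate %sum with the chain {%add1, %add2}.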
1591 using ReductionChainMap = 1592 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1593 1594 /// Return the chain of instructions representing an inloop reduction. 1595 const ReductionChainMap &getInLoopReductionChains() const { 1596 return InLoopReductionChains; 1597 } 1598 1599 /// Returns true if the Phi is part of an inloop reduction. 1600 bool isInLoopReduction(PHINode *Phi) const { 1601 return InLoopReductionChains.count(Phi); 1602 } 1603 1604 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1605 /// with factor VF. Return the cost of the instruction, including 1606 /// scalarization overhead if it's needed. 1607 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1608 1609 /// Estimate cost of a call instruction CI if it were vectorized with factor 1610 /// VF. Return the cost of the instruction, including scalarization overhead 1611 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1612 /// scalarized - 1613 /// i.e. either a vector version isn't available or it is too expensive. 1614 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1615 bool &NeedToScalarize) const; 1616 1617 /// Returns true if the per-lane cost of VectorizationFactor A is lower than 1618 /// that of B. 1619 bool isMoreProfitable(const VectorizationFactor &A, 1620 const VectorizationFactor &B) const; 1621 1622 /// Invalidates decisions already taken by the cost model. 1623 void invalidateCostModelingDecisions() { 1624 WideningDecisions.clear(); 1625 Uniforms.clear(); 1626 Scalars.clear(); 1627 } 1628 1629 private: 1630 unsigned NumPredStores = 0; 1631 1632 /// Convenience function that returns the value of vscale_range if 1633 /// vscale_range.min == vscale_range.max, or otherwise returns the value 1634 /// returned by the corresponding TTI method. 1635 Optional<unsigned> getVScaleForTuning() const; 1636 1637 /// \return An upper bound for the vectorization factors for both 1638 /// fixed and scalable vectorization, where the minimum-known number of 1639 /// elements is a power-of-2 larger than zero. If scalable vectorization is 1640 /// disabled or unsupported, then the scalable part will be equal to 1641 /// ElementCount::getScalable(0). 1642 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, 1643 ElementCount UserVF, 1644 bool FoldTailByMasking); 1645 1646 /// \return the maximized element count based on the target's vector 1647 /// registers and the loop trip-count, but limited to a maximum safe VF. 1648 /// This is a helper function of computeFeasibleMaxVF. 1649 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, 1650 unsigned SmallestType, 1651 unsigned WidestType, 1652 ElementCount MaxSafeVF, 1653 bool FoldTailByMasking); 1654 1655 /// \return the maximum legal scalable VF, based on the safe max number 1656 /// of elements. 1657 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1658 1659 /// The vectorization cost is a combination of the cost itself and a boolean 1660 /// indicating whether any of the contributing operations will actually 1661 /// operate on vector values after type legalization in the backend. If this 1662 /// latter value is false, then all operations will be scalarized (i.e. no 1663 /// vectorization has actually taken place). 1664 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1665 1666 /// Returns the expected execution cost.
The unit of the cost does 1667 /// not matter because we use the 'cost' units to compare different 1668 /// vector widths. The cost that is returned is *not* normalized by 1669 /// the factor width. If \p Invalid is not nullptr, this function 1670 /// will add a pair(Instruction*, ElementCount) to \p Invalid for 1671 /// each instruction that has an Invalid cost for the given VF. 1672 using InstructionVFPair = std::pair<Instruction *, ElementCount>; 1673 VectorizationCostTy 1674 expectedCost(ElementCount VF, 1675 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); 1676 1677 /// Returns the execution time cost of an instruction for a given vector 1678 /// width. Vector width of one means scalar. 1679 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1680 1681 /// The cost-computation logic from getInstructionCost which provides 1682 /// the vector type as an output parameter. 1683 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1684 Type *&VectorTy); 1685 1686 /// Return the cost of instructions in an inloop reduction pattern, if I is 1687 /// part of that pattern. 1688 Optional<InstructionCost> 1689 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, 1690 TTI::TargetCostKind CostKind); 1691 1692 /// Calculate vectorization cost of memory instruction \p I. 1693 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1694 1695 /// The cost computation for a scalarized memory instruction. 1696 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1697 1698 /// The cost computation for an interleaving group of memory instructions. 1699 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1700 1701 /// The cost computation for a Gather/Scatter instruction. 1702 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1703 1704 /// The cost computation for widening instruction \p I with consecutive 1705 /// memory access. 1706 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1707 1708 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1709 /// Load: scalar load + broadcast. 1710 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1711 /// element) 1712 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1713 1714 /// Estimate the overhead of scalarizing an instruction. This is a 1715 /// convenience wrapper for the type-based getScalarizationOverhead API. 1716 InstructionCost getScalarizationOverhead(Instruction *I, 1717 ElementCount VF) const; 1718 1719 /// Returns whether the instruction is a load or store and will be emitted 1720 /// as a vector operation. 1721 bool isConsecutiveLoadOrStore(Instruction *I); 1722 1723 /// Returns true if an artificially high cost for emulated masked memrefs 1724 /// should be used. 1725 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); 1726 1727 /// Map of scalar integer values to the smallest bitwidth they can be legally 1728 /// represented as. The vector equivalents of these values should be truncated 1729 /// to this type. 1730 MapVector<Instruction *, uint64_t> MinBWs; 1731 1732 /// A type representing the costs for instructions if they were to be 1733 /// scalarized rather than vectorized. The entries are Instruction-Cost 1734 /// pairs.
1735 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; 1736 1737 /// A set containing all BasicBlocks that are known to be present after 1738 /// vectorization as predicated blocks. 1739 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; 1740 1741 /// Records whether it is allowed to have the original scalar loop execute at 1742 /// least once. This may be needed as a fallback loop in case runtime 1743 /// aliasing/dependence checks fail, or to handle the tail/remainder 1744 /// iterations when the trip count is unknown or is not divisible by the VF, 1745 /// or as a peel-loop to handle gaps in interleave-groups. 1746 /// Under optsize and when the trip count is very small we don't allow any 1747 /// iterations to execute in the scalar loop. 1748 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1749 1750 /// All blocks of the loop are to be masked to fold the tail of the scalar 1751 /// iterations. bool FoldTailByMasking = false; 1752 1753 /// A map holding scalar costs for different vectorization factors. The 1754 /// presence of a cost for an instruction in the mapping indicates that the 1755 /// instruction will be scalarized when vectorizing with the associated 1756 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1757 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1758 1759 /// Holds the instructions known to be uniform after vectorization. 1760 /// The data is collected per VF. 1761 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1762 1763 /// Holds the instructions known to be scalar after vectorization. 1764 /// The data is collected per VF. 1765 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1766 1767 /// Holds the instructions (address computations) that are forced to be 1768 /// scalarized. 1769 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1770 1771 /// PHINodes of the reductions that should be expanded in-loop along with 1772 /// their associated chains of reduction operations, in program order from top 1773 /// (PHI) to bottom. 1774 ReductionChainMap InLoopReductionChains; 1775 1776 /// A map of inloop reduction operations and their immediate chain operand. 1777 /// FIXME: This can be removed once reductions can be costed correctly in 1778 /// vplan. This was added to allow quick lookup of the inloop operations, 1779 /// without having to loop through InLoopReductionChains. 1780 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1781 1782 /// Returns the expected difference in cost from scalarizing the expression 1783 /// feeding a predicated instruction \p PredInst. The instructions to 1784 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1785 /// non-negative return value implies the expression will be scalarized. 1786 /// Currently, only single-use chains are considered for scalarization. 1787 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1788 ElementCount VF); 1789 1790 /// Collect the instructions that are uniform after vectorization. An 1791 /// instruction is uniform if we represent it with a single scalar value in 1792 /// the vectorized loop corresponding to each vector iteration. Examples of 1793 /// uniform instructions include pointer operands of consecutive or 1794 /// interleaved memory accesses. Note that although uniformity implies an 1795 /// instruction will be scalar, the reverse is not true.
In general, a 1796 /// scalarized instruction will be represented by VF scalar values in the 1797 /// vectorized loop, each corresponding to an iteration of the original 1798 /// scalar loop. 1799 void collectLoopUniforms(ElementCount VF); 1800 1801 /// Collect the instructions that are scalar after vectorization. An 1802 /// instruction is scalar if it is known to be uniform or will be scalarized 1803 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1804 /// to the list if they are used by a load/store instruction that is marked as 1805 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1806 /// VF values in the vectorized loop, each corresponding to an iteration of 1807 /// the original scalar loop. 1808 void collectLoopScalars(ElementCount VF); 1809 1810 /// Keeps cost model vectorization decision and cost for instructions. 1811 /// Right now it is used for memory instructions only. 1812 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1813 std::pair<InstWidening, InstructionCost>>; 1814 1815 DecisionList WideningDecisions; 1816 1817 /// Returns true if \p V is expected to be vectorized and it needs to be 1818 /// extracted. 1819 bool needsExtract(Value *V, ElementCount VF) const { 1820 Instruction *I = dyn_cast<Instruction>(V); 1821 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1822 TheLoop->isLoopInvariant(I)) 1823 return false; 1824 1825 // Assume we can vectorize V (and hence we need extraction) if the 1826 // scalars are not computed yet. This can happen, because it is called 1827 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1828 // the scalars are collected. That should be a safe assumption in most 1829 // cases, because we check if the operands have vectorizable types 1830 // beforehand in LoopVectorizationLegality. 1831 return Scalars.find(VF) == Scalars.end() || 1832 !isScalarAfterVectorization(I, VF); 1833 }; 1834 1835 /// Returns a range containing only operands needing to be extracted. 1836 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1837 ElementCount VF) const { 1838 return SmallVector<Value *, 4>(make_filter_range( 1839 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1840 } 1841 1842 /// Determines if we have the infrastructure to vectorize loop \p L and its 1843 /// epilogue, assuming the main loop is vectorized by \p VF. 1844 bool isCandidateForEpilogueVectorization(const Loop &L, 1845 const ElementCount VF) const; 1846 1847 /// Returns true if epilogue vectorization is considered profitable, and 1848 /// false otherwise. 1849 /// \p VF is the vectorization factor chosen for the original loop. 1850 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1851 1852 public: 1853 /// The loop that we evaluate. 1854 Loop *TheLoop; 1855 1856 /// Predicated scalar evolution analysis. 1857 PredicatedScalarEvolution &PSE; 1858 1859 /// Loop Info analysis. 1860 LoopInfo *LI; 1861 1862 /// Vectorization legality. 1863 LoopVectorizationLegality *Legal; 1864 1865 /// Vector target information. 1866 const TargetTransformInfo &TTI; 1867 1868 /// Target Library Info. 1869 const TargetLibraryInfo *TLI; 1870 1871 /// Demanded bits analysis. 1872 DemandedBits *DB; 1873 1874 /// Assumption cache. 1875 AssumptionCache *AC; 1876 1877 /// Interface to emit optimization remarks. 1878 OptimizationRemarkEmitter *ORE; 1879 1880 const Function *TheFunction; 1881 1882 /// Loop Vectorize Hint. 
1883 const LoopVectorizeHints *Hints; 1884 1885 /// The interleave access information contains groups of interleaved accesses 1886 /// with the same stride that are close to each other. 1887 InterleavedAccessInfo &InterleaveInfo; 1888 1889 /// Values to ignore in the cost model. 1890 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1891 1892 /// Values to ignore in the cost model when VF > 1. 1893 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1894 1895 /// All element types found in the loop. 1896 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1897 1898 /// Profitable vector factors. 1899 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1900 }; 1901 } // end namespace llvm 1902 1903 /// Helper struct to manage generating runtime checks for vectorization. 1904 /// 1905 /// The runtime checks are created up-front in temporary blocks to allow better 1906 /// estimation of their cost, and are un-linked from the existing IR. After deciding to 1907 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1908 /// temporary blocks are completely removed. 1909 class GeneratedRTChecks { 1910 /// Basic block which contains the generated SCEV checks, if any. 1911 BasicBlock *SCEVCheckBlock = nullptr; 1912 1913 /// The value representing the result of the generated SCEV checks. If it is 1914 /// nullptr, either no SCEV checks have been generated or they have been used. 1915 Value *SCEVCheckCond = nullptr; 1916 1917 /// Basic block which contains the generated memory runtime checks, if any. 1918 BasicBlock *MemCheckBlock = nullptr; 1919 1920 /// The value representing the result of the generated memory runtime checks. 1921 /// If it is nullptr, either no memory runtime checks have been generated or 1922 /// they have been used. 1923 Value *MemRuntimeCheckCond = nullptr; 1924 1925 DominatorTree *DT; 1926 LoopInfo *LI; 1927 1928 SCEVExpander SCEVExp; 1929 SCEVExpander MemCheckExp; 1930 1931 public: 1932 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1933 const DataLayout &DL) 1934 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), 1935 MemCheckExp(SE, DL, "scev.check") {} 1936 1937 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1938 /// accurately estimate the cost of the runtime checks. The blocks are 1939 /// un-linked from the IR and are added back during vector code generation. If 1940 /// there is no vector code generation, the check blocks are removed 1941 /// completely. 1942 void Create(Loop *L, const LoopAccessInfo &LAI, 1943 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { 1944 1945 BasicBlock *LoopHeader = L->getHeader(); 1946 BasicBlock *Preheader = L->getLoopPreheader(); 1947 1948 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1949 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1950 // may be used by SCEVExpander. The blocks will be un-linked from their 1951 // predecessors and removed from LI & DT at the end of the function. 1952 if (!UnionPred.isAlwaysTrue()) { 1953 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1954 nullptr, "vector.scevcheck"); 1955 1956 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1957 &UnionPred, SCEVCheckBlock->getTerminator()); 1958 } 1959 1960 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1961 if (RtPtrChecking.Need) { 1962 auto *Pred = SCEVCheckBlock ?
SCEVCheckBlock : Preheader; 1963 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1964 "vector.memcheck"); 1965 1966 auto DiffChecks = RtPtrChecking.getDiffChecks(); 1967 if (DiffChecks) { 1968 MemRuntimeCheckCond = addDiffRuntimeChecks( 1969 MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp, 1970 [VF](IRBuilderBase &B, unsigned Bits) { 1971 return getRuntimeVF(B, B.getIntNTy(Bits), VF); 1972 }, 1973 IC); 1974 } else { 1975 MemRuntimeCheckCond = 1976 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1977 RtPtrChecking.getChecks(), MemCheckExp); 1978 } 1979 assert(MemRuntimeCheckCond && 1980 "no RT checks generated although RtPtrChecking " 1981 "claimed checks are required"); 1982 } 1983 1984 if (!MemCheckBlock && !SCEVCheckBlock) 1985 return; 1986 1987 // Unhook the temporary block with the checks, update various places 1988 // accordingly. 1989 if (SCEVCheckBlock) 1990 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1991 if (MemCheckBlock) 1992 MemCheckBlock->replaceAllUsesWith(Preheader); 1993 1994 if (SCEVCheckBlock) { 1995 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1996 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1997 Preheader->getTerminator()->eraseFromParent(); 1998 } 1999 if (MemCheckBlock) { 2000 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2001 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2002 Preheader->getTerminator()->eraseFromParent(); 2003 } 2004 2005 DT->changeImmediateDominator(LoopHeader, Preheader); 2006 if (MemCheckBlock) { 2007 DT->eraseNode(MemCheckBlock); 2008 LI->removeBlock(MemCheckBlock); 2009 } 2010 if (SCEVCheckBlock) { 2011 DT->eraseNode(SCEVCheckBlock); 2012 LI->removeBlock(SCEVCheckBlock); 2013 } 2014 } 2015 2016 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2017 /// unused. 2018 ~GeneratedRTChecks() { 2019 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2020 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2021 if (!SCEVCheckCond) 2022 SCEVCleaner.markResultUsed(); 2023 2024 if (!MemRuntimeCheckCond) 2025 MemCheckCleaner.markResultUsed(); 2026 2027 if (MemRuntimeCheckCond) { 2028 auto &SE = *MemCheckExp.getSE(); 2029 // Memory runtime check generation creates compares that use expanded 2030 // values. Remove them before running the SCEVExpanderCleaners. 2031 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2032 if (MemCheckExp.isInsertedInstruction(&I)) 2033 continue; 2034 SE.forgetValue(&I); 2035 I.eraseFromParent(); 2036 } 2037 } 2038 MemCheckCleaner.cleanup(); 2039 SCEVCleaner.cleanup(); 2040 2041 if (SCEVCheckCond) 2042 SCEVCheckBlock->eraseFromParent(); 2043 if (MemRuntimeCheckCond) 2044 MemCheckBlock->eraseFromParent(); 2045 } 2046 2047 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2048 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2049 /// depending on the generated condition. 2050 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2051 BasicBlock *LoopVectorPreHeader, 2052 BasicBlock *LoopExitBlock) { 2053 if (!SCEVCheckCond) 2054 return nullptr; 2055 2056 Value *Cond = SCEVCheckCond; 2057 // Mark the check as used, to prevent it from being removed during cleanup. 
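    // (Clearing SCEVCheckCond is what marks it as used: the destructor above
    // only erases SCEVCheckBlock while SCEVCheckCond is still non-null.)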
2058 SCEVCheckCond = nullptr; 2059 if (auto *C = dyn_cast<ConstantInt>(Cond)) 2060 if (C->isZero()) 2061 return nullptr; 2062 2063 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2064 2065 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2066 // Create new preheader for vector loop. 2067 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2068 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2069 2070 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2071 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2072 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2073 SCEVCheckBlock); 2074 2075 DT->addNewBlock(SCEVCheckBlock, Pred); 2076 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2077 2078 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), 2079 BranchInst::Create(Bypass, LoopVectorPreHeader, Cond)); 2080 return SCEVCheckBlock; 2081 } 2082 2083 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2084 /// the branches to branch to the vector preheader or \p Bypass, depending on 2085 /// the generated condition. 2086 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2087 BasicBlock *LoopVectorPreHeader) { 2088 // Check if we generated code that checks in runtime if arrays overlap. 2089 if (!MemRuntimeCheckCond) 2090 return nullptr; 2091 2092 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2093 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2094 MemCheckBlock); 2095 2096 DT->addNewBlock(MemCheckBlock, Pred); 2097 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2098 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2099 2100 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2101 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2102 2103 ReplaceInstWithInst( 2104 MemCheckBlock->getTerminator(), 2105 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2106 MemCheckBlock->getTerminator()->setDebugLoc( 2107 Pred->getTerminator()->getDebugLoc()); 2108 2109 // Mark the check as used, to prevent it from being removed during cleanup. 2110 MemRuntimeCheckCond = nullptr; 2111 return MemCheckBlock; 2112 } 2113 }; 2114 2115 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2116 // vectorization. The loop needs to be annotated with #pragma omp simd 2117 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2118 // vector length information is not provided, vectorization is not considered 2119 // explicit. Interleave hints are not allowed either. These limitations will be 2120 // relaxed in the future. 2121 // Please, note that we are currently forced to abuse the pragma 'clang 2122 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2123 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2124 // provides *explicit vectorization hints* (LV can bypass legal checks and 2125 // assume that vectorization is legal). However, both hints are implemented 2126 // using the same metadata (llvm.loop.vectorize, processed by 2127 // LoopVectorizeHints). This will be fixed in the future when the native IR 2128 // representation for pragma 'omp simd' is introduced. 2129 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2130 OptimizationRemarkEmitter *ORE) { 2131 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2132 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2133 2134 // Only outer loops with an explicit vectorization hint are supported. 
2135 // Unannotated outer loops are ignored. 2136 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2137 return false; 2138 2139 Function *Fn = OuterLp->getHeader()->getParent(); 2140 if (!Hints.allowVectorization(Fn, OuterLp, 2141 true /*VectorizeOnlyWhenForced*/)) { 2142 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2143 return false; 2144 } 2145 2146 if (Hints.getInterleave() > 1) { 2147 // TODO: Interleave support is future work. 2148 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2149 "outer loops.\n"); 2150 Hints.emitRemarkWithHints(); 2151 return false; 2152 } 2153 2154 return true; 2155 } 2156 2157 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2158 OptimizationRemarkEmitter *ORE, 2159 SmallVectorImpl<Loop *> &V) { 2160 // Collect inner loops and outer loops without irreducible control flow. For 2161 // now, only collect outer loops that have explicit vectorization hints. If we 2162 // are stress testing the VPlan H-CFG construction, we collect the outermost 2163 // loop of every loop nest. 2164 if (L.isInnermost() || VPlanBuildStressTest || 2165 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2166 LoopBlocksRPO RPOT(&L); 2167 RPOT.perform(LI); 2168 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2169 V.push_back(&L); 2170 // TODO: Collect inner loops inside marked outer loops in case 2171 // vectorization fails for the outer loop. Do not invoke 2172 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2173 // already known to be reducible. We can use an inherited attribute for 2174 // that. 2175 return; 2176 } 2177 } 2178 for (Loop *InnerL : L) 2179 collectSupportedLoops(*InnerL, LI, ORE, V); 2180 } 2181 2182 namespace { 2183 2184 /// The LoopVectorize Pass. 2185 struct LoopVectorize : public FunctionPass { 2186 /// Pass identification, replacement for typeid 2187 static char ID; 2188 2189 LoopVectorizePass Impl; 2190 2191 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2192 bool VectorizeOnlyWhenForced = false) 2193 : FunctionPass(ID), 2194 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2195 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2196 } 2197 2198 bool runOnFunction(Function &F) override { 2199 if (skipFunction(F)) 2200 return false; 2201 2202 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2203 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2204 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2205 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2206 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2207 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2208 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2209 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2210 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2211 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2212 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2213 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2214 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2215 2216 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2217 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2218 2219 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2220 GetLAA, *ORE, PSI).MadeAnyChange; 2221 } 2222 2223 void getAnalysisUsage(AnalysisUsage &AU) const override { 2224 AU.addRequired<AssumptionCacheTracker>(); 2225 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2226 AU.addRequired<DominatorTreeWrapperPass>(); 2227 AU.addRequired<LoopInfoWrapperPass>(); 2228 AU.addRequired<ScalarEvolutionWrapperPass>(); 2229 AU.addRequired<TargetTransformInfoWrapperPass>(); 2230 AU.addRequired<AAResultsWrapperPass>(); 2231 AU.addRequired<LoopAccessLegacyAnalysis>(); 2232 AU.addRequired<DemandedBitsWrapperPass>(); 2233 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2234 AU.addRequired<InjectTLIMappingsLegacy>(); 2235 2236 // We currently do not preserve loopinfo/dominator analyses with outer loop 2237 // vectorization. Until this is addressed, mark these analyses as preserved 2238 // only for non-VPlan-native path. 2239 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2240 if (!EnableVPlanNativePath) { 2241 AU.addPreserved<LoopInfoWrapperPass>(); 2242 AU.addPreserved<DominatorTreeWrapperPass>(); 2243 } 2244 2245 AU.addPreserved<BasicAAWrapperPass>(); 2246 AU.addPreserved<GlobalsAAWrapperPass>(); 2247 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2248 } 2249 }; 2250 2251 } // end anonymous namespace 2252 2253 //===----------------------------------------------------------------------===// 2254 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer, 2255 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2256 //===----------------------------------------------------------------------===// 2257 2258 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2259 // We need to place the broadcast of invariant variables outside the loop, 2260 // but only if it's proven safe to do so. Otherwise, the broadcast will be 2261 // placed inside the vector loop body. 2262 Instruction *Instr = dyn_cast<Instruction>(V); 2263 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2264 (!Instr || 2265 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2266 // Place the code for broadcasting invariant variables in the new preheader. 2267 IRBuilder<>::InsertPointGuard Guard(Builder); 2268 if (SafeToHoist) 2269 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2270 2271 // Broadcast the scalar into all locations in the vector. 2272 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2273 2274 return Shuf; 2275 } 2276 2277 /// This function adds 2278 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2279 /// to each vector element of Val. The sequence starts at StartIdx. 2280 /// \p BinOp is only relevant for FP induction variables.
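/// For illustration only (assumed values, not from a test): with VF = 4,
/// Val = <4 x i32> zeroinitializer, StartIdx = 0 and Step = 2, the generated
/// code is roughly
///   %init = <i32 0, i32 1, i32 2, i32 3>   ; stepvector + splat(StartIdx)
///   %scaled = mul %init, splat(i32 2)      ; <0, 2, 4, 6>
///   %induction = add <4 x i32> zeroinitializer, %scaled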
2281 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2282 Instruction::BinaryOps BinOp, ElementCount VF, 2283 IRBuilderBase &Builder) { 2284 assert(VF.isVector() && "only vector VFs are supported"); 2285 2286 // Create and check the types. 2287 auto *ValVTy = cast<VectorType>(Val->getType()); 2288 ElementCount VLen = ValVTy->getElementCount(); 2289 2290 Type *STy = Val->getType()->getScalarType(); 2291 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2292 "Induction Step must be an integer or FP"); 2293 assert(Step->getType() == STy && "Step has wrong type"); 2294 2295 SmallVector<Constant *, 8> Indices; 2296 2297 // Create a vector of consecutive numbers from zero to VF. 2298 VectorType *InitVecValVTy = ValVTy; 2299 if (STy->isFloatingPointTy()) { 2300 Type *InitVecValSTy = 2301 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2302 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2303 } 2304 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2305 2306 // Splat the StartIdx 2307 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2308 2309 if (STy->isIntegerTy()) { 2310 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2311 Step = Builder.CreateVectorSplat(VLen, Step); 2312 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2313 // FIXME: The newly created binary instructions should contain nsw/nuw 2314 // flags, which can be found from the original scalar operations. 2315 Step = Builder.CreateMul(InitVec, Step); 2316 return Builder.CreateAdd(Val, Step, "induction"); 2317 } 2318 2319 // Floating point induction. 2320 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2321 "Binary Opcode should be specified for FP induction"); 2322 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2323 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2324 2325 Step = Builder.CreateVectorSplat(VLen, Step); 2326 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2327 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2328 } 2329 2330 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 2331 /// variable on which to base the steps, \p Step is the size of the step. 2332 static void buildScalarSteps(Value *ScalarIV, Value *Step, 2333 const InductionDescriptor &ID, VPValue *Def, 2334 VPTransformState &State) { 2335 IRBuilderBase &Builder = State.Builder; 2336 // We shouldn't have to build scalar steps if we aren't vectorizing. 2337 assert(State.VF.isVector() && "VF should be greater than one"); 2338 // Get the value type and ensure it and the step have the same integer type. 2339 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2340 assert(ScalarIVTy == Step->getType() && 2341 "Val and Step should have the same type"); 2342 2343 // We build scalar steps for both integer and floating-point induction 2344 // variables. Here, we determine the kind of arithmetic we will perform. 2345 Instruction::BinaryOps AddOp; 2346 Instruction::BinaryOps MulOp; 2347 if (ScalarIVTy->isIntegerTy()) { 2348 AddOp = Instruction::Add; 2349 MulOp = Instruction::Mul; 2350 } else { 2351 AddOp = ID.getInductionOpcode(); 2352 MulOp = Instruction::FMul; 2353 } 2354 2355 // Determine the number of scalars we need to generate for each unroll 2356 // iteration. 2357 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); 2358 unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 2359 // Compute the scalar steps and save the results in State. 
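  // For illustration (assumed values, not from the source): with a fixed
  // VF = 4, UF = 2 and Step = 1, StartIdx0 is 0 for part 0 and 4 for part 1,
  // so the steps added to ScalarIV are {0, 1, 2, 3} and {4, 5, 6, 7}.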
2360 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2361 ScalarIVTy->getScalarSizeInBits()); 2362 Type *VecIVTy = nullptr; 2363 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2364 if (!FirstLaneOnly && State.VF.isScalable()) { 2365 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2366 UnitStepVec = 2367 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2368 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2369 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2370 } 2371 2372 for (unsigned Part = 0; Part < State.UF; ++Part) { 2373 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2374 2375 if (!FirstLaneOnly && State.VF.isScalable()) { 2376 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2377 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2378 if (ScalarIVTy->isFloatingPointTy()) 2379 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2380 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2381 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2382 State.set(Def, Add, Part); 2383 // It's useful to record the lane values too for the known minimum number 2384 // of elements so we do those below. This improves the code quality when 2385 // trying to extract the first element, for example. 2386 } 2387 2388 if (ScalarIVTy->isFloatingPointTy()) 2389 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2390 2391 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2392 Value *StartIdx = Builder.CreateBinOp( 2393 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2394 // The step returned by `createStepForVF` is a runtime-evaluated value 2395 // when VF is scalable. Otherwise, it should be folded into a Constant. 2396 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2397 "Expected StartIdx to be folded to a constant when VF is not " 2398 "scalable"); 2399 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2400 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2401 State.set(Def, Add, VPIteration(Part, Lane)); 2402 } 2403 } 2404 } 2405 2406 // Generate code for the induction step. Note that induction steps are 2407 // required to be loop-invariant 2408 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, 2409 Instruction *InsertBefore, 2410 Loop *OrigLoop = nullptr) { 2411 const DataLayout &DL = SE.getDataLayout(); 2412 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && 2413 "Induction step should be loop invariant"); 2414 if (auto *E = dyn_cast<SCEVUnknown>(Step)) 2415 return E->getValue(); 2416 2417 SCEVExpander Exp(SE, DL, "induction"); 2418 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); 2419 } 2420 2421 /// Compute the transformed value of Index at offset StartValue using step 2422 /// StepValue. 2423 /// For integer induction, returns StartValue + Index * StepValue. 2424 /// For pointer induction, returns StartValue[Index * StepValue]. 2425 /// FIXME: The newly created binary instructions should contain nsw/nuw 2426 /// flags, which can be found from the original scalar operations. 2427 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, 2428 Value *StartValue, Value *Step, 2429 const InductionDescriptor &ID) { 2430 assert(Index->getType()->getScalarType() == Step->getType() && 2431 "Index scalar type does not match StepValue type"); 2432 2433 // Note: the IR at this point is broken. 
We cannot use SE to create any new 2434 // SCEV and then expand it, hoping that SCEV's simplification will give us 2435 // better code. Unfortunately, attempting to do so on invalid IR may 2436 // lead to various SCEV crashes. So all we can do is use the builder and rely 2437 // on InstCombine for future simplifications. Here we handle only some 2438 // trivial cases. 2439 auto CreateAdd = [&B](Value *X, Value *Y) { 2440 assert(X->getType() == Y->getType() && "Types don't match!"); 2441 if (auto *CX = dyn_cast<ConstantInt>(X)) 2442 if (CX->isZero()) 2443 return Y; 2444 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2445 if (CY->isZero()) 2446 return X; 2447 return B.CreateAdd(X, Y); 2448 }; 2449 2450 // We allow X to be a vector type, in which case Y will potentially be 2451 // splatted into a vector with the same element count. 2452 auto CreateMul = [&B](Value *X, Value *Y) { 2453 assert(X->getType()->getScalarType() == Y->getType() && 2454 "Types don't match!"); 2455 if (auto *CX = dyn_cast<ConstantInt>(X)) 2456 if (CX->isOne()) 2457 return Y; 2458 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2459 if (CY->isOne()) 2460 return X; 2461 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2462 if (XVTy && !isa<VectorType>(Y->getType())) 2463 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2464 return B.CreateMul(X, Y); 2465 }; 2466 2467 switch (ID.getKind()) { 2468 case InductionDescriptor::IK_IntInduction: { 2469 assert(!isa<VectorType>(Index->getType()) && 2470 "Vector indices not supported for integer inductions yet"); 2471 assert(Index->getType() == StartValue->getType() && 2472 "Index type does not match StartValue type"); 2473 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2474 return B.CreateSub(StartValue, Index); 2475 auto *Offset = CreateMul(Index, Step); 2476 return CreateAdd(StartValue, Offset); 2477 } 2478 case InductionDescriptor::IK_PtrInduction: { 2479 assert(isa<Constant>(Step) && 2480 "Expected constant step for pointer induction"); 2481 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); 2482 } 2483 case InductionDescriptor::IK_FpInduction: { 2484 assert(!isa<VectorType>(Index->getType()) && 2485 "Vector indices not supported for FP inductions yet"); 2486 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2487 auto InductionBinOp = ID.getInductionBinOp(); 2488 assert(InductionBinOp && 2489 (InductionBinOp->getOpcode() == Instruction::FAdd || 2490 InductionBinOp->getOpcode() == Instruction::FSub) && 2491 "Original bin op should be defined for FP induction"); 2492 2493 Value *MulExp = B.CreateFMul(Step, Index); 2494 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2495 "induction"); 2496 } 2497 case InductionDescriptor::IK_NoInduction: 2498 return nullptr; 2499 } 2500 llvm_unreachable("invalid enum"); 2501 } 2502 2503 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2504 const VPIteration &Instance, 2505 VPTransformState &State) { 2506 Value *ScalarInst = State.get(Def, Instance); 2507 Value *VectorValue = State.get(Def, Instance.Part); 2508 VectorValue = Builder.CreateInsertElement( 2509 VectorValue, ScalarInst, 2510 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2511 State.set(Def, VectorValue, Instance.Part); 2512 } 2513 2514 // Return whether we allow using masked interleave-groups (for dealing with 2515 // strided loads/stores that reside in predicated blocks, or for dealing 2516 // with gaps).
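// (A "gap" here is a missing member of an interleave group, e.g. a loop that
// reads A[3*i] and A[3*i+2] but never A[3*i+1]; masking the wide access lets
// such a group be vectorized without touching the unused element.)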
2517 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2518 // If an override option has been passed in for interleaved accesses, use it. 2519 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2520 return EnableMaskedInterleavedMemAccesses; 2521 2522 return TTI.enableMaskedInterleavedAccessVectorization(); 2523 } 2524 2525 // Try to vectorize the interleave group that \p Instr belongs to. 2526 // 2527 // E.g. Translate following interleaved load group (factor = 3): 2528 // for (i = 0; i < N; i+=3) { 2529 // R = Pic[i]; // Member of index 0 2530 // G = Pic[i+1]; // Member of index 1 2531 // B = Pic[i+2]; // Member of index 2 2532 // ... // do something to R, G, B 2533 // } 2534 // To: 2535 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2536 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2537 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2538 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2539 // 2540 // Or translate following interleaved store group (factor = 3): 2541 // for (i = 0; i < N; i+=3) { 2542 // ... do something to R, G, B 2543 // Pic[i] = R; // Member of index 0 2544 // Pic[i+1] = G; // Member of index 1 2545 // Pic[i+2] = B; // Member of index 2 2546 // } 2547 // To: 2548 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2549 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2550 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2551 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2552 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2553 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2554 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2555 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2556 VPValue *BlockInMask) { 2557 Instruction *Instr = Group->getInsertPos(); 2558 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2559 2560 // Prepare for the vector type of the interleaved load/store. 2561 Type *ScalarTy = getLoadStoreType(Instr); 2562 unsigned InterleaveFactor = Group->getFactor(); 2563 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2564 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2565 2566 // Prepare for the new pointers. 2567 SmallVector<Value *, 2> AddrParts; 2568 unsigned Index = Group->getIndex(Instr); 2569 2570 // TODO: extend the masked interleaved-group support to reversed access. 2571 assert((!BlockInMask || !Group->isReverse()) && 2572 "Reversed masked interleave-group not supported."); 2573 2574 // If the group is reverse, adjust the index to refer to the last vector lane 2575 // instead of the first. We adjust the index from the first vector lane, 2576 // rather than directly getting the pointer for lane VF - 1, because the 2577 // pointer operand of the interleaved access is supposed to be uniform. For 2578 // uniform instructions, we're only required to generate a value for the 2579 // first vector lane in each unroll iteration. 2580 if (Group->isReverse()) 2581 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2582 2583 for (unsigned Part = 0; Part < UF; Part++) { 2584 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2585 setDebugLocFromInst(AddrPart); 2586 2587 // Notice current instruction could be any index. Need to adjust the address 2588 // to the member of index 0. 2589 // 2590 // E.g. 
a = A[i+1]; // Member of index 1 (Current instruction) 2591 // b = A[i]; // Member of index 0 2592 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2593 // 2594 // E.g. A[i+1] = a; // Member of index 1 2595 // A[i] = b; // Member of index 0 2596 // A[i+2] = c; // Member of index 2 (Current instruction) 2597 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2598 2599 bool InBounds = false; 2600 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2601 InBounds = gep->isInBounds(); 2602 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2603 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2604 2605 // Cast to the vector pointer type. 2606 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2607 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2608 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2609 } 2610 2611 setDebugLocFromInst(Instr); 2612 Value *PoisonVec = PoisonValue::get(VecTy); 2613 2614 Value *MaskForGaps = nullptr; 2615 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2616 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2617 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2618 } 2619 2620 // Vectorize the interleaved load group. 2621 if (isa<LoadInst>(Instr)) { 2622 // For each unroll part, create a wide load for the group. 2623 SmallVector<Value *, 2> NewLoads; 2624 for (unsigned Part = 0; Part < UF; Part++) { 2625 Instruction *NewLoad; 2626 if (BlockInMask || MaskForGaps) { 2627 assert(useMaskedInterleavedAccesses(*TTI) && 2628 "masked interleaved groups are not allowed."); 2629 Value *GroupMask = MaskForGaps; 2630 if (BlockInMask) { 2631 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2632 Value *ShuffledMask = Builder.CreateShuffleVector( 2633 BlockInMaskPart, 2634 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2635 "interleaved.mask"); 2636 GroupMask = MaskForGaps 2637 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2638 MaskForGaps) 2639 : ShuffledMask; 2640 } 2641 NewLoad = 2642 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2643 GroupMask, PoisonVec, "wide.masked.vec"); 2644 } 2645 else 2646 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2647 Group->getAlign(), "wide.vec"); 2648 Group->addMetadata(NewLoad); 2649 NewLoads.push_back(NewLoad); 2650 } 2651 2652 // For each member in the group, shuffle out the appropriate data from the 2653 // wide loads. 2654 unsigned J = 0; 2655 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2656 Instruction *Member = Group->getMember(I); 2657 2658 // Skip the gaps in the group. 2659 if (!Member) 2660 continue; 2661 2662 auto StrideMask = 2663 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2664 for (unsigned Part = 0; Part < UF; Part++) { 2665 Value *StridedVec = Builder.CreateShuffleVector( 2666 NewLoads[Part], StrideMask, "strided.vec"); 2667 2668 // If this member has different type, cast the result type. 
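        // (For example, a group mixing i32 and float members of equal size:
        // the float member's lanes are obtained by bit-or-pointer casting the
        // i32 strided vector to <VF x float>.)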
2669 if (Member->getType() != ScalarTy) { 2670 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2671 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2672 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2673 } 2674 2675 if (Group->isReverse()) 2676 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2677 2678 State.set(VPDefs[J], StridedVec, Part); 2679 } 2680 ++J; 2681 } 2682 return; 2683 } 2684 2685 // The sub vector type for current instruction. 2686 auto *SubVT = VectorType::get(ScalarTy, VF); 2687 2688 // Vectorize the interleaved store group. 2689 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2690 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2691 "masked interleaved groups are not allowed."); 2692 assert((!MaskForGaps || !VF.isScalable()) && 2693 "masking gaps for scalable vectors is not yet supported."); 2694 for (unsigned Part = 0; Part < UF; Part++) { 2695 // Collect the stored vector from each member. 2696 SmallVector<Value *, 4> StoredVecs; 2697 for (unsigned i = 0; i < InterleaveFactor; i++) { 2698 assert((Group->getMember(i) || MaskForGaps) && 2699 "Fail to get a member from an interleaved store group"); 2700 Instruction *Member = Group->getMember(i); 2701 2702 // Skip the gaps in the group. 2703 if (!Member) { 2704 Value *Undef = PoisonValue::get(SubVT); 2705 StoredVecs.push_back(Undef); 2706 continue; 2707 } 2708 2709 Value *StoredVec = State.get(StoredValues[i], Part); 2710 2711 if (Group->isReverse()) 2712 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2713 2714 // If this member has different type, cast it to a unified type. 2715 2716 if (StoredVec->getType() != SubVT) 2717 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2718 2719 StoredVecs.push_back(StoredVec); 2720 } 2721 2722 // Concatenate all vectors into a wide vector. 2723 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2724 2725 // Interleave the elements in the wide vector. 2726 Value *IVec = Builder.CreateShuffleVector( 2727 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2728 "interleaved.vec"); 2729 2730 Instruction *NewStoreInstr; 2731 if (BlockInMask || MaskForGaps) { 2732 Value *GroupMask = MaskForGaps; 2733 if (BlockInMask) { 2734 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2735 Value *ShuffledMask = Builder.CreateShuffleVector( 2736 BlockInMaskPart, 2737 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2738 "interleaved.mask"); 2739 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2740 ShuffledMask, MaskForGaps) 2741 : ShuffledMask; 2742 } 2743 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2744 Group->getAlign(), GroupMask); 2745 } else 2746 NewStoreInstr = 2747 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2748 2749 Group->addMetadata(NewStoreInstr); 2750 } 2751 } 2752 2753 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2754 VPReplicateRecipe *RepRecipe, 2755 const VPIteration &Instance, 2756 bool IfPredicateInstr, 2757 VPTransformState &State) { 2758 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2759 2760 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2761 // the first lane and part. 2762 if (isa<NoAliasScopeDeclInst>(Instr)) 2763 if (!Instance.isFirstIteration()) 2764 return; 2765 2766 // Does this instruction return a value ? 
2767 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2768 2769 Instruction *Cloned = Instr->clone(); 2770 if (!IsVoidRetTy) 2771 Cloned->setName(Instr->getName() + ".cloned"); 2772 2773 // If the scalarized instruction contributes to the address computation of a 2774 // widen masked load/store which was in a basic block that needed predication 2775 // and is not predicated after vectorization, we can't propagate 2776 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2777 // instruction could feed a poison value to the base address of the widen 2778 // load/store. 2779 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 2780 Cloned->dropPoisonGeneratingFlags(); 2781 2782 if (Instr->getDebugLoc()) 2783 setDebugLocFromInst(Instr); 2784 2785 // Replace the operands of the cloned instructions with their scalar 2786 // equivalents in the new loop. 2787 for (auto &I : enumerate(RepRecipe->operands())) { 2788 auto InputInstance = Instance; 2789 VPValue *Operand = I.value(); 2790 VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand); 2791 if (OperandR && OperandR->isUniform()) 2792 InputInstance.Lane = VPLane::getFirstLane(); 2793 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2794 } 2795 addNewMetadata(Cloned, Instr); 2796 2797 // Place the cloned scalar in the new loop. 2798 State.Builder.Insert(Cloned); 2799 2800 State.set(RepRecipe, Cloned, Instance); 2801 2802 // If we just cloned a new assumption, add it the assumption cache. 2803 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2804 AC->registerAssumption(II); 2805 2806 // End if-block. 2807 if (IfPredicateInstr) 2808 PredicatedInstructions.push_back(Cloned); 2809 } 2810 2811 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) { 2812 if (TripCount) 2813 return TripCount; 2814 2815 assert(InsertBlock); 2816 IRBuilder<> Builder(InsertBlock->getTerminator()); 2817 // Find the loop boundaries. 2818 ScalarEvolution *SE = PSE.getSE(); 2819 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2820 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 2821 "Invalid loop count"); 2822 2823 Type *IdxTy = Legal->getWidestInductionType(); 2824 assert(IdxTy && "No type for induction"); 2825 2826 // The exit count might have the type of i64 while the phi is i32. This can 2827 // happen if we have an induction variable that is sign extended before the 2828 // compare. The only way that we get a backedge taken count is that the 2829 // induction variable was signed and as such will not overflow. In such a case 2830 // truncation is legal. 2831 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2832 IdxTy->getPrimitiveSizeInBits()) 2833 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2834 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2835 2836 // Get the total trip count from the count by adding 1. 2837 const SCEV *ExitCount = SE->getAddExpr( 2838 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2839 2840 const DataLayout &DL = InsertBlock->getModule()->getDataLayout(); 2841 2842 // Expand the trip count and place the new instructions in the preheader. 2843 // Notice that the pre-header does not change, only the loop body. 2844 SCEVExpander Exp(*SE, DL, "induction"); 2845 2846 // Count holds the overall loop count (N). 
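// Illustrative sketch (assumed shape, not the exact expander output): for a
// loop whose backedge-taken count is %btc, the expansion below typically
// materializes something like
//   %trip.count = add i64 %btc, 1
// next to InsertBlock's terminator.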
2847 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2848 InsertBlock->getTerminator()); 2849 2850 if (TripCount->getType()->isPointerTy()) 2851 TripCount = 2852 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2853 InsertBlock->getTerminator()); 2854 2855 return TripCount; 2856 } 2857 2858 Value * 2859 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2860 if (VectorTripCount) 2861 return VectorTripCount; 2862 2863 Value *TC = getOrCreateTripCount(InsertBlock); 2864 IRBuilder<> Builder(InsertBlock->getTerminator()); 2865 2866 Type *Ty = TC->getType(); 2867 // This is where we can make the step a runtime constant. 2868 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2869 2870 // If the tail is to be folded by masking, round the number of iterations N 2871 // up to a multiple of Step instead of rounding down. This is done by first 2872 // adding Step-1 and then rounding down. Note that it's ok if this addition 2873 // overflows: the vector induction variable will eventually wrap to zero given 2874 // that it starts at zero and its Step is a power of two; the loop will then 2875 // exit, with the last early-exit vector comparison also producing all-true. 2876 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2877 // is accounted for in emitIterationCountCheck that adds an overflow check. 2878 if (Cost->foldTailByMasking()) { 2879 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2880 "VF*UF must be a power of 2 when folding tail by masking"); 2881 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 2882 TC = Builder.CreateAdd( 2883 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 2884 } 2885 2886 // Now we need to generate the expression for the part of the loop that the 2887 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2888 // iterations are not required for correctness, or N - Step, otherwise. Step 2889 // is equal to the vectorization factor (number of SIMD elements) times the 2890 // unroll factor (number of SIMD instructions). 2891 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2892 2893 // There are cases where we *must* run at least one iteration in the remainder 2894 // loop. See the cost model for when this can happen. If the step evenly 2895 // divides the trip count, we set the remainder to be equal to the step. If 2896 // the step does not evenly divide the trip count, no adjustment is necessary 2897 // since there will already be scalar iterations. Note that the minimum 2898 // iterations check ensures that N >= Step. 2899 if (Cost->requiresScalarEpilogue(VF)) { 2900 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2901 R = Builder.CreateSelect(IsZero, Step, R); 2902 } 2903 2904 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2905 2906 return VectorTripCount; 2907 } 2908 2909 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2910 const DataLayout &DL) { 2911 // Verify that V is a vector type with same number of elements as DstVTy. 
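// Illustrative example (assuming 64-bit pointers): V of type <4 x double> and
// DstVTy of type <4 x i64*> both have 4 elements of 64 bits each, so the
// two-step cast below (<4 x double> -> <4 x i64> -> <4 x i64*>) preserves the
// overall bit pattern.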
2912 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 2913 unsigned VF = DstFVTy->getNumElements(); 2914 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 2915 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2916 Type *SrcElemTy = SrcVecTy->getElementType(); 2917 Type *DstElemTy = DstFVTy->getElementType(); 2918 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2919 "Vector elements must have same size"); 2920 2921 // Do a direct cast if element types are castable. 2922 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2923 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2924 } 2925 // V cannot be directly casted to desired vector type. 2926 // May happen when V is a floating point vector but DstVTy is a vector of 2927 // pointers or vice-versa. Handle this using a two-step bitcast using an 2928 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2929 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2930 "Only one type should be a pointer type"); 2931 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2932 "Only one type should be a floating point type"); 2933 Type *IntTy = 2934 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2935 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2936 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2937 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2938 } 2939 2940 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2941 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 2942 // Reuse existing vector loop preheader for TC checks. 2943 // Note that new preheader block is generated for vector loop. 2944 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2945 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2946 2947 // Generate code to check if the loop's trip count is less than VF * UF, or 2948 // equal to it in case a scalar epilogue is required; this implies that the 2949 // vector trip count is zero. This check also covers the case where adding one 2950 // to the backedge-taken count overflowed leading to an incorrect trip count 2951 // of zero. In this case we will also jump to the scalar loop. 2952 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 2953 : ICmpInst::ICMP_ULT; 2954 2955 // If tail is to be folded, vector loop takes care of all iterations. 2956 Type *CountTy = Count->getType(); 2957 Value *CheckMinIters = Builder.getFalse(); 2958 Value *Step = createStepForVF(Builder, CountTy, VF, UF); 2959 if (!Cost->foldTailByMasking()) 2960 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 2961 else if (VF.isScalable()) { 2962 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2963 // an overflow to zero when updating induction variables and so an 2964 // additional overflow check is required before entering the vector loop. 2965 2966 // Get the maximum unsigned value for the type. 2967 Value *MaxUIntTripCount = 2968 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 2969 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 2970 2971 // Don't execute the vector loop if (UMax - n) < (VF * UF). 2972 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step); 2973 } 2974 // Create new preheader for vector loop. 
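// After the split and the branch rewrite at the end of this function, the old
// preheader conceptually ends with (illustrative names):
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
// i.e. too-short trip counts bypass the vector loop entirely.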
2975 LoopVectorPreHeader = 2976 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2977 "vector.ph"); 2978 2979 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2980 DT->getNode(Bypass)->getIDom()) && 2981 "TC check is expected to dominate Bypass"); 2982 2983 // Update dominator for Bypass & LoopExit (if needed). 2984 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2985 if (!Cost->requiresScalarEpilogue(VF)) 2986 // If there is an epilogue which must run, there's no edge from the 2987 // middle block to exit blocks and thus no need to update the immediate 2988 // dominator of the exit blocks. 2989 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2990 2991 ReplaceInstWithInst( 2992 TCCheckBlock->getTerminator(), 2993 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2994 LoopBypassBlocks.push_back(TCCheckBlock); 2995 } 2996 2997 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 2998 2999 BasicBlock *const SCEVCheckBlock = 3000 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); 3001 if (!SCEVCheckBlock) 3002 return nullptr; 3003 3004 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3005 (OptForSizeBasedOnProfile && 3006 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3007 "Cannot SCEV check stride or overflow when optimizing for size"); 3008 3009 3010 // Update dominator only if this is first RT check. 3011 if (LoopBypassBlocks.empty()) { 3012 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3013 if (!Cost->requiresScalarEpilogue(VF)) 3014 // If there is an epilogue which must run, there's no edge from the 3015 // middle block to exit blocks and thus no need to update the immediate 3016 // dominator of the exit blocks. 3017 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3018 } 3019 3020 LoopBypassBlocks.push_back(SCEVCheckBlock); 3021 AddedSafetyChecks = true; 3022 return SCEVCheckBlock; 3023 } 3024 3025 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { 3026 // VPlan-native path does not do any analysis for runtime checks currently. 3027 if (EnableVPlanNativePath) 3028 return nullptr; 3029 3030 BasicBlock *const MemCheckBlock = 3031 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); 3032 3033 // Check if we generated code that checks in runtime if arrays overlap. We put 3034 // the checks into a separate block to make the more common case of few 3035 // elements faster. 3036 if (!MemCheckBlock) 3037 return nullptr; 3038 3039 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3040 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3041 "Cannot emit memory checks when optimizing for size, unless forced " 3042 "to vectorize."); 3043 ORE->emit([&]() { 3044 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3045 OrigLoop->getStartLoc(), 3046 OrigLoop->getHeader()) 3047 << "Code-size may be reduced by not forcing " 3048 "vectorization, or by source-code modifications " 3049 "eliminating the need for runtime checks " 3050 "(e.g., adding 'restrict')."; 3051 }); 3052 } 3053 3054 LoopBypassBlocks.push_back(MemCheckBlock); 3055 3056 AddedSafetyChecks = true; 3057 3058 // Only use noalias metadata when using memory checks guaranteeing no overlap 3059 // across all iterations. 
3060   if (!Legal->getLAI()->getRuntimePointerChecking()->getDiffChecks()) {
3061     // We currently don't use LoopVersioning for the actual loop cloning but we
3062     // still use it to add the noalias metadata.
3063     LVer = std::make_unique<LoopVersioning>(
3064         *Legal->getLAI(),
3065         Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3066         DT, PSE.getSE());
3067     LVer->prepareNoAliasMetadata();
3068   }
3069   return MemCheckBlock;
3070 }
3071
3072 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3073   LoopScalarBody = OrigLoop->getHeader();
3074   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3075   assert(LoopVectorPreHeader && "Invalid loop structure");
3076   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3077   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3078          "multiple exit loop without required epilogue?");
3079
3080   LoopMiddleBlock =
3081       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3082                  LI, nullptr, Twine(Prefix) + "middle.block");
3083   LoopScalarPreHeader =
3084       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3085                  nullptr, Twine(Prefix) + "scalar.ph");
3086
3087   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3088
3089   // Set up the middle block terminator. Two cases:
3090   // 1) If we know that we must execute the scalar epilogue, emit an
3091   //    unconditional branch.
3092   // 2) Otherwise, we must have a single unique exit block (due to how we
3093   //    implement the multiple exit case). In this case, set up a conditional
3094   //    branch from the middle block to the loop scalar preheader, and the
3095   //    exit block. completeLoopSkeleton will update the condition to use an
3096   //    iteration check, if required to decide whether to execute the remainder.
3097   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3098     BranchInst::Create(LoopScalarPreHeader) :
3099     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3100                        Builder.getTrue());
3101   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3102   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3103
3104   // Update dominator for loop exit. During skeleton creation, only the vector
3105   // pre-header and the middle block are created. The vector loop is entirely
3106   // created during VPlan execution.
3107   if (!Cost->requiresScalarEpilogue(VF))
3108     // If there is an epilogue which must run, there's no edge from the
3109     // middle block to exit blocks and thus no need to update the immediate
3110     // dominator of the exit blocks.
3111     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3112 }
3113
3114 void InnerLoopVectorizer::createInductionResumeValues(
3115     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3116   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3117           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3118          "Inconsistent information about additional bypass.");
3119
3120   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3121   assert(VectorTripCount && "Expected valid arguments");
3122   // We are going to resume the execution of the scalar loop.
3123   // Go over all of the induction variables that we found and fix the
3124   // PHIs that are left in the scalar version of the loop.
3125   // The starting values of PHI nodes depend on the counter of the last
3126   // iteration in the vectorized loop.
3127   // If we come from a bypass edge then we need to start from the original
3128   // start value.
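// Illustrative example: for a canonical i64 induction starting at 0, the
// resume phi created below ends up looking roughly like
//   %bc.resume.val = phi i64 [ %n.vec, %middle.block ],
//                            [ 0, %vector.memcheck ], [ 0, %min.iters.check ]
// with one incoming value per bypass block.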
3129 Instruction *OldInduction = Legal->getPrimaryInduction(); 3130 for (auto &InductionEntry : Legal->getInductionVars()) { 3131 PHINode *OrigPhi = InductionEntry.first; 3132 InductionDescriptor II = InductionEntry.second; 3133 3134 Value *&EndValue = IVEndValues[OrigPhi]; 3135 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3136 if (OrigPhi == OldInduction) { 3137 // We know what the end value is. 3138 EndValue = VectorTripCount; 3139 } else { 3140 IRBuilder<> B(LoopVectorPreHeader->getTerminator()); 3141 3142 // Fast-math-flags propagate from the original induction instruction. 3143 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3144 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3145 3146 Type *StepType = II.getStep()->getType(); 3147 Instruction::CastOps CastOp = 3148 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3149 Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc"); 3150 Value *Step = 3151 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3152 EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); 3153 EndValue->setName("ind.end"); 3154 3155 // Compute the end value for the additional bypass (if applicable). 3156 if (AdditionalBypass.first) { 3157 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3158 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3159 StepType, true); 3160 Value *Step = 3161 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3162 VTC = 3163 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc"); 3164 EndValueFromAdditionalBypass = 3165 emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); 3166 EndValueFromAdditionalBypass->setName("ind.end"); 3167 } 3168 } 3169 3170 // Create phi nodes to merge from the backedge-taken check block. 3171 PHINode *BCResumeVal = 3172 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3173 LoopScalarPreHeader->getTerminator()); 3174 // Copy original phi DL over to the new one. 3175 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3176 3177 // The new PHI merges the original incoming value, in case of a bypass, 3178 // or the value at the end of the vectorized loop. 3179 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3180 3181 // Fix the scalar body counter (PHI node). 3182 // The old induction's phi node in the scalar body needs the truncated 3183 // value. 3184 for (BasicBlock *BB : LoopBypassBlocks) 3185 BCResumeVal->addIncoming(II.getStartValue(), BB); 3186 3187 if (AdditionalBypass.first) 3188 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3189 EndValueFromAdditionalBypass); 3190 3191 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3192 } 3193 } 3194 3195 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) { 3196 // The trip counts should be cached by now. 3197 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 3198 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3199 3200 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3201 3202 // Add a check in the middle block to see if we have completed 3203 // all of the iterations in the first vector loop. Three cases: 3204 // 1) If we require a scalar epilogue, there is no conditional branch as 3205 // we unconditionally branch to the scalar preheader. Do nothing. 3206 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 
3207 // Thus if tail is to be folded, we know we don't need to run the 3208 // remainder and we can use the previous value for the condition (true). 3209 // 3) Otherwise, construct a runtime check. 3210 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3211 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3212 Count, VectorTripCount, "cmp.n", 3213 LoopMiddleBlock->getTerminator()); 3214 3215 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3216 // of the corresponding compare because they may have ended up with 3217 // different line numbers and we want to avoid awkward line stepping while 3218 // debugging. Eg. if the compare has got a line number inside the loop. 3219 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3220 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3221 } 3222 3223 #ifdef EXPENSIVE_CHECKS 3224 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3225 #endif 3226 3227 return LoopVectorPreHeader; 3228 } 3229 3230 std::pair<BasicBlock *, Value *> 3231 InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3232 /* 3233 In this function we generate a new loop. The new loop will contain 3234 the vectorized instructions while the old loop will continue to run the 3235 scalar remainder. 3236 3237 [ ] <-- loop iteration number check. 3238 / | 3239 / v 3240 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3241 | / | 3242 | / v 3243 || [ ] <-- vector pre header. 3244 |/ | 3245 | v 3246 | [ ] \ 3247 | [ ]_| <-- vector loop (created during VPlan execution). 3248 | | 3249 | v 3250 \ -[ ] <--- middle-block. 3251 \/ | 3252 /\ v 3253 | ->[ ] <--- new preheader. 3254 | | 3255 (opt) v <-- edge from middle to exit iff epilogue is not required. 3256 | [ ] \ 3257 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3258 \ | 3259 \ v 3260 >[ ] <-- exit block(s). 3261 ... 3262 */ 3263 3264 // Get the metadata of the original loop before it gets modified. 3265 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3266 3267 // Workaround! Compute the trip count of the original loop and cache it 3268 // before we start modifying the CFG. This code has a systemic problem 3269 // wherein it tries to run analysis over partially constructed IR; this is 3270 // wrong, and not simply for SCEV. The trip count of the original loop 3271 // simply happens to be prone to hitting this in practice. In theory, we 3272 // can hit the same issue for any SCEV, or ValueTracking query done during 3273 // mutation. See PR49900. 3274 getOrCreateTripCount(OrigLoop->getLoopPreheader()); 3275 3276 // Create an empty vector loop, and prepare basic blocks for the runtime 3277 // checks. 3278 createVectorLoopSkeleton(""); 3279 3280 // Now, compare the new count to zero. If it is zero skip the vector loop and 3281 // jump to the scalar loop. This check also covers the case where the 3282 // backedge-taken count is uint##_max: adding one to it will overflow leading 3283 // to an incorrect trip count of zero. In this (rare) case we will also jump 3284 // to the scalar loop. 3285 emitIterationCountCheck(LoopScalarPreHeader); 3286 3287 // Generate the code to check any assumptions that we've made for SCEV 3288 // expressions. 3289 emitSCEVChecks(LoopScalarPreHeader); 3290 3291 // Generate the code that checks in runtime if arrays overlap. We put the 3292 // checks into a separate block to make the more common case of few elements 3293 // faster. 
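// Conceptually (illustrative form), for two arrays A and B accessed over N
// iterations the emitted check looks like
//   (&A[N] <= &B[0]) || (&B[N] <= &A[0])
// so the vector loop only runs when the accessed regions cannot overlap.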
3294 emitMemRuntimeChecks(LoopScalarPreHeader); 3295 3296 // Emit phis for the new starting index of the scalar loop. 3297 createInductionResumeValues(); 3298 3299 return {completeLoopSkeleton(OrigLoopID), nullptr}; 3300 } 3301 3302 // Fix up external users of the induction variable. At this point, we are 3303 // in LCSSA form, with all external PHIs that use the IV having one input value, 3304 // coming from the remainder loop. We need those PHIs to also have a correct 3305 // value for the IV when arriving directly from the middle block. 3306 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3307 const InductionDescriptor &II, 3308 Value *VectorTripCount, Value *EndValue, 3309 BasicBlock *MiddleBlock, 3310 BasicBlock *VectorHeader, VPlan &Plan) { 3311 // There are two kinds of external IV usages - those that use the value 3312 // computed in the last iteration (the PHI) and those that use the penultimate 3313 // value (the value that feeds into the phi from the loop latch). 3314 // We allow both, but they, obviously, have different values. 3315 3316 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3317 3318 DenseMap<Value *, Value *> MissingVals; 3319 3320 // An external user of the last iteration's value should see the value that 3321 // the remainder loop uses to initialize its own IV. 3322 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3323 for (User *U : PostInc->users()) { 3324 Instruction *UI = cast<Instruction>(U); 3325 if (!OrigLoop->contains(UI)) { 3326 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3327 MissingVals[UI] = EndValue; 3328 } 3329 } 3330 3331 // An external user of the penultimate value need to see EndValue - Step. 3332 // The simplest way to get this is to recompute it from the constituent SCEVs, 3333 // that is Start + (Step * (CRD - 1)). 3334 for (User *U : OrigPhi->users()) { 3335 auto *UI = cast<Instruction>(U); 3336 if (!OrigLoop->contains(UI)) { 3337 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3338 3339 IRBuilder<> B(MiddleBlock->getTerminator()); 3340 3341 // Fast-math-flags propagate from the original induction instruction. 3342 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3343 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3344 3345 Value *CountMinusOne = B.CreateSub( 3346 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); 3347 Value *CMO = 3348 !II.getStep()->getType()->isIntegerTy() 3349 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3350 II.getStep()->getType()) 3351 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3352 CMO->setName("cast.cmo"); 3353 3354 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 3355 VectorHeader->getTerminator()); 3356 Value *Escape = 3357 emitTransformedIndex(B, CMO, II.getStartValue(), Step, II); 3358 Escape->setName("ind.escape"); 3359 MissingVals[UI] = Escape; 3360 } 3361 } 3362 3363 for (auto &I : MissingVals) { 3364 PHINode *PHI = cast<PHINode>(I.first); 3365 // One corner case we have to handle is two IVs "chasing" each-other, 3366 // that is %IV2 = phi [...], [ %IV1, %latch ] 3367 // In this case, if IV1 has an external use, we need to avoid adding both 3368 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3369 // don't already have an incoming value for the middle block. 
3370 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) { 3371 PHI->addIncoming(I.second, MiddleBlock); 3372 Plan.removeLiveOut(PHI); 3373 } 3374 } 3375 } 3376 3377 namespace { 3378 3379 struct CSEDenseMapInfo { 3380 static bool canHandle(const Instruction *I) { 3381 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3382 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3383 } 3384 3385 static inline Instruction *getEmptyKey() { 3386 return DenseMapInfo<Instruction *>::getEmptyKey(); 3387 } 3388 3389 static inline Instruction *getTombstoneKey() { 3390 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3391 } 3392 3393 static unsigned getHashValue(const Instruction *I) { 3394 assert(canHandle(I) && "Unknown instruction!"); 3395 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3396 I->value_op_end())); 3397 } 3398 3399 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3400 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3401 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3402 return LHS == RHS; 3403 return LHS->isIdenticalTo(RHS); 3404 } 3405 }; 3406 3407 } // end anonymous namespace 3408 3409 ///Perform cse of induction variable instructions. 3410 static void cse(BasicBlock *BB) { 3411 // Perform simple cse. 3412 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3413 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3414 if (!CSEDenseMapInfo::canHandle(&In)) 3415 continue; 3416 3417 // Check if we can replace this instruction with any of the 3418 // visited instructions. 3419 if (Instruction *V = CSEMap.lookup(&In)) { 3420 In.replaceAllUsesWith(V); 3421 In.eraseFromParent(); 3422 continue; 3423 } 3424 3425 CSEMap[&In] = &In; 3426 } 3427 } 3428 3429 InstructionCost 3430 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3431 bool &NeedToScalarize) const { 3432 Function *F = CI->getCalledFunction(); 3433 Type *ScalarRetTy = CI->getType(); 3434 SmallVector<Type *, 4> Tys, ScalarTys; 3435 for (auto &ArgOp : CI->args()) 3436 ScalarTys.push_back(ArgOp->getType()); 3437 3438 // Estimate cost of scalarized vector call. The source operands are assumed 3439 // to be vectors, so we need to extract individual elements from there, 3440 // execute VF scalar calls, and then gather the result into the vector return 3441 // value. 3442 InstructionCost ScalarCallCost = 3443 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3444 if (VF.isScalar()) 3445 return ScalarCallCost; 3446 3447 // Compute corresponding vector type for return value and arguments. 3448 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3449 for (Type *ScalarTy : ScalarTys) 3450 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3451 3452 // Compute costs of unpacking argument values for the scalar calls and 3453 // packing the return values to a vector. 3454 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3455 3456 InstructionCost Cost = 3457 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3458 3459 // If we can't emit a vector call for this function, then the currently found 3460 // cost is the cost we need to return. 3461 NeedToScalarize = true; 3462 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3463 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3464 3465 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3466 return Cost; 3467 3468 // If the corresponding vector cost is cheaper, return its cost. 
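// Illustrative numbers: with VF = 4, a scalar call cost of 10 and a
// scalarization overhead of 8, the scalarized cost computed above is
// 4 * 10 + 8 = 48; a vector library call costing, say, 20 would then win the
// comparison below.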
3469 InstructionCost VectorCallCost = 3470 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3471 if (VectorCallCost < Cost) { 3472 NeedToScalarize = false; 3473 Cost = VectorCallCost; 3474 } 3475 return Cost; 3476 } 3477 3478 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3479 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3480 return Elt; 3481 return VectorType::get(Elt, VF); 3482 } 3483 3484 InstructionCost 3485 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3486 ElementCount VF) const { 3487 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3488 assert(ID && "Expected intrinsic call!"); 3489 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3490 FastMathFlags FMF; 3491 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3492 FMF = FPMO->getFastMathFlags(); 3493 3494 SmallVector<const Value *> Arguments(CI->args()); 3495 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3496 SmallVector<Type *> ParamTys; 3497 std::transform(FTy->param_begin(), FTy->param_end(), 3498 std::back_inserter(ParamTys), 3499 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3500 3501 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3502 dyn_cast<IntrinsicInst>(CI)); 3503 return TTI.getIntrinsicInstrCost(CostAttrs, 3504 TargetTransformInfo::TCK_RecipThroughput); 3505 } 3506 3507 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3508 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3509 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3510 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3511 } 3512 3513 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3514 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3515 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3516 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3517 } 3518 3519 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3520 // For every instruction `I` in MinBWs, truncate the operands, create a 3521 // truncated version of `I` and reextend its result. InstCombine runs 3522 // later and will remove any ext/trunc pairs. 3523 SmallPtrSet<Value *, 4> Erased; 3524 for (const auto &KV : Cost->getMinimalBitwidths()) { 3525 // If the value wasn't vectorized, we must maintain the original scalar 3526 // type. The absence of the value from State indicates that it 3527 // wasn't vectorized. 3528 // FIXME: Should not rely on getVPValue at this point. 3529 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3530 if (!State.hasAnyVectorValue(Def)) 3531 continue; 3532 for (unsigned Part = 0; Part < UF; ++Part) { 3533 Value *I = State.get(Def, Part); 3534 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3535 continue; 3536 Type *OriginalTy = I->getType(); 3537 Type *ScalarTruncatedTy = 3538 IntegerType::get(OriginalTy->getContext(), KV.second); 3539 auto *TruncatedTy = VectorType::get( 3540 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3541 if (TruncatedTy == OriginalTy) 3542 continue; 3543 3544 IRBuilder<> B(cast<Instruction>(I)); 3545 auto ShrinkOperand = [&](Value *V) -> Value * { 3546 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3547 if (ZI->getSrcTy() == TruncatedTy) 3548 return ZI->getOperand(0); 3549 return B.CreateZExtOrTrunc(V, TruncatedTy); 3550 }; 3551 3552 // The actual instruction modification depends on the instruction type, 3553 // unfortunately. 
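// Illustrative sketch (VF = 4, minimal bit width 8): an i32 add whose result
// only needs 8 bits is rewritten roughly as
//   %a.tr = trunc <4 x i32> %a to <4 x i8>
//   %b.tr = trunc <4 x i32> %b to <4 x i8>
//   %add  = add <4 x i8> %a.tr, %b.tr
//   %res  = zext <4 x i8> %add to <4 x i32>
// and InstCombine later removes redundant trunc/zext pairs.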
3554 Value *NewI = nullptr; 3555 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3556 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3557 ShrinkOperand(BO->getOperand(1))); 3558 3559 // Any wrapping introduced by shrinking this operation shouldn't be 3560 // considered undefined behavior. So, we can't unconditionally copy 3561 // arithmetic wrapping flags to NewI. 3562 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3563 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3564 NewI = 3565 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3566 ShrinkOperand(CI->getOperand(1))); 3567 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3568 NewI = B.CreateSelect(SI->getCondition(), 3569 ShrinkOperand(SI->getTrueValue()), 3570 ShrinkOperand(SI->getFalseValue())); 3571 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3572 switch (CI->getOpcode()) { 3573 default: 3574 llvm_unreachable("Unhandled cast!"); 3575 case Instruction::Trunc: 3576 NewI = ShrinkOperand(CI->getOperand(0)); 3577 break; 3578 case Instruction::SExt: 3579 NewI = B.CreateSExtOrTrunc( 3580 CI->getOperand(0), 3581 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3582 break; 3583 case Instruction::ZExt: 3584 NewI = B.CreateZExtOrTrunc( 3585 CI->getOperand(0), 3586 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3587 break; 3588 } 3589 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3590 auto Elements0 = 3591 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3592 auto *O0 = B.CreateZExtOrTrunc( 3593 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3594 auto Elements1 = 3595 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3596 auto *O1 = B.CreateZExtOrTrunc( 3597 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3598 3599 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3600 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3601 // Don't do anything with the operands, just extend the result. 3602 continue; 3603 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3604 auto Elements = 3605 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3606 auto *O0 = B.CreateZExtOrTrunc( 3607 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3608 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3609 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3610 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3611 auto Elements = 3612 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3613 auto *O0 = B.CreateZExtOrTrunc( 3614 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3615 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3616 } else { 3617 // If we don't know what to do, be conservative and don't do anything. 3618 continue; 3619 } 3620 3621 // Lastly, extend the result. 3622 NewI->takeName(cast<Instruction>(I)); 3623 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3624 I->replaceAllUsesWith(Res); 3625 cast<Instruction>(I)->eraseFromParent(); 3626 Erased.insert(I); 3627 State.reset(Def, Res, Part); 3628 } 3629 } 3630 3631 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3632 for (const auto &KV : Cost->getMinimalBitwidths()) { 3633 // If the value wasn't vectorized, we must maintain the original scalar 3634 // type. The absence of the value from State indicates that it 3635 // wasn't vectorized. 3636 // FIXME: Should not rely on getVPValue at this point. 
3637 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3638 if (!State.hasAnyVectorValue(Def)) 3639 continue; 3640 for (unsigned Part = 0; Part < UF; ++Part) { 3641 Value *I = State.get(Def, Part); 3642 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3643 if (Inst && Inst->use_empty()) { 3644 Value *NewI = Inst->getOperand(0); 3645 Inst->eraseFromParent(); 3646 State.reset(Def, NewI, Part); 3647 } 3648 } 3649 } 3650 } 3651 3652 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, 3653 VPlan &Plan) { 3654 // Insert truncates and extends for any truncated instructions as hints to 3655 // InstCombine. 3656 if (VF.isVector()) 3657 truncateToMinimalBitwidths(State); 3658 3659 // Fix widened non-induction PHIs by setting up the PHI operands. 3660 if (EnableVPlanNativePath) 3661 fixNonInductionPHIs(Plan, State); 3662 3663 // At this point every instruction in the original loop is widened to a 3664 // vector form. Now we need to fix the recurrences in the loop. These PHI 3665 // nodes are currently empty because we did not want to introduce cycles. 3666 // This is the second stage of vectorizing recurrences. 3667 fixCrossIterationPHIs(State); 3668 3669 // Forget the original basic block. 3670 PSE.getSE()->forgetLoop(OrigLoop); 3671 3672 VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); 3673 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); 3674 if (Cost->requiresScalarEpilogue(VF)) { 3675 // No edge from the middle block to the unique exit block has been inserted 3676 // and there is nothing to fix from vector loop; phis should have incoming 3677 // from scalar loop only. 3678 Plan.clearLiveOuts(); 3679 } else { 3680 // If we inserted an edge from the middle block to the unique exit block, 3681 // update uses outside the loop (phis) to account for the newly inserted 3682 // edge. 3683 3684 // Fix-up external users of the induction variables. 3685 for (auto &Entry : Legal->getInductionVars()) 3686 fixupIVUsers(Entry.first, Entry.second, 3687 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), 3688 IVEndValues[Entry.first], LoopMiddleBlock, 3689 VectorLoop->getHeader(), Plan); 3690 } 3691 3692 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated 3693 // in the exit block, so update the builder. 3694 State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI()); 3695 for (auto &KV : Plan.getLiveOuts()) 3696 KV.second->fixPhi(Plan, State); 3697 3698 for (Instruction *PI : PredicatedInstructions) 3699 sinkScalarOperands(&*PI); 3700 3701 // Remove redundant induction instructions. 3702 cse(VectorLoop->getHeader()); 3703 3704 // Set/update profile weights for the vector and remainder loops as original 3705 // loop iterations are now distributed among them. Note that original loop 3706 // represented by LoopScalarBody becomes remainder loop after vectorization. 3707 // 3708 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3709 // end up getting slightly roughened result but that should be OK since 3710 // profile is not inherently precise anyway. Note also possible bypass of 3711 // vector code caused by legality checks is ignored, assigning all the weight 3712 // to the vector loop, optimistically. 3713 // 3714 // For scalable vectorization we can't know at compile time how many iterations 3715 // of the loop are handled in one vector iteration, so instead assume a pessimistic 3716 // vscale of '1'. 
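// Illustrative example: an original loop with 1003 iterations vectorized with
// VF = 4 and UF = 2 gets roughly 125 iterations attributed to the vector loop
// and 3 to the scalar remainder when the weights are rederived below.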
3717 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop, 3718 LI->getLoopFor(LoopScalarBody), 3719 VF.getKnownMinValue() * UF); 3720 } 3721 3722 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 3723 // In order to support recurrences we need to be able to vectorize Phi nodes. 3724 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3725 // stage #2: We now need to fix the recurrences by adding incoming edges to 3726 // the currently empty PHI nodes. At this point every instruction in the 3727 // original loop is widened to a vector form so we can use them to construct 3728 // the incoming edges. 3729 VPBasicBlock *Header = 3730 State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); 3731 for (VPRecipeBase &R : Header->phis()) { 3732 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 3733 fixReduction(ReductionPhi, State); 3734 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 3735 fixFirstOrderRecurrence(FOR, State); 3736 } 3737 } 3738 3739 void InnerLoopVectorizer::fixFirstOrderRecurrence( 3740 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { 3741 // This is the second phase of vectorizing first-order recurrences. An 3742 // overview of the transformation is described below. Suppose we have the 3743 // following loop. 3744 // 3745 // for (int i = 0; i < n; ++i) 3746 // b[i] = a[i] - a[i - 1]; 3747 // 3748 // There is a first-order recurrence on "a". For this loop, the shorthand 3749 // scalar IR looks like: 3750 // 3751 // scalar.ph: 3752 // s_init = a[-1] 3753 // br scalar.body 3754 // 3755 // scalar.body: 3756 // i = phi [0, scalar.ph], [i+1, scalar.body] 3757 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3758 // s2 = a[i] 3759 // b[i] = s2 - s1 3760 // br cond, scalar.body, ... 3761 // 3762 // In this example, s1 is a recurrence because it's value depends on the 3763 // previous iteration. In the first phase of vectorization, we created a 3764 // vector phi v1 for s1. We now complete the vectorization and produce the 3765 // shorthand vector IR shown below (for VF = 4, UF = 1). 3766 // 3767 // vector.ph: 3768 // v_init = vector(..., ..., ..., a[-1]) 3769 // br vector.body 3770 // 3771 // vector.body 3772 // i = phi [0, vector.ph], [i+4, vector.body] 3773 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3774 // v2 = a[i, i+1, i+2, i+3]; 3775 // v3 = vector(v1(3), v2(0, 1, 2)) 3776 // b[i, i+1, i+2, i+3] = v2 - v3 3777 // br cond, vector.body, middle.block 3778 // 3779 // middle.block: 3780 // x = v2(3) 3781 // br scalar.ph 3782 // 3783 // scalar.ph: 3784 // s_init = phi [x, middle.block], [a[-1], otherwise] 3785 // br scalar.body 3786 // 3787 // After execution completes the vector loop, we extract the next value of 3788 // the recurrence (x) to use as the initial value in the scalar loop. 3789 3790 // Extract the last vector element in the middle block. This will be the 3791 // initial value for the recurrence when jumping to the scalar loop. 
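// Illustrative example (VF = 4, an i32 recurrence): the middle block gets
//   %vector.recur.extract = extractelement <4 x i32> %incoming, i32 3
// where %incoming is the last unrolled part of the recurrence's backedge value.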
3792   VPValue *PreviousDef = PhiR->getBackedgeValue();
3793   Value *Incoming = State.get(PreviousDef, UF - 1);
3794   auto *ExtractForScalar = Incoming;
3795   auto *IdxTy = Builder.getInt32Ty();
3796   if (VF.isVector()) {
3797     auto *One = ConstantInt::get(IdxTy, 1);
3798     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3799     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3800     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3801     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3802                                                     "vector.recur.extract");
3803   }
3804   // Extract the second last element in the middle block if the
3805   // Phi is used outside the loop. We need to extract the phi itself
3806   // and not the last element (the phi update in the current iteration). This
3807   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3808   // when the scalar loop is not run at all.
3809   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3810   if (VF.isVector()) {
3811     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3812     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3813     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3814         Incoming, Idx, "vector.recur.extract.for.phi");
3815   } else if (UF > 1)
3816     // When the loop is unrolled without vectorizing, initialize
3817     // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
3818     // value of `Incoming`. This is analogous to the vectorized case above:
3819     // extracting the second last element when VF > 1.
3820     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3821
3822   // Fix the initial value of the original recurrence in the scalar loop.
3823   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3824   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3825   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3826   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3827   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3828     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3829     Start->addIncoming(Incoming, BB);
3830   }
3831
3832   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3833   Phi->setName("scalar.recur");
3834
3835   // Finally, fix users of the recurrence outside the loop. The users will need
3836   // either the last value of the scalar recurrence or the last value of the
3837   // vector recurrence we extracted in the middle block. Since the loop is in
3838   // LCSSA form, we just need to find all the phi nodes for the original scalar
3839   // recurrence in the exit block, and then add an edge for the middle block.
3840   // Note that LCSSA does not imply single entry when the original scalar loop
3841   // had multiple exiting edges (as we always run the last iteration in the
3842   // scalar epilogue); in that case, there is no edge from middle to exit and
3843   // thus no phis need updating.
3844   if (!Cost->requiresScalarEpilogue(VF))
3845     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3846       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3847         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3848         State.Plan->removeLiveOut(&LCSSAPhi);
3849       }
3850 }
3851
3852 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3853                                        VPTransformState &State) {
3854   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
3855   // Get its reduction variable descriptor.
3856 assert(Legal->isReductionVariable(OrigPhi) && 3857 "Unable to find the reduction variable"); 3858 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 3859 3860 RecurKind RK = RdxDesc.getRecurrenceKind(); 3861 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3862 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3863 setDebugLocFromInst(ReductionStartValue); 3864 3865 VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); 3866 // This is the vector-clone of the value that leaves the loop. 3867 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 3868 3869 // Wrap flags are in general invalid after vectorization, clear them. 3870 clearReductionWrapFlags(PhiR, State); 3871 3872 // Before each round, move the insertion point right between 3873 // the PHIs and the values we are going to write. 3874 // This allows us to write both PHINodes and the extractelement 3875 // instructions. 3876 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3877 3878 setDebugLocFromInst(LoopExitInst); 3879 3880 Type *PhiTy = OrigPhi->getType(); 3881 3882 VPBasicBlock *LatchVPBB = 3883 PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock(); 3884 BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB]; 3885 // If tail is folded by masking, the vector value to leave the loop should be 3886 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3887 // instead of the former. For an inloop reduction the reduction will already 3888 // be predicated, and does not need to be handled here. 3889 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 3890 for (unsigned Part = 0; Part < UF; ++Part) { 3891 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 3892 SelectInst *Sel = nullptr; 3893 for (User *U : VecLoopExitInst->users()) { 3894 if (isa<SelectInst>(U)) { 3895 assert(!Sel && "Reduction exit feeding two selects"); 3896 Sel = cast<SelectInst>(U); 3897 } else 3898 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3899 } 3900 assert(Sel && "Reduction exit feeds no select"); 3901 State.reset(LoopExitInstDef, Sel, Part); 3902 3903 if (isa<FPMathOperator>(Sel)) 3904 Sel->setFastMathFlags(RdxDesc.getFastMathFlags()); 3905 3906 // If the target can create a predicated operator for the reduction at no 3907 // extra cost in the loop (for example a predicated vadd), it can be 3908 // cheaper for the select to remain in the loop than be sunk out of it, 3909 // and so use the select value for the phi instead of the old 3910 // LoopExitValue. 3911 if (PreferPredicatedReductionSelect || 3912 TTI->preferPredicatedReductionSelect( 3913 RdxDesc.getOpcode(), PhiTy, 3914 TargetTransformInfo::ReductionFlags())) { 3915 auto *VecRdxPhi = 3916 cast<PHINode>(State.get(PhiR, Part)); 3917 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel); 3918 } 3919 } 3920 } 3921 3922 // If the vector reduction can be performed in a smaller type, we truncate 3923 // then extend the loop exit value to enable InstCombine to evaluate the 3924 // entire expression in the smaller type. 
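// Illustrative sketch (VF = 4, an i32 add reduction whose values fit in i8):
//   %rdx.trunc = trunc <4 x i32> %rdx to <4 x i8>
//   %rdx.ext   = sext <4 x i8> %rdx.trunc to <4 x i32>   ; zext if unsigned
// The extended value replaces the original users so the reduction chain can
// later be evaluated in the narrow type.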
3925 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 3926 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 3927 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3928 Builder.SetInsertPoint(VectorLoopLatch->getTerminator()); 3929 VectorParts RdxParts(UF); 3930 for (unsigned Part = 0; Part < UF; ++Part) { 3931 RdxParts[Part] = State.get(LoopExitInstDef, Part); 3932 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3933 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3934 : Builder.CreateZExt(Trunc, VecTy); 3935 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 3936 if (U != Trunc) { 3937 U->replaceUsesOfWith(RdxParts[Part], Extnd); 3938 RdxParts[Part] = Extnd; 3939 } 3940 } 3941 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3942 for (unsigned Part = 0; Part < UF; ++Part) { 3943 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3944 State.reset(LoopExitInstDef, RdxParts[Part], Part); 3945 } 3946 } 3947 3948 // Reduce all of the unrolled parts into a single vector. 3949 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 3950 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 3951 3952 // The middle block terminator has already been assigned a DebugLoc here (the 3953 // OrigLoop's single latch terminator). We want the whole middle block to 3954 // appear to execute on this line because: (a) it is all compiler generated, 3955 // (b) these instructions are always executed after evaluating the latch 3956 // conditional branch, and (c) other passes may add new predecessors which 3957 // terminate on this line. This is the easiest way to ensure we don't 3958 // accidentally cause an extra step back into the loop while debugging. 3959 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 3960 if (PhiR->isOrdered()) 3961 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 3962 else { 3963 // Floating-point operations should have some FMF to enable the reduction. 3964 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 3965 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 3966 for (unsigned Part = 1; Part < UF; ++Part) { 3967 Value *RdxPart = State.get(LoopExitInstDef, Part); 3968 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 3969 ReducedPartRdx = Builder.CreateBinOp( 3970 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 3971 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 3972 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 3973 ReducedPartRdx, RdxPart); 3974 else 3975 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 3976 } 3977 } 3978 3979 // Create the reduction after the loop. Note that inloop reductions create the 3980 // target reduction in the loop using a Reduction recipe. 3981 if (VF.isVector() && !PhiR->isInLoop()) { 3982 ReducedPartRdx = 3983 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 3984 // If the reduction can be performed in a smaller type, we need to extend 3985 // the reduction to the wider type before we branch to the original loop. 3986 if (PhiTy != RdxDesc.getRecurrenceType()) 3987 ReducedPartRdx = RdxDesc.isSigned() 3988 ? 
                         Builder.CreateSExt(ReducedPartRdx, PhiTy)
                         : Builder.CreateZExt(ReducedPartRdx, PhiTy);
  }

  PHINode *ResumePhi =
      dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());

  // Create a phi node that merges control-flow from the backedge-taken check
  // block and the middle block.
  PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
                                        LoopScalarPreHeader->getTerminator());

  // If we are fixing reductions in the epilogue loop then we should already
  // have created a bc.merge.rdx Phi after the main vector body. Ensure that
  // we carry over the incoming values correctly.
  for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
    if (Incoming == LoopMiddleBlock)
      BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
    else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
      BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
                              Incoming);
    else
      BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
  }

  // Set the resume value for this reduction.
  ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});

  // If there were stores of the reduction value to a uniform memory address
  // inside the loop, create the final store here.
  if (StoreInst *SI = RdxDesc.IntermediateStore) {
    StoreInst *NewSI =
        Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
    propagateMetadata(NewSI, SI);

    // If the reduction value is used in other places,
    // then let the code below create PHIs for that.
  }

  // Now, we need to fix the users of the reduction variable
  // inside and outside of the scalar remainder loop.

  // We know that the loop is in LCSSA form. We need to update the PHI nodes
  // in the exit blocks. See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
  if (!Cost->requiresScalarEpilogue(VF))
    for (PHINode &LCSSAPhi : LoopExitBlock->phis())
      if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
        LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
        State.Plan->removeLiveOut(&LCSSAPhi);
      }

  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
  int IncomingEdgeBlockIdx =
      OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
  // Pick the other block.
  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ?
0 : 1); 4047 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4048 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4049 } 4050 4051 void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, 4052 VPTransformState &State) { 4053 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 4054 RecurKind RK = RdxDesc.getRecurrenceKind(); 4055 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4056 return; 4057 4058 SmallVector<VPValue *, 8> Worklist; 4059 SmallPtrSet<VPValue *, 8> Visited; 4060 Worklist.push_back(PhiR); 4061 Visited.insert(PhiR); 4062 4063 while (!Worklist.empty()) { 4064 VPValue *Cur = Worklist.pop_back_val(); 4065 for (unsigned Part = 0; Part < UF; ++Part) { 4066 Value *V = State.get(Cur, Part); 4067 if (!isa<OverflowingBinaryOperator>(V)) 4068 break; 4069 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4070 } 4071 4072 for (VPUser *U : Cur->users()) { 4073 auto *UserRecipe = dyn_cast<VPRecipeBase>(U); 4074 if (!UserRecipe) 4075 continue; 4076 for (VPValue *V : UserRecipe->definedValues()) 4077 if (Visited.insert(V).second) 4078 Worklist.push_back(V); 4079 } 4080 } 4081 } 4082 4083 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4084 // The basic block and loop containing the predicated instruction. 4085 auto *PredBB = PredInst->getParent(); 4086 auto *VectorLoop = LI->getLoopFor(PredBB); 4087 4088 // Initialize a worklist with the operands of the predicated instruction. 4089 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4090 4091 // Holds instructions that we need to analyze again. An instruction may be 4092 // reanalyzed if we don't yet know if we can sink it or not. 4093 SmallVector<Instruction *, 8> InstsToReanalyze; 4094 4095 // Returns true if a given use occurs in the predicated block. Phi nodes use 4096 // their operands in their corresponding predecessor blocks. 4097 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4098 auto *I = cast<Instruction>(U.getUser()); 4099 BasicBlock *BB = I->getParent(); 4100 if (auto *Phi = dyn_cast<PHINode>(I)) 4101 BB = Phi->getIncomingBlock( 4102 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4103 return BB == PredBB; 4104 }; 4105 4106 // Iteratively sink the scalarized operands of the predicated instruction 4107 // into the block we created for it. When an instruction is sunk, it's 4108 // operands are then added to the worklist. The algorithm ends after one pass 4109 // through the worklist doesn't sink a single instruction. 4110 bool Changed; 4111 do { 4112 // Add the instructions that need to be reanalyzed to the worklist, and 4113 // reset the changed indicator. 4114 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4115 InstsToReanalyze.clear(); 4116 Changed = false; 4117 4118 while (!Worklist.empty()) { 4119 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4120 4121 // We can't sink an instruction if it is a phi node, is not in the loop, 4122 // or may have side effects. 4123 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4124 I->mayHaveSideEffects()) 4125 continue; 4126 4127 // If the instruction is already in PredBB, check if we can sink its 4128 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4129 // sinking the scalar instruction I, hence it appears in PredBB; but it 4130 // may have failed to sink I's operands (recursively), which we try 4131 // (again) here. 
4132 if (I->getParent() == PredBB) { 4133 Worklist.insert(I->op_begin(), I->op_end()); 4134 continue; 4135 } 4136 4137 // It's legal to sink the instruction if all its uses occur in the 4138 // predicated block. Otherwise, there's nothing to do yet, and we may 4139 // need to reanalyze the instruction. 4140 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4141 InstsToReanalyze.push_back(I); 4142 continue; 4143 } 4144 4145 // Move the instruction to the beginning of the predicated block, and add 4146 // it's operands to the worklist. 4147 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4148 Worklist.insert(I->op_begin(), I->op_end()); 4149 4150 // The sinking may have enabled other instructions to be sunk, so we will 4151 // need to iterate. 4152 Changed = true; 4153 } 4154 } while (Changed); 4155 } 4156 4157 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, 4158 VPTransformState &State) { 4159 auto Iter = depth_first( 4160 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry())); 4161 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 4162 for (VPRecipeBase &P : VPBB->phis()) { 4163 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P); 4164 if (!VPPhi) 4165 continue; 4166 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4167 // Make sure the builder has a valid insert point. 4168 Builder.SetInsertPoint(NewPhi); 4169 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4170 VPValue *Inc = VPPhi->getIncomingValue(i); 4171 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4172 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4173 } 4174 } 4175 } 4176 } 4177 4178 bool InnerLoopVectorizer::useOrderedReductions( 4179 const RecurrenceDescriptor &RdxDesc) { 4180 return Cost->useOrderedReductions(RdxDesc); 4181 } 4182 4183 /// A helper function for checking whether an integer division-related 4184 /// instruction may divide by zero (in which case it must be predicated if 4185 /// executed conditionally in the scalar code). 4186 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4187 /// Non-zero divisors that are non compile-time constants will not be 4188 /// converted into multiplication, so we will still end up scalarizing 4189 /// the division, but can do so w/o predication. 4190 static bool mayDivideByZero(Instruction &I) { 4191 assert((I.getOpcode() == Instruction::UDiv || 4192 I.getOpcode() == Instruction::SDiv || 4193 I.getOpcode() == Instruction::URem || 4194 I.getOpcode() == Instruction::SRem) && 4195 "Unexpected instruction"); 4196 Value *Divisor = I.getOperand(1); 4197 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4198 return !CInt || CInt->isZero(); 4199 } 4200 4201 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4202 VPUser &ArgOperands, 4203 VPTransformState &State) { 4204 assert(!isa<DbgInfoIntrinsic>(I) && 4205 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4206 setDebugLocFromInst(&I); 4207 4208 Module *M = I.getParent()->getParent()->getParent(); 4209 auto *CI = cast<CallInst>(&I); 4210 4211 SmallVector<Type *, 4> Tys; 4212 for (Value *ArgOperand : CI->args()) 4213 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4214 4215 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4216 4217 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4218 // version of the instruction. 4219 // Is it beneficial to perform intrinsic call compared to lib call? 
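  // For example (illustrative only): a call that maps to llvm.fabs.f32 can
  // usually be widened directly to the llvm.fabs.v4f32 intrinsic, while a
  // libm call such as expf is only widened if TargetLibraryInfo provides a
  // vector variant. The cheaper of the two options computed below is used,
  // and scalarization is handled elsewhere.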
4220 bool NeedToScalarize = false; 4221 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4222 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4223 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4224 assert((UseVectorIntrinsic || !NeedToScalarize) && 4225 "Instruction should be scalarized elsewhere."); 4226 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4227 "Either the intrinsic cost or vector call cost must be valid"); 4228 4229 for (unsigned Part = 0; Part < UF; ++Part) { 4230 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4231 SmallVector<Value *, 4> Args; 4232 for (auto &I : enumerate(ArgOperands.operands())) { 4233 // Some intrinsics have a scalar argument - don't replace it with a 4234 // vector. 4235 Value *Arg; 4236 if (!UseVectorIntrinsic || 4237 !isVectorIntrinsicWithScalarOpAtArg(ID, I.index())) 4238 Arg = State.get(I.value(), Part); 4239 else 4240 Arg = State.get(I.value(), VPIteration(0, 0)); 4241 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index())) 4242 TysForDecl.push_back(Arg->getType()); 4243 Args.push_back(Arg); 4244 } 4245 4246 Function *VectorF; 4247 if (UseVectorIntrinsic) { 4248 // Use vector version of the intrinsic. 4249 if (VF.isVector()) 4250 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4251 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4252 assert(VectorF && "Can't retrieve vector intrinsic."); 4253 } else { 4254 // Use vector version of the function call. 4255 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4256 #ifndef NDEBUG 4257 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4258 "Can't create vector function."); 4259 #endif 4260 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4261 } 4262 SmallVector<OperandBundleDef, 1> OpBundles; 4263 CI->getOperandBundlesAsDefs(OpBundles); 4264 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4265 4266 if (isa<FPMathOperator>(V)) 4267 V->copyFastMathFlags(CI); 4268 4269 State.set(Def, V, Part); 4270 addMetadata(V, &I); 4271 } 4272 } 4273 4274 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4275 // We should not collect Scalars more than once per VF. Right now, this 4276 // function is called from collectUniformsAndScalars(), which already does 4277 // this check. Collecting Scalars for VF=1 does not make any sense. 4278 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4279 "This function should not be visited twice for the same VF"); 4280 4281 // This avoids any chances of creating a REPLICATE recipe during planning 4282 // since that would result in generation of scalarized code during execution, 4283 // which is not supported for scalable vectors. 4284 if (VF.isScalable()) { 4285 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4286 return; 4287 } 4288 4289 SmallSetVector<Instruction *, 8> Worklist; 4290 4291 // These sets are used to seed the analysis with pointers used by memory 4292 // accesses that will remain scalar. 4293 SmallSetVector<Instruction *, 8> ScalarPtrs; 4294 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4295 auto *Latch = TheLoop->getLoopLatch(); 4296 4297 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4298 // The pointer operands of loads and stores will be scalar as long as the 4299 // memory access is not a gather or scatter operation. The value operand of a 4300 // store will remain scalar if the store is scalarized. 
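  // For example (illustrative): given a consecutive store such as
  //   store i32 %val, i32* %gep
  // only lane 0 of the address is needed to form the wide store, so the use
  // of %gep is a scalar use; if the same store had to become a scatter, every
  // lane of the address would be needed and the use would not be scalar.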
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a bitcast or
  // getelementptr instruction contained in the loop.
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use
  // will be a scalar use and the pointer is only used by memory accesses, we
  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // Insert the forced scalars.
4378 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector 4379 // induction variable when the PHI user is scalarized. 4380 auto ForcedScalar = ForcedScalars.find(VF); 4381 if (ForcedScalar != ForcedScalars.end()) 4382 for (auto *I : ForcedScalar->second) 4383 Worklist.insert(I); 4384 4385 // Expand the worklist by looking through any bitcasts and getelementptr 4386 // instructions we've already identified as scalar. This is similar to the 4387 // expansion step in collectLoopUniforms(); however, here we're only 4388 // expanding to include additional bitcasts and getelementptr instructions. 4389 unsigned Idx = 0; 4390 while (Idx != Worklist.size()) { 4391 Instruction *Dst = Worklist[Idx++]; 4392 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4393 continue; 4394 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4395 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4396 auto *J = cast<Instruction>(U); 4397 return !TheLoop->contains(J) || Worklist.count(J) || 4398 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4399 isScalarUse(J, Src)); 4400 })) { 4401 Worklist.insert(Src); 4402 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4403 } 4404 } 4405 4406 // An induction variable will remain scalar if all users of the induction 4407 // variable and induction variable update remain scalar. 4408 for (auto &Induction : Legal->getInductionVars()) { 4409 auto *Ind = Induction.first; 4410 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4411 4412 // If tail-folding is applied, the primary induction variable will be used 4413 // to feed a vector compare. 4414 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4415 continue; 4416 4417 // Returns true if \p Indvar is a pointer induction that is used directly by 4418 // load/store instruction \p I. 4419 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4420 Instruction *I) { 4421 return Induction.second.getKind() == 4422 InductionDescriptor::IK_PtrInduction && 4423 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4424 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4425 }; 4426 4427 // Determine if all users of the induction variable are scalar after 4428 // vectorization. 4429 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4430 auto *I = cast<Instruction>(U); 4431 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4432 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4433 }); 4434 if (!ScalarInd) 4435 continue; 4436 4437 // Determine if all users of the induction variable update instruction are 4438 // scalar after vectorization. 4439 auto ScalarIndUpdate = 4440 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4441 auto *I = cast<Instruction>(U); 4442 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4443 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4444 }); 4445 if (!ScalarIndUpdate) 4446 continue; 4447 4448 // The induction variable and its update instruction will remain scalar. 
4449 Worklist.insert(Ind); 4450 Worklist.insert(IndUpdate); 4451 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4452 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4453 << "\n"); 4454 } 4455 4456 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4457 } 4458 4459 bool LoopVectorizationCostModel::isScalarWithPredication( 4460 Instruction *I, ElementCount VF) const { 4461 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4462 return false; 4463 switch(I->getOpcode()) { 4464 default: 4465 break; 4466 case Instruction::Load: 4467 case Instruction::Store: { 4468 if (!Legal->isMaskRequired(I)) 4469 return false; 4470 auto *Ptr = getLoadStorePointerOperand(I); 4471 auto *Ty = getLoadStoreType(I); 4472 Type *VTy = Ty; 4473 if (VF.isVector()) 4474 VTy = VectorType::get(Ty, VF); 4475 const Align Alignment = getLoadStoreAlignment(I); 4476 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4477 TTI.isLegalMaskedGather(VTy, Alignment)) 4478 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4479 TTI.isLegalMaskedScatter(VTy, Alignment)); 4480 } 4481 case Instruction::UDiv: 4482 case Instruction::SDiv: 4483 case Instruction::SRem: 4484 case Instruction::URem: 4485 return mayDivideByZero(*I); 4486 } 4487 return false; 4488 } 4489 4490 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4491 Instruction *I, ElementCount VF) { 4492 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4493 assert(getWideningDecision(I, VF) == CM_Unknown && 4494 "Decision should not be set yet."); 4495 auto *Group = getInterleavedAccessGroup(I); 4496 assert(Group && "Must have a group."); 4497 4498 // If the instruction's allocated size doesn't equal it's type size, it 4499 // requires padding and will be scalarized. 4500 auto &DL = I->getModule()->getDataLayout(); 4501 auto *ScalarTy = getLoadStoreType(I); 4502 if (hasIrregularType(ScalarTy, DL)) 4503 return false; 4504 4505 // If the group involves a non-integral pointer, we may not be able to 4506 // losslessly cast all values to a common type. 4507 unsigned InterleaveFactor = Group->getFactor(); 4508 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); 4509 for (unsigned i = 0; i < InterleaveFactor; i++) { 4510 Instruction *Member = Group->getMember(i); 4511 if (!Member) 4512 continue; 4513 auto *MemberTy = getLoadStoreType(Member); 4514 bool MemberNI = DL.isNonIntegralPointerType(MemberTy); 4515 // Don't coerce non-integral pointers to integers or vice versa. 4516 if (MemberNI != ScalarNI) { 4517 // TODO: Consider adding special nullptr value case here 4518 return false; 4519 } else if (MemberNI && ScalarNI && 4520 ScalarTy->getPointerAddressSpace() != 4521 MemberTy->getPointerAddressSpace()) { 4522 return false; 4523 } 4524 } 4525 4526 // Check if masking is required. 4527 // A Group may need masking for one of two reasons: it resides in a block that 4528 // needs predication, or it was decided to use masking to deal with gaps 4529 // (either a gap at the end of a load-access that may result in a speculative 4530 // load, or any gaps in a store-access). 
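  // For example (illustrative): a load-only interleave group with factor 2
  // whose second member is absent would have its wide load read past the last
  // element on the final iteration, so it needs either a scalar epilogue
  // iteration or a masked load; a store group with a missing member always
  // needs masking, because the gap must not be written.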
4531 bool PredicatedAccessRequiresMasking = 4532 blockNeedsPredicationForAnyReason(I->getParent()) && 4533 Legal->isMaskRequired(I); 4534 bool LoadAccessWithGapsRequiresEpilogMasking = 4535 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4536 !isScalarEpilogueAllowed(); 4537 bool StoreAccessWithGapsRequiresMasking = 4538 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4539 if (!PredicatedAccessRequiresMasking && 4540 !LoadAccessWithGapsRequiresEpilogMasking && 4541 !StoreAccessWithGapsRequiresMasking) 4542 return true; 4543 4544 // If masked interleaving is required, we expect that the user/target had 4545 // enabled it, because otherwise it either wouldn't have been created or 4546 // it should have been invalidated by the CostModel. 4547 assert(useMaskedInterleavedAccesses(TTI) && 4548 "Masked interleave-groups for predicated accesses are not enabled."); 4549 4550 if (Group->isReverse()) 4551 return false; 4552 4553 auto *Ty = getLoadStoreType(I); 4554 const Align Alignment = getLoadStoreAlignment(I); 4555 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4556 : TTI.isLegalMaskedStore(Ty, Alignment); 4557 } 4558 4559 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4560 Instruction *I, ElementCount VF) { 4561 // Get and ensure we have a valid memory instruction. 4562 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4563 4564 auto *Ptr = getLoadStorePointerOperand(I); 4565 auto *ScalarTy = getLoadStoreType(I); 4566 4567 // In order to be widened, the pointer should be consecutive, first of all. 4568 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4569 return false; 4570 4571 // If the instruction is a store located in a predicated block, it will be 4572 // scalarized. 4573 if (isScalarWithPredication(I, VF)) 4574 return false; 4575 4576 // If the instruction's allocated size doesn't equal it's type size, it 4577 // requires padding and will be scalarized. 4578 auto &DL = I->getModule()->getDataLayout(); 4579 if (hasIrregularType(ScalarTy, DL)) 4580 return false; 4581 4582 return true; 4583 } 4584 4585 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4586 // We should not collect Uniforms more than once per VF. Right now, 4587 // this function is called from collectUniformsAndScalars(), which 4588 // already does this check. Collecting Uniforms for VF=1 does not make any 4589 // sense. 4590 4591 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4592 "This function should not be visited twice for the same VF"); 4593 4594 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4595 // not analyze again. Uniforms.count(VF) will return 1. 4596 Uniforms[VF].clear(); 4597 4598 // We now know that the loop is vectorizable! 4599 // Collect instructions inside the loop that will remain uniform after 4600 // vectorization. 4601 4602 // Global values, params and instructions outside of current loop are out of 4603 // scope. 4604 auto isOutOfScope = [&](Value *V) -> bool { 4605 Instruction *I = dyn_cast<Instruction>(V); 4606 return (!I || !TheLoop->contains(I)); 4607 }; 4608 4609 // Worklist containing uniform instructions demanding lane 0. 4610 SetVector<Instruction *> Worklist; 4611 BasicBlock *Latch = TheLoop->getLoopLatch(); 4612 4613 // Add uniform instructions demanding lane 0 to the worklist. 
Instructions 4614 // that are scalar with predication must not be considered uniform after 4615 // vectorization, because that would create an erroneous replicating region 4616 // where only a single instance out of VF should be formed. 4617 // TODO: optimize such seldom cases if found important, see PR40816. 4618 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4619 if (isOutOfScope(I)) { 4620 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 4621 << *I << "\n"); 4622 return; 4623 } 4624 if (isScalarWithPredication(I, VF)) { 4625 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4626 << *I << "\n"); 4627 return; 4628 } 4629 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4630 Worklist.insert(I); 4631 }; 4632 4633 // Start with the conditional branch. If the branch condition is an 4634 // instruction contained in the loop that is only used by the branch, it is 4635 // uniform. 4636 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4637 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4638 addToWorklistIfAllowed(Cmp); 4639 4640 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 4641 InstWidening WideningDecision = getWideningDecision(I, VF); 4642 assert(WideningDecision != CM_Unknown && 4643 "Widening decision should be ready at this moment"); 4644 4645 // A uniform memory op is itself uniform. We exclude uniform stores 4646 // here as they demand the last lane, not the first one. 4647 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 4648 assert(WideningDecision == CM_Scalarize); 4649 return true; 4650 } 4651 4652 return (WideningDecision == CM_Widen || 4653 WideningDecision == CM_Widen_Reverse || 4654 WideningDecision == CM_Interleave); 4655 }; 4656 4657 4658 // Returns true if Ptr is the pointer operand of a memory access instruction 4659 // I, and I is known to not require scalarization. 4660 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4661 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4662 }; 4663 4664 // Holds a list of values which are known to have at least one uniform use. 4665 // Note that there may be other uses which aren't uniform. A "uniform use" 4666 // here is something which only demands lane 0 of the unrolled iterations; 4667 // it does not imply that all lanes produce the same value (e.g. this is not 4668 // the usual meaning of uniform) 4669 SetVector<Value *> HasUniformUse; 4670 4671 // Scan the loop for instructions which are either a) known to have only 4672 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 4673 for (auto *BB : TheLoop->blocks()) 4674 for (auto &I : *BB) { 4675 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 4676 switch (II->getIntrinsicID()) { 4677 case Intrinsic::sideeffect: 4678 case Intrinsic::experimental_noalias_scope_decl: 4679 case Intrinsic::assume: 4680 case Intrinsic::lifetime_start: 4681 case Intrinsic::lifetime_end: 4682 if (TheLoop->hasLoopInvariantOperands(&I)) 4683 addToWorklistIfAllowed(&I); 4684 break; 4685 default: 4686 break; 4687 } 4688 } 4689 4690 // ExtractValue instructions must be uniform, because the operands are 4691 // known to be loop-invariant. 
4692 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 4693 assert(isOutOfScope(EVI->getAggregateOperand()) && 4694 "Expected aggregate value to be loop invariant"); 4695 addToWorklistIfAllowed(EVI); 4696 continue; 4697 } 4698 4699 // If there's no pointer operand, there's nothing to do. 4700 auto *Ptr = getLoadStorePointerOperand(&I); 4701 if (!Ptr) 4702 continue; 4703 4704 // A uniform memory op is itself uniform. We exclude uniform stores 4705 // here as they demand the last lane, not the first one. 4706 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 4707 addToWorklistIfAllowed(&I); 4708 4709 if (isUniformDecision(&I, VF)) { 4710 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 4711 HasUniformUse.insert(Ptr); 4712 } 4713 } 4714 4715 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 4716 // demanding) users. Since loops are assumed to be in LCSSA form, this 4717 // disallows uses outside the loop as well. 4718 for (auto *V : HasUniformUse) { 4719 if (isOutOfScope(V)) 4720 continue; 4721 auto *I = cast<Instruction>(V); 4722 auto UsersAreMemAccesses = 4723 llvm::all_of(I->users(), [&](User *U) -> bool { 4724 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 4725 }); 4726 if (UsersAreMemAccesses) 4727 addToWorklistIfAllowed(I); 4728 } 4729 4730 // Expand Worklist in topological order: whenever a new instruction 4731 // is added , its users should be already inside Worklist. It ensures 4732 // a uniform instruction will only be used by uniform instructions. 4733 unsigned idx = 0; 4734 while (idx != Worklist.size()) { 4735 Instruction *I = Worklist[idx++]; 4736 4737 for (auto OV : I->operand_values()) { 4738 // isOutOfScope operands cannot be uniform instructions. 4739 if (isOutOfScope(OV)) 4740 continue; 4741 // First order recurrence Phi's should typically be considered 4742 // non-uniform. 4743 auto *OP = dyn_cast<PHINode>(OV); 4744 if (OP && Legal->isFirstOrderRecurrence(OP)) 4745 continue; 4746 // If all the users of the operand are uniform, then add the 4747 // operand into the uniform worklist. 4748 auto *OI = cast<Instruction>(OV); 4749 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4750 auto *J = cast<Instruction>(U); 4751 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 4752 })) 4753 addToWorklistIfAllowed(OI); 4754 } 4755 } 4756 4757 // For an instruction to be added into Worklist above, all its users inside 4758 // the loop should also be in Worklist. However, this condition cannot be 4759 // true for phi nodes that form a cyclic dependence. We must process phi 4760 // nodes separately. An induction variable will remain uniform if all users 4761 // of the induction variable and induction variable update remain uniform. 4762 // The code below handles both pointer and non-pointer induction variables. 4763 for (auto &Induction : Legal->getInductionVars()) { 4764 auto *Ind = Induction.first; 4765 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4766 4767 // Determine if all users of the induction variable are uniform after 4768 // vectorization. 4769 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4770 auto *I = cast<Instruction>(U); 4771 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4772 isVectorizedMemAccessUse(I, Ind); 4773 }); 4774 if (!UniformInd) 4775 continue; 4776 4777 // Determine if all users of the induction variable update instruction are 4778 // uniform after vectorization. 
4779 auto UniformIndUpdate = 4780 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4781 auto *I = cast<Instruction>(U); 4782 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4783 isVectorizedMemAccessUse(I, IndUpdate); 4784 }); 4785 if (!UniformIndUpdate) 4786 continue; 4787 4788 // The induction variable and its update instruction will remain uniform. 4789 addToWorklistIfAllowed(Ind); 4790 addToWorklistIfAllowed(IndUpdate); 4791 } 4792 4793 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4794 } 4795 4796 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4797 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4798 4799 if (Legal->getRuntimePointerChecking()->Need) { 4800 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4801 "runtime pointer checks needed. Enable vectorization of this " 4802 "loop with '#pragma clang loop vectorize(enable)' when " 4803 "compiling with -Os/-Oz", 4804 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4805 return true; 4806 } 4807 4808 if (!PSE.getPredicate().isAlwaysTrue()) { 4809 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4810 "runtime SCEV checks needed. Enable vectorization of this " 4811 "loop with '#pragma clang loop vectorize(enable)' when " 4812 "compiling with -Os/-Oz", 4813 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4814 return true; 4815 } 4816 4817 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4818 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4819 reportVectorizationFailure("Runtime stride check for small trip count", 4820 "runtime stride == 1 checks needed. Enable vectorization of " 4821 "this loop without such check by compiling with -Os/-Oz", 4822 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4823 return true; 4824 } 4825 4826 return false; 4827 } 4828 4829 ElementCount 4830 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 4831 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 4832 return ElementCount::getScalable(0); 4833 4834 if (Hints->isScalableVectorizationDisabled()) { 4835 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 4836 "ScalableVectorizationDisabled", ORE, TheLoop); 4837 return ElementCount::getScalable(0); 4838 } 4839 4840 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 4841 4842 auto MaxScalableVF = ElementCount::getScalable( 4843 std::numeric_limits<ElementCount::ScalarTy>::max()); 4844 4845 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 4846 // FIXME: While for scalable vectors this is currently sufficient, this should 4847 // be replaced by a more detailed mechanism that filters out specific VFs, 4848 // instead of invalidating vectorization for a whole set of VFs based on the 4849 // MaxVF. 4850 4851 // Disable scalable vectorization if the loop contains unsupported reductions. 4852 if (!canVectorizeReductions(MaxScalableVF)) { 4853 reportVectorizationInfo( 4854 "Scalable vectorization not supported for the reduction " 4855 "operations found in this loop.", 4856 "ScalableVFUnfeasible", ORE, TheLoop); 4857 return ElementCount::getScalable(0); 4858 } 4859 4860 // Disable scalable vectorization if the loop contains any instructions 4861 // with element types not supported for scalable vectors. 
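  // For example (illustrative): on a target whose scalable vectors only
  // support the common integer and floating-point element types, a loop
  // operating on fp128 values is rejected here instead of forming an illegal
  // <vscale x N x fp128> type later.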
4862 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 4863 return !Ty->isVoidTy() && 4864 !this->TTI.isElementTypeLegalForScalableVector(Ty); 4865 })) { 4866 reportVectorizationInfo("Scalable vectorization is not supported " 4867 "for all element types found in this loop.", 4868 "ScalableVFUnfeasible", ORE, TheLoop); 4869 return ElementCount::getScalable(0); 4870 } 4871 4872 if (Legal->isSafeForAnyVectorWidth()) 4873 return MaxScalableVF; 4874 4875 // Limit MaxScalableVF by the maximum safe dependence distance. 4876 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 4877 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 4878 MaxVScale = 4879 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 4880 MaxScalableVF = ElementCount::getScalable( 4881 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 4882 if (!MaxScalableVF) 4883 reportVectorizationInfo( 4884 "Max legal vector width too small, scalable vectorization " 4885 "unfeasible.", 4886 "ScalableVFUnfeasible", ORE, TheLoop); 4887 4888 return MaxScalableVF; 4889 } 4890 4891 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 4892 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 4893 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4894 unsigned SmallestType, WidestType; 4895 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4896 4897 // Get the maximum safe dependence distance in bits computed by LAA. 4898 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4899 // the memory accesses that is most restrictive (involved in the smallest 4900 // dependence distance). 4901 unsigned MaxSafeElements = 4902 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 4903 4904 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 4905 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 4906 4907 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 4908 << ".\n"); 4909 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 4910 << ".\n"); 4911 4912 // First analyze the UserVF, fall back if the UserVF should be ignored. 4913 if (UserVF) { 4914 auto MaxSafeUserVF = 4915 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 4916 4917 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 4918 // If `VF=vscale x N` is safe, then so is `VF=N` 4919 if (UserVF.isScalable()) 4920 return FixedScalableVFPair( 4921 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 4922 else 4923 return UserVF; 4924 } 4925 4926 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 4927 4928 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 4929 // is better to ignore the hint and let the compiler choose a suitable VF. 
    if (!UserVF.isScalable()) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe, clamping to max safe VF="
                        << MaxSafeFixedVF << ".\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe, clamping to maximum safe vectorization factor "
               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
      });
      return MaxSafeFixedVF;
    }

    if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is ignored because scalable vectors are not "
                           "available.\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is ignored because the target does not support scalable "
                  "vectors. The compiler will pick a more suitable value.";
      });
    } else {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe. Ignoring scalable UserVF.\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe. Ignoring the hint to let the compiler pick a "
                  "more suitable value.";
      });
    }
  }

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");

  FixedScalableVFPair Result(ElementCount::getFixed(1),
                             ElementCount::getScalable(0));
  if (auto MaxVF =
          getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
                                  MaxSafeFixedVF, FoldTailByMasking))
    Result.FixedVF = MaxVF;

  if (auto MaxVF =
          getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
                                  MaxSafeScalableVF, FoldTailByMasking))
    if (MaxVF.isScalable()) {
      Result.ScalableVF = MaxVF;
      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
                        << "\n");
    }

  return Result;
}

FixedScalableVFPair
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this anyway, since the check is still
    // likely to be dynamically uniform if the target can skip it.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(TC, UserVF, false);
  case CM_ScalarEpilogueNotAllowedUsePredicate:
    LLVM_FALLTHROUGH;
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return FixedScalableVFPair::getNone();

    break;
  }

  // The only loops we can vectorize without a scalar epilogue are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration. This will
  // require a lane mask which varies through the vector loop body. (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fall back to a vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF, false);
    }
    return FixedScalableVFPair::getNone();
  }

  // Now try the tail folding.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we chose.
  // FIXME: The condition below pessimises the case for fixed-width vectors,
  // when scalable VFs are also candidates for vectorization.
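  // For example (illustrative values): with a known trip count of 128,
  // MaxFixedVF = 16 and no user-specified interleave count, the expression
  // below computes 128 urem 16 == 0, so no scalar tail remains and the fixed
  // VFs can be accepted without folding the tail.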
5079 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5080 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5081 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5082 "MaxFixedVF must be a power of 2"); 5083 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5084 : MaxFixedVF.getFixedValue(); 5085 ScalarEvolution *SE = PSE.getSE(); 5086 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5087 const SCEV *ExitCount = SE->getAddExpr( 5088 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5089 const SCEV *Rem = SE->getURemExpr( 5090 SE->applyLoopGuards(ExitCount, TheLoop), 5091 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5092 if (Rem->isZero()) { 5093 // Accept MaxFixedVF if we do not have a tail. 5094 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5095 return MaxFactors; 5096 } 5097 } 5098 5099 // If we don't know the precise trip count, or if the trip count that we 5100 // found modulo the vectorization factor is not zero, try to fold the tail 5101 // by masking. 5102 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5103 if (Legal->prepareToFoldTailByMasking()) { 5104 FoldTailByMasking = true; 5105 return MaxFactors; 5106 } 5107 5108 // If there was a tail-folding hint/switch, but we can't fold the tail by 5109 // masking, fallback to a vectorization with a scalar epilogue. 5110 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5111 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5112 "scalar epilogue instead.\n"); 5113 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5114 return MaxFactors; 5115 } 5116 5117 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5118 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5119 return FixedScalableVFPair::getNone(); 5120 } 5121 5122 if (TC == 0) { 5123 reportVectorizationFailure( 5124 "Unable to calculate the loop count due to complex control flow", 5125 "unable to calculate the loop count due to complex control flow", 5126 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5127 return FixedScalableVFPair::getNone(); 5128 } 5129 5130 reportVectorizationFailure( 5131 "Cannot optimize for size and vectorize at the same time.", 5132 "cannot optimize for size and vectorize at the same time. " 5133 "Enable vectorization of this loop with '#pragma clang loop " 5134 "vectorize(enable)' when compiling with -Os/-Oz", 5135 "NoTailLoopWithOptForSize", ORE, TheLoop); 5136 return FixedScalableVFPair::getNone(); 5137 } 5138 5139 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5140 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5141 ElementCount MaxSafeVF, bool FoldTailByMasking) { 5142 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5143 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5144 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5145 : TargetTransformInfo::RGK_FixedWidthVector); 5146 5147 // Convenience function to return the minimum of two ElementCounts. 5148 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5149 assert((LHS.isScalable() == RHS.isScalable()) && 5150 "Scalable flags must match"); 5151 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5152 }; 5153 5154 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5155 // Note that both WidestRegister and WidestType may not be a powers of 2. 
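  // For example (illustrative values): with a 128-bit vector register and a
  // widest scalar type of 32 bits, PowerOf2Floor(128 / 32) yields 4 lanes,
  // i.e. a maximum element count of 4 (or vscale x 4 when computing a
  // scalable VF), which is then further clamped by MaxSafeVF below.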
5156 auto MaxVectorElementCount = ElementCount::get( 5157 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5158 ComputeScalableMaxVF); 5159 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5160 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5161 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5162 5163 if (!MaxVectorElementCount) { 5164 LLVM_DEBUG(dbgs() << "LV: The target has no " 5165 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5166 << " vector registers.\n"); 5167 return ElementCount::getFixed(1); 5168 } 5169 5170 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5171 if (ConstTripCount && 5172 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5173 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5174 // If loop trip count (TC) is known at compile time there is no point in 5175 // choosing VF greater than TC (as done in the loop below). Select maximum 5176 // power of two which doesn't exceed TC. 5177 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5178 // when the TC is less than or equal to the known number of lanes. 5179 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5180 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5181 "exceeding the constant trip count: " 5182 << ClampedConstTripCount << "\n"); 5183 return ElementCount::getFixed(ClampedConstTripCount); 5184 } 5185 5186 TargetTransformInfo::RegisterKind RegKind = 5187 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5188 : TargetTransformInfo::RGK_FixedWidthVector; 5189 ElementCount MaxVF = MaxVectorElementCount; 5190 if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && 5191 TTI.shouldMaximizeVectorBandwidth(RegKind))) { 5192 auto MaxVectorElementCountMaxBW = ElementCount::get( 5193 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5194 ComputeScalableMaxVF); 5195 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5196 5197 // Collect all viable vectorization factors larger than the default MaxVF 5198 // (i.e. MaxVectorElementCount). 5199 SmallVector<ElementCount, 8> VFs; 5200 for (ElementCount VS = MaxVectorElementCount * 2; 5201 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5202 VFs.push_back(VS); 5203 5204 // For each VF calculate its register usage. 5205 auto RUs = calculateRegisterUsage(VFs); 5206 5207 // Select the largest VF which doesn't require more registers than existing 5208 // ones. 5209 for (int i = RUs.size() - 1; i >= 0; --i) { 5210 bool Selected = true; 5211 for (auto &pair : RUs[i].MaxLocalUsers) { 5212 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5213 if (pair.second > TargetNumRegisters) 5214 Selected = false; 5215 } 5216 if (Selected) { 5217 MaxVF = VFs[i]; 5218 break; 5219 } 5220 } 5221 if (ElementCount MinVF = 5222 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5223 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5224 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5225 << ") with target's minimum: " << MinVF << '\n'); 5226 MaxVF = MinVF; 5227 } 5228 } 5229 5230 // Invalidate any widening decisions we might have made, in case the loop 5231 // requires prediction (decided later), but we have already made some 5232 // load/store widening decisions. 
5233 invalidateCostModelingDecisions(); 5234 } 5235 return MaxVF; 5236 } 5237 5238 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { 5239 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5240 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5241 auto Min = Attr.getVScaleRangeMin(); 5242 auto Max = Attr.getVScaleRangeMax(); 5243 if (Max && Min == Max) 5244 return Max; 5245 } 5246 5247 return TTI.getVScaleForTuning(); 5248 } 5249 5250 bool LoopVectorizationCostModel::isMoreProfitable( 5251 const VectorizationFactor &A, const VectorizationFactor &B) const { 5252 InstructionCost CostA = A.Cost; 5253 InstructionCost CostB = B.Cost; 5254 5255 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5256 5257 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5258 MaxTripCount) { 5259 // If we are folding the tail and the trip count is a known (possibly small) 5260 // constant, the trip count will be rounded up to an integer number of 5261 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5262 // which we compare directly. When not folding the tail, the total cost will 5263 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5264 // approximated with the per-lane cost below instead of using the tripcount 5265 // as here. 5266 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5267 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5268 return RTCostA < RTCostB; 5269 } 5270 5271 // Improve estimate for the vector width if it is scalable. 5272 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5273 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5274 if (Optional<unsigned> VScale = getVScaleForTuning()) { 5275 if (A.Width.isScalable()) 5276 EstimatedWidthA *= VScale.getValue(); 5277 if (B.Width.isScalable()) 5278 EstimatedWidthB *= VScale.getValue(); 5279 } 5280 5281 // Assume vscale may be larger than 1 (or the value being tuned for), 5282 // so that scalable vectorization is slightly favorable over fixed-width 5283 // vectorization. 5284 if (A.Width.isScalable() && !B.Width.isScalable()) 5285 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5286 5287 // To avoid the need for FP division: 5288 // (CostA / A.Width) < (CostB / B.Width) 5289 // <=> (CostA * B.Width) < (CostB * A.Width) 5290 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5291 } 5292 5293 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5294 const ElementCountSet &VFCandidates) { 5295 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5296 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5297 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5298 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5299 "Expected Scalar VF to be a candidate"); 5300 5301 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, 5302 ExpectedCost); 5303 VectorizationFactor ChosenFactor = ScalarCost; 5304 5305 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5306 if (ForceVectorization && VFCandidates.size() > 1) { 5307 // Ignore scalar width, because the user explicitly wants vectorization. 5308 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5309 // evaluation. 
    ChosenFactor.Cost = InstructionCost::getMax();
  }

  SmallVector<InstructionVFPair> InvalidCosts;
  for (const auto &i : VFCandidates) {
    // The cost for scalar VF=1 is already calculated, so ignore it.
    if (i.isScalar())
      continue;

    VectorizationCostTy C = expectedCost(i, &InvalidCosts);
    VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);

#ifndef NDEBUG
    unsigned AssumedMinimumVscale = 1;
    if (Optional<unsigned> VScale = getVScaleForTuning())
      AssumedMinimumVscale = *VScale;
    unsigned Width =
        Candidate.Width.isScalable()
            ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
            : Candidate.Width.getFixedValue();
    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
                      << " costs: " << (Candidate.Cost / Width));
    if (i.isScalable())
      LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
                        << AssumedMinimumVscale << ")");
    LLVM_DEBUG(dbgs() << ".\n");
#endif

    if (!C.second && !ForceVectorization) {
      LLVM_DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }

    // If profitable, add it to the ProfitableVFs list.
    if (isMoreProfitable(Candidate, ScalarCost))
      ProfitableVFs.push_back(Candidate);

    if (isMoreProfitable(Candidate, ChosenFactor))
      ChosenFactor = Candidate;
  }

  // Emit a report of VFs with invalid costs in the loop.
  if (!InvalidCosts.empty()) {
    // Group the remarks per instruction, keeping the instruction order from
    // InvalidCosts.
    std::map<Instruction *, unsigned> Numbering;
    unsigned I = 0;
    for (auto &Pair : InvalidCosts)
      if (!Numbering.count(Pair.first))
        Numbering[Pair.first] = I++;

    // Sort the list, first on instruction (number) then on VF.
    llvm::sort(InvalidCosts,
               [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
                 if (Numbering[A.first] != Numbering[B.first])
                   return Numbering[A.first] < Numbering[B.first];
                 ElementCountComparator ECC;
                 return ECC(A.second, B.second);
               });

    // For a list of ordered instruction-vf pairs:
    //   [(load, vf1), (load, vf2), (store, vf1)]
    // Group the instructions together to emit separate remarks for:
    //   load  (vf1, vf2)
    //   store (vf1)
    auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
    auto Subset = ArrayRef<InstructionVFPair>();
    do {
      if (Subset.empty())
        Subset = Tail.take_front(1);

      Instruction *I = Subset.front().first;

      // If the next instruction is different, or if there are no other pairs,
      // emit a remark for the collated subset. e.g.
      //   [(load, vf1), (load, vf2)]
      // to emit:
      //   remark: invalid costs for 'load' at VF=(vf1, vf2)
      if (Subset == Tail || Tail[Subset.size()].first != I) {
        std::string OutString;
        raw_string_ostream OS(OutString);
        assert(!Subset.empty() && "Unexpected empty range");
        OS << "Instruction with invalid costs prevented vectorization at VF=(";
        for (auto &Pair : Subset)
          OS << (Pair.second == Subset.front().second ? "" : ", ")
             << Pair.second;
        OS << "):";
        if (auto *CI = dyn_cast<CallInst>(I))
          OS << " call to " << CI->getCalledFunction()->getName();
        else
          OS << " " << I->getOpcodeName();
        OS.flush();
        reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
        Tail = Tail.drop_front(Subset.size());
        Subset = {};
      } else
        // Grow the subset by one element.
        Subset = Tail.take_front(Subset.size() + 1);
    } while (!Tail.empty());
  }

  if (!EnableCondStoresVectorization && NumPredStores) {
    reportVectorizationFailure("There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, TheLoop);
    ChosenFactor = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
                 ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
  return ChosenFactor;
}

bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
    const Loop &L, ElementCount VF) const {
  // Cross-iteration phis such as reductions need special handling and are
  // currently unsupported.
  if (any_of(L.getHeader()->phis(),
             [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
    return false;

  // Phis with uses outside of the loop require special handling and are
  // currently unsupported.
  for (auto &Entry : Legal->getInductionVars()) {
    // Look for uses of the value of the induction at the last iteration.
    Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
    for (User *U : PostInc->users())
      if (!L.contains(cast<Instruction>(U)))
        return false;
    // Look for uses of the penultimate value of the induction.
    for (User *U : Entry.first->users())
      if (!L.contains(cast<Instruction>(U)))
        return false;
  }

  // Induction variables that are widened require special handling that is
  // currently not supported.
  if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
        return !(this->isScalarAfterVectorization(Entry.first, VF) ||
                 this->isProfitableToScalarize(Entry.first, VF));
      }))
    return false;

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
  if (L.getExitingBlock() != L.getLoopLatch())
    return false;

  return true;
}

bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    const ElementCount VF) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.
  // We also consider epilogue vectorization unprofitable for targets that
  // don't consider interleaving beneficial (e.g. MVE).
  if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
    return false;
  // FIXME: We should consider changing the threshold for scalable
  // vectors to take VScaleForTuning into account.
5479 if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF) 5480 return true; 5481 return false; 5482 } 5483 5484 VectorizationFactor 5485 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5486 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5487 VectorizationFactor Result = VectorizationFactor::Disabled(); 5488 if (!EnableEpilogueVectorization) { 5489 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5490 return Result; 5491 } 5492 5493 if (!isScalarEpilogueAllowed()) { 5494 LLVM_DEBUG( 5495 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5496 "allowed.\n";); 5497 return Result; 5498 } 5499 5500 // Not really a cost consideration, but check for unsupported cases here to 5501 // simplify the logic. 5502 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5503 LLVM_DEBUG( 5504 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5505 "not a supported candidate.\n";); 5506 return Result; 5507 } 5508 5509 if (EpilogueVectorizationForceVF > 1) { 5510 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5511 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5512 if (LVP.hasPlanWithVF(ForcedEC)) 5513 return {ForcedEC, 0, 0}; 5514 else { 5515 LLVM_DEBUG( 5516 dbgs() 5517 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5518 return Result; 5519 } 5520 } 5521 5522 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5523 TheLoop->getHeader()->getParent()->hasMinSize()) { 5524 LLVM_DEBUG( 5525 dbgs() 5526 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5527 return Result; 5528 } 5529 5530 if (!isEpilogueVectorizationProfitable(MainLoopVF)) { 5531 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5532 "this loop\n"); 5533 return Result; 5534 } 5535 5536 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 5537 // the main loop handles 8 lanes per iteration. We could still benefit from 5538 // vectorizing the epilogue loop with VF=4. 5539 ElementCount EstimatedRuntimeVF = MainLoopVF; 5540 if (MainLoopVF.isScalable()) { 5541 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5542 if (Optional<unsigned> VScale = getVScaleForTuning()) 5543 EstimatedRuntimeVF *= *VScale; 5544 } 5545 5546 for (auto &NextVF : ProfitableVFs) 5547 if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5548 ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || 5549 ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && 5550 (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && 5551 LVP.hasPlanWithVF(NextVF.Width)) 5552 Result = NextVF; 5553 5554 if (Result != VectorizationFactor::Disabled()) 5555 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5556 << Result.Width << "\n";); 5557 return Result; 5558 } 5559 5560 std::pair<unsigned, unsigned> 5561 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5562 unsigned MinWidth = -1U; 5563 unsigned MaxWidth = 8; 5564 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5565 // For in-loop reductions, no element types are added to ElementTypesInLoop 5566 // if there are no loads/stores in the loop. In this case, check through the 5567 // reduction variables to determine the maximum width. 5568 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5569 // Reset MaxWidth so that we can find the smallest type used by recurrences 5570 // in the loop. 
5571 MaxWidth = -1U; 5572 for (auto &PhiDescriptorPair : Legal->getReductionVars()) { 5573 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5574 // When finding the min width used by the recurrence we need to account 5575 // for casts on the input operands of the recurrence. 5576 MaxWidth = std::min<unsigned>( 5577 MaxWidth, std::min<unsigned>( 5578 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5579 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5580 } 5581 } else { 5582 for (Type *T : ElementTypesInLoop) { 5583 MinWidth = std::min<unsigned>( 5584 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5585 MaxWidth = std::max<unsigned>( 5586 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5587 } 5588 } 5589 return {MinWidth, MaxWidth}; 5590 } 5591 5592 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5593 ElementTypesInLoop.clear(); 5594 // For each block. 5595 for (BasicBlock *BB : TheLoop->blocks()) { 5596 // For each instruction in the loop. 5597 for (Instruction &I : BB->instructionsWithoutDebug()) { 5598 Type *T = I.getType(); 5599 5600 // Skip ignored values. 5601 if (ValuesToIgnore.count(&I)) 5602 continue; 5603 5604 // Only examine Loads, Stores and PHINodes. 5605 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5606 continue; 5607 5608 // Examine PHI nodes that are reduction variables. Update the type to 5609 // account for the recurrence type. 5610 if (auto *PN = dyn_cast<PHINode>(&I)) { 5611 if (!Legal->isReductionVariable(PN)) 5612 continue; 5613 const RecurrenceDescriptor &RdxDesc = 5614 Legal->getReductionVars().find(PN)->second; 5615 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5616 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5617 RdxDesc.getRecurrenceType(), 5618 TargetTransformInfo::ReductionFlags())) 5619 continue; 5620 T = RdxDesc.getRecurrenceType(); 5621 } 5622 5623 // Examine the stored values. 5624 if (auto *ST = dyn_cast<StoreInst>(&I)) 5625 T = ST->getValueOperand()->getType(); 5626 5627 assert(T->isSized() && 5628 "Expected the load/store/recurrence type to be sized"); 5629 5630 ElementTypesInLoop.insert(T); 5631 } 5632 } 5633 } 5634 5635 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5636 unsigned LoopCost) { 5637 // -- The interleave heuristics -- 5638 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5639 // There are many micro-architectural considerations that we can't predict 5640 // at this level. For example, frontend pressure (on decode or fetch) due to 5641 // code size, or the number and capabilities of the execution ports. 5642 // 5643 // We use the following heuristics to select the interleave count: 5644 // 1. If the code has reductions, then we interleave to break the cross 5645 // iteration dependency. 5646 // 2. If the loop is really small, then we interleave to reduce the loop 5647 // overhead. 5648 // 3. We don't interleave if we think that we will spill registers to memory 5649 // due to the increased register pressure. 5650 5651 if (!isScalarEpilogueAllowed()) 5652 return 1; 5653 5654 // We used the distance for the interleave count. 5655 if (Legal->getMaxSafeDepDistBytes() != -1U) 5656 return 1; 5657 5658 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5659 const bool HasReductions = !Legal->getReductionVars().empty(); 5660 // Do not interleave loops with a relatively small known or estimated trip 5661 // count. 
But we will interleave when InterleaveSmallLoopScalarReduction is 5662 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5663 // because with the above conditions interleaving can expose ILP and break 5664 // cross iteration dependences for reductions. 5665 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5666 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5667 return 1; 5668 5669 // If we did not calculate the cost for VF (because the user selected the VF) 5670 // then we calculate the cost of VF here. 5671 if (LoopCost == 0) { 5672 InstructionCost C = expectedCost(VF).first; 5673 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 5674 LoopCost = *C.getValue(); 5675 5676 // Loop body is free and there is no need for interleaving. 5677 if (LoopCost == 0) 5678 return 1; 5679 } 5680 5681 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5682 // We divide by these constants so assume that we have at least one 5683 // instruction that uses at least one register. 5684 for (auto& pair : R.MaxLocalUsers) { 5685 pair.second = std::max(pair.second, 1U); 5686 } 5687 5688 // We calculate the interleave count using the following formula. 5689 // Subtract the number of loop invariants from the number of available 5690 // registers. These registers are used by all of the interleaved instances. 5691 // Next, divide the remaining registers by the number of registers that is 5692 // required by the loop, in order to estimate how many parallel instances 5693 // fit without causing spills. All of this is rounded down if necessary to be 5694 // a power of two. We want power of two interleave count to simplify any 5695 // addressing operations or alignment considerations. 5696 // We also want power of two interleave counts to ensure that the induction 5697 // variable of the vector loop wraps to zero, when tail is folded by masking; 5698 // this currently happens when OptForSize, in which case IC is set to 1 above. 5699 unsigned IC = UINT_MAX; 5700 5701 for (auto& pair : R.MaxLocalUsers) { 5702 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5703 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5704 << " registers of " 5705 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5706 if (VF.isScalar()) { 5707 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5708 TargetNumRegisters = ForceTargetNumScalarRegs; 5709 } else { 5710 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5711 TargetNumRegisters = ForceTargetNumVectorRegs; 5712 } 5713 unsigned MaxLocalUsers = pair.second; 5714 unsigned LoopInvariantRegs = 0; 5715 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5716 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5717 5718 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5719 // Don't count the induction variable as interleaved. 5720 if (EnableIndVarRegisterHeur) { 5721 TmpIC = 5722 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5723 std::max(1U, (MaxLocalUsers - 1))); 5724 } 5725 5726 IC = std::min(IC, TmpIC); 5727 } 5728 5729 // Clamp the interleave ranges to reasonable counts. 5730 unsigned MaxInterleaveCount = 5731 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 5732 5733 // Check if the user has overridden the max. 
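  // Illustrative example of the register-pressure bound computed above (the
  // numbers are hypothetical and not taken from any particular target): with
  // 32 registers in a class, 2 of them held by loop-invariant values and at
  // most 7 values of that class live at once, the bound is
  //   PowerOf2Floor((32 - 2) / 7) = PowerOf2Floor(4) = 4
  // i.e. at most 4 interleaved copies are assumed to fit without spilling;
  // the smallest such bound over all register classes becomes IC.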
5734 if (VF.isScalar()) { 5735 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5736 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5737 } else { 5738 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5739 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5740 } 5741 5742 // If trip count is known or estimated compile time constant, limit the 5743 // interleave count to be less than the trip count divided by VF, provided it 5744 // is at least 1. 5745 // 5746 // For scalable vectors we can't know if interleaving is beneficial. It may 5747 // not be beneficial for small loops if none of the lanes in the second vector 5748 // iterations is enabled. However, for larger loops, there is likely to be a 5749 // similar benefit as for fixed-width vectors. For now, we choose to leave 5750 // the InterleaveCount as if vscale is '1', although if some information about 5751 // the vector is known (e.g. min vector size), we can make a better decision. 5752 if (BestKnownTC) { 5753 MaxInterleaveCount = 5754 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5755 // Make sure MaxInterleaveCount is greater than 0. 5756 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 5757 } 5758 5759 assert(MaxInterleaveCount > 0 && 5760 "Maximum interleave count must be greater than 0"); 5761 5762 // Clamp the calculated IC to be between the 1 and the max interleave count 5763 // that the target and trip count allows. 5764 if (IC > MaxInterleaveCount) 5765 IC = MaxInterleaveCount; 5766 else 5767 // Make sure IC is greater than 0. 5768 IC = std::max(1u, IC); 5769 5770 assert(IC > 0 && "Interleave count must be greater than 0."); 5771 5772 // Interleave if we vectorized this loop and there is a reduction that could 5773 // benefit from interleaving. 5774 if (VF.isVector() && HasReductions) { 5775 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5776 return IC; 5777 } 5778 5779 // For any scalar loop that either requires runtime checks or predication we 5780 // are better off leaving this to the unroller. Note that if we've already 5781 // vectorized the loop we will have done the runtime check and so interleaving 5782 // won't require further checks. 5783 bool ScalarInterleavingRequiresPredication = 5784 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5785 return Legal->blockNeedsPredication(BB); 5786 })); 5787 bool ScalarInterleavingRequiresRuntimePointerCheck = 5788 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5789 5790 // We want to interleave small loops in order to reduce the loop overhead and 5791 // potentially expose ILP opportunities. 5792 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5793 << "LV: IC is " << IC << '\n' 5794 << "LV: VF is " << VF << '\n'); 5795 const bool AggressivelyInterleaveReductions = 5796 TTI.enableAggressiveInterleaving(HasReductions); 5797 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5798 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5799 // We assume that the cost overhead is 1 and we use the cost model 5800 // to estimate the cost of the loop and interleave until the cost of the 5801 // loop overhead is about 5% of the cost of the loop. 5802 unsigned SmallIC = 5803 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5804 5805 // Interleave until store/load ports (estimated by max interleave count) are 5806 // saturated. 
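    // For illustration (hypothetical numbers): with IC = 8, two stores and one
    // load in the loop, StoresIC = 8 / 2 = 4 and LoadsIC = 8 / 1 = 8. If
    // EnableLoadStoreRuntimeInterleave is set and max(StoresIC, LoadsIC) = 8
    // exceeds SmallIC, we interleave by 8 further below to keep the memory
    // ports busy rather than by the smaller SmallIC.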
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // There is little point in interleaving for reductions containing selects
    // and compares when VF=1 since it may just create more overhead than it's
    // worth for loops with small trip counts. This is because we still have to
    // do the final reduction after the loop.
    bool HasSelectCmpReductions =
        HasReductions &&
        any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
          const RecurrenceDescriptor &RdxDesc = Reduction.second;
          return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
              RdxDesc.getRecurrenceKind());
        });
    if (HasSelectCmpReductions) {
      LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
      return 1;
    }

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. For tree-wise reductions
    // set the limit to 2, and for ordered reductions it's best to disable
    // interleaving entirely.
    if (HasReductions && TheLoop->getLoopDepth() > 1) {
      bool HasOrderedReductions =
          any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
            const RecurrenceDescriptor &RdxDesc = Reduction.second;
            return RdxDesc.isOrdered();
          });
      if (HasOrderedReductions) {
        LLVM_DEBUG(
            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
        return 1;
      }

      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
        AggressivelyInterleaveReductions) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in topological order and
  // assign a number to each instruction.
We use RPO to ensure that defs are 5889 // met before their users. We assume that each instruction that has in-loop 5890 // users starts an interval. We record every time that an in-loop value is 5891 // used, so we have a list of the first and last occurrences of each 5892 // instruction. Next, we transpose this data structure into a multi map that 5893 // holds the list of intervals that *end* at a specific location. This multi 5894 // map allows us to perform a linear search. We scan the instructions linearly 5895 // and record each time that a new interval starts, by placing it in a set. 5896 // If we find this value in the multi-map then we remove it from the set. 5897 // The max register usage is the maximum size of the set. 5898 // We also search for instructions that are defined outside the loop, but are 5899 // used inside the loop. We need this number separately from the max-interval 5900 // usage number because when we unroll, loop-invariant values do not take 5901 // more register. 5902 LoopBlocksDFS DFS(TheLoop); 5903 DFS.perform(LI); 5904 5905 RegisterUsage RU; 5906 5907 // Each 'key' in the map opens a new interval. The values 5908 // of the map are the index of the 'last seen' usage of the 5909 // instruction that is the key. 5910 using IntervalMap = DenseMap<Instruction *, unsigned>; 5911 5912 // Maps instruction to its index. 5913 SmallVector<Instruction *, 64> IdxToInstr; 5914 // Marks the end of each interval. 5915 IntervalMap EndPoint; 5916 // Saves the list of instruction indices that are used in the loop. 5917 SmallPtrSet<Instruction *, 8> Ends; 5918 // Saves the list of values that are used in the loop but are 5919 // defined outside the loop, such as arguments and constants. 5920 SmallPtrSet<Value *, 8> LoopInvariants; 5921 5922 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5923 for (Instruction &I : BB->instructionsWithoutDebug()) { 5924 IdxToInstr.push_back(&I); 5925 5926 // Save the end location of each USE. 5927 for (Value *U : I.operands()) { 5928 auto *Instr = dyn_cast<Instruction>(U); 5929 5930 // Ignore non-instruction values such as arguments, constants, etc. 5931 if (!Instr) 5932 continue; 5933 5934 // If this instruction is outside the loop then record it and continue. 5935 if (!TheLoop->contains(Instr)) { 5936 LoopInvariants.insert(Instr); 5937 continue; 5938 } 5939 5940 // Overwrite previous end points. 5941 EndPoint[Instr] = IdxToInstr.size(); 5942 Ends.insert(Instr); 5943 } 5944 } 5945 } 5946 5947 // Saves the list of intervals that end with the index in 'key'. 5948 using InstrList = SmallVector<Instruction *, 2>; 5949 DenseMap<unsigned, InstrList> TransposeEnds; 5950 5951 // Transpose the EndPoints to a list of values that end at each index. 5952 for (auto &Interval : EndPoint) 5953 TransposeEnds[Interval.second].push_back(Interval.first); 5954 5955 SmallPtrSet<Instruction *, 8> OpenIntervals; 5956 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5957 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5958 5959 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5960 5961 auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned { 5962 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 5963 return 0; 5964 return TTI.getRegUsageForType(VectorType::get(Ty, VF)); 5965 }; 5966 5967 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5968 Instruction *I = IdxToInstr[i]; 5969 5970 // Remove all of the instructions that end at this location. 
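  // A sketch of the interval bookkeeping (hypothetical instructions, purely
  // for illustration): in a chain
  //   %a = load ...
  //   %b = add %a, 1
  //   %c = mul %a, %b
  // both %a and %b are still open when %c is visited, so two values of the
  // corresponding register class are counted as live at that point; the
  // per-class maximum of these counts over the whole scan is what ends up in
  // MaxLocalUsers.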
5971 InstrList &List = TransposeEnds[i]; 5972 for (Instruction *ToRemove : List) 5973 OpenIntervals.erase(ToRemove); 5974 5975 // Ignore instructions that are never used within the loop. 5976 if (!Ends.count(I)) 5977 continue; 5978 5979 // Skip ignored values. 5980 if (ValuesToIgnore.count(I)) 5981 continue; 5982 5983 // For each VF find the maximum usage of registers. 5984 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5985 // Count the number of live intervals. 5986 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5987 5988 if (VFs[j].isScalar()) { 5989 for (auto Inst : OpenIntervals) { 5990 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5991 if (RegUsage.find(ClassID) == RegUsage.end()) 5992 RegUsage[ClassID] = 1; 5993 else 5994 RegUsage[ClassID] += 1; 5995 } 5996 } else { 5997 collectUniformsAndScalars(VFs[j]); 5998 for (auto Inst : OpenIntervals) { 5999 // Skip ignored values for VF > 1. 6000 if (VecValuesToIgnore.count(Inst)) 6001 continue; 6002 if (isScalarAfterVectorization(Inst, VFs[j])) { 6003 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6004 if (RegUsage.find(ClassID) == RegUsage.end()) 6005 RegUsage[ClassID] = 1; 6006 else 6007 RegUsage[ClassID] += 1; 6008 } else { 6009 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6010 if (RegUsage.find(ClassID) == RegUsage.end()) 6011 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6012 else 6013 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6014 } 6015 } 6016 } 6017 6018 for (auto& pair : RegUsage) { 6019 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6020 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6021 else 6022 MaxUsages[j][pair.first] = pair.second; 6023 } 6024 } 6025 6026 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6027 << OpenIntervals.size() << '\n'); 6028 6029 // Add the current instruction to the list of open intervals. 6030 OpenIntervals.insert(I); 6031 } 6032 6033 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6034 SmallMapVector<unsigned, unsigned, 4> Invariant; 6035 6036 for (auto Inst : LoopInvariants) { 6037 unsigned Usage = 6038 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6039 unsigned ClassID = 6040 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6041 if (Invariant.find(ClassID) == Invariant.end()) 6042 Invariant[ClassID] = Usage; 6043 else 6044 Invariant[ClassID] += Usage; 6045 } 6046 6047 LLVM_DEBUG({ 6048 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6049 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6050 << " item\n"; 6051 for (const auto &pair : MaxUsages[i]) { 6052 dbgs() << "LV(REG): RegisterClass: " 6053 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6054 << " registers\n"; 6055 } 6056 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6057 << " item\n"; 6058 for (const auto &pair : Invariant) { 6059 dbgs() << "LV(REG): RegisterClass: " 6060 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6061 << " registers\n"; 6062 } 6063 }); 6064 6065 RU.LoopInvariantRegs = Invariant; 6066 RU.MaxLocalUsers = MaxUsages[i]; 6067 RUs[i] = RU; 6068 } 6069 6070 return RUs; 6071 } 6072 6073 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 6074 ElementCount VF) { 6075 // TODO: Cost model for emulated masked load/store is completely 6076 // broken. 
// This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where the previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // A limited amount of Masked Store/Scatter emulation was allowed.
  assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}

void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF.isScalar() || VF.isZero() ||
      InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredicationForAnyReason(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I, VF)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount if scalable, because that would lead to
        // invalid scalarization costs.
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
        if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}

int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
6143 // Although not strictly necessary, we give up on instructions we know will 6144 // already be scalar to avoid traversing chains that are unlikely to be 6145 // beneficial. 6146 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6147 isScalarAfterVectorization(I, VF)) 6148 return false; 6149 6150 // If the instruction is scalar with predication, it will be analyzed 6151 // separately. We ignore it within the context of PredInst. 6152 if (isScalarWithPredication(I, VF)) 6153 return false; 6154 6155 // If any of the instruction's operands are uniform after vectorization, 6156 // the instruction cannot be scalarized. This prevents, for example, a 6157 // masked load from being scalarized. 6158 // 6159 // We assume we will only emit a value for lane zero of an instruction 6160 // marked uniform after vectorization, rather than VF identical values. 6161 // Thus, if we scalarize an instruction that uses a uniform, we would 6162 // create uses of values corresponding to the lanes we aren't emitting code 6163 // for. This behavior can be changed by allowing getScalarValue to clone 6164 // the lane zero values for uniforms rather than asserting. 6165 for (Use &U : I->operands()) 6166 if (auto *J = dyn_cast<Instruction>(U.get())) 6167 if (isUniformAfterVectorization(J, VF)) 6168 return false; 6169 6170 // Otherwise, we can scalarize the instruction. 6171 return true; 6172 }; 6173 6174 // Compute the expected cost discount from scalarizing the entire expression 6175 // feeding the predicated instruction. We currently only consider expressions 6176 // that are single-use instruction chains. 6177 Worklist.push_back(PredInst); 6178 while (!Worklist.empty()) { 6179 Instruction *I = Worklist.pop_back_val(); 6180 6181 // If we've already analyzed the instruction, there's nothing to do. 6182 if (ScalarCosts.find(I) != ScalarCosts.end()) 6183 continue; 6184 6185 // Compute the cost of the vector instruction. Note that this cost already 6186 // includes the scalarization overhead of the predicated instruction. 6187 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6188 6189 // Compute the cost of the scalarized instruction. This cost is the cost of 6190 // the instruction as if it wasn't if-converted and instead remained in the 6191 // predicated block. We will scale this cost by block probability after 6192 // computing the scalarization overhead. 6193 InstructionCost ScalarCost = 6194 VF.getFixedValue() * 6195 getInstructionCost(I, ElementCount::getFixed(1)).first; 6196 6197 // Compute the scalarization overhead of needed insertelement instructions 6198 // and phi nodes. 6199 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6200 ScalarCost += TTI.getScalarizationOverhead( 6201 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6202 APInt::getAllOnes(VF.getFixedValue()), true, false); 6203 ScalarCost += 6204 VF.getFixedValue() * 6205 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6206 } 6207 6208 // Compute the scalarization overhead of needed extractelement 6209 // instructions. For each of the instruction's operands, if the operand can 6210 // be scalarized, add it to the worklist; otherwise, account for the 6211 // overhead. 
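    // Worked example of the overall discount computation (all costs are
    // hypothetical): if the vector form of a predicated instruction costs 12,
    // its scalar form costs 4, VF is 4 and the reciprocal block probability is
    // 2, then, ignoring insert/extract overhead, ScalarCost = 4 * 4 = 16,
    // scaled to 16 / 2 = 8, and the discount grows by 12 - 8 = 4. A final
    // non-negative discount makes collectInstsToScalarize record the chain for
    // scalarization.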
6212 for (Use &U : I->operands()) 6213 if (auto *J = dyn_cast<Instruction>(U.get())) { 6214 assert(VectorType::isValidElementType(J->getType()) && 6215 "Instruction has non-scalar type"); 6216 if (canBeScalarized(J)) 6217 Worklist.push_back(J); 6218 else if (needsExtract(J, VF)) { 6219 ScalarCost += TTI.getScalarizationOverhead( 6220 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6221 APInt::getAllOnes(VF.getFixedValue()), false, true); 6222 } 6223 } 6224 6225 // Scale the total scalar cost by block probability. 6226 ScalarCost /= getReciprocalPredBlockProb(); 6227 6228 // Compute the discount. A non-negative discount means the vector version 6229 // of the instruction costs more, and scalarizing would be beneficial. 6230 Discount += VectorCost - ScalarCost; 6231 ScalarCosts[I] = ScalarCost; 6232 } 6233 6234 return *Discount.getValue(); 6235 } 6236 6237 LoopVectorizationCostModel::VectorizationCostTy 6238 LoopVectorizationCostModel::expectedCost( 6239 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6240 VectorizationCostTy Cost; 6241 6242 // For each block. 6243 for (BasicBlock *BB : TheLoop->blocks()) { 6244 VectorizationCostTy BlockCost; 6245 6246 // For each instruction in the old loop. 6247 for (Instruction &I : BB->instructionsWithoutDebug()) { 6248 // Skip ignored values. 6249 if (ValuesToIgnore.count(&I) || 6250 (VF.isVector() && VecValuesToIgnore.count(&I))) 6251 continue; 6252 6253 VectorizationCostTy C = getInstructionCost(&I, VF); 6254 6255 // Check if we should override the cost. 6256 if (C.first.isValid() && 6257 ForceTargetInstructionCost.getNumOccurrences() > 0) 6258 C.first = InstructionCost(ForceTargetInstructionCost); 6259 6260 // Keep a list of instructions with invalid costs. 6261 if (Invalid && !C.first.isValid()) 6262 Invalid->emplace_back(&I, VF); 6263 6264 BlockCost.first += C.first; 6265 BlockCost.second |= C.second; 6266 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6267 << " for VF " << VF << " For instruction: " << I 6268 << '\n'); 6269 } 6270 6271 // If we are vectorizing a predicated block, it will have been 6272 // if-converted. This means that the block's instructions (aside from 6273 // stores and instructions that may divide by zero) will now be 6274 // unconditionally executed. For the scalar case, we may not always execute 6275 // the predicated block, if it is an if-else block. Thus, scale the block's 6276 // cost by the probability of executing it. blockNeedsPredication from 6277 // Legal is used so as to not include all blocks in tail folded loops. 6278 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6279 BlockCost.first /= getReciprocalPredBlockProb(); 6280 6281 Cost.first += BlockCost.first; 6282 Cost.second |= BlockCost.second; 6283 } 6284 6285 return Cost; 6286 } 6287 6288 /// Gets Address Access SCEV after verifying that the access pattern 6289 /// is loop invariant except the induction variable dependence. 6290 /// 6291 /// This SCEV can be sent to the Target in order to estimate the address 6292 /// calculation cost. 6293 static const SCEV *getAddressAccessSCEV( 6294 Value *Ptr, 6295 LoopVectorizationLegality *Legal, 6296 PredicatedScalarEvolution &PSE, 6297 const Loop *TheLoop) { 6298 6299 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6300 if (!Gep) 6301 return nullptr; 6302 6303 // We are looking for a gep with all loop invariant indices except for one 6304 // which should be an induction variable. 
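  // For example (illustrative IR), a GEP such as
  //   %addr = getelementptr [100 x [100 x i32]], [100 x [100 x i32]]* %A,
  //                         i64 0, i64 %inv, i64 %iv
  // qualifies when %inv is loop-invariant and %iv is an induction variable,
  // whereas a GEP with a loop-varying, non-induction index does not.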
6305 auto SE = PSE.getSE(); 6306 unsigned NumOperands = Gep->getNumOperands(); 6307 for (unsigned i = 1; i < NumOperands; ++i) { 6308 Value *Opd = Gep->getOperand(i); 6309 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6310 !Legal->isInductionVariable(Opd)) 6311 return nullptr; 6312 } 6313 6314 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6315 return PSE.getSCEV(Ptr); 6316 } 6317 6318 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6319 return Legal->hasStride(I->getOperand(0)) || 6320 Legal->hasStride(I->getOperand(1)); 6321 } 6322 6323 InstructionCost 6324 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6325 ElementCount VF) { 6326 assert(VF.isVector() && 6327 "Scalarization cost of instruction implies vectorization."); 6328 if (VF.isScalable()) 6329 return InstructionCost::getInvalid(); 6330 6331 Type *ValTy = getLoadStoreType(I); 6332 auto SE = PSE.getSE(); 6333 6334 unsigned AS = getLoadStoreAddressSpace(I); 6335 Value *Ptr = getLoadStorePointerOperand(I); 6336 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6337 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6338 // that it is being called from this specific place. 6339 6340 // Figure out whether the access is strided and get the stride value 6341 // if it's known in compile time 6342 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6343 6344 // Get the cost of the scalar memory instruction and address computation. 6345 InstructionCost Cost = 6346 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6347 6348 // Don't pass *I here, since it is scalar but will actually be part of a 6349 // vectorized loop where the user of it is a vectorized instruction. 6350 const Align Alignment = getLoadStoreAlignment(I); 6351 Cost += VF.getKnownMinValue() * 6352 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6353 AS, TTI::TCK_RecipThroughput); 6354 6355 // Get the overhead of the extractelement and insertelement instructions 6356 // we might create due to scalarization. 6357 Cost += getScalarizationOverhead(I, VF); 6358 6359 // If we have a predicated load/store, it will need extra i1 extracts and 6360 // conditional branches, but may not be executed for each vector lane. Scale 6361 // the cost by the probability of executing the predicated block. 6362 if (isPredicatedInst(I, VF)) { 6363 Cost /= getReciprocalPredBlockProb(); 6364 6365 // Add the cost of an i1 extract and a branch 6366 auto *Vec_i1Ty = 6367 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6368 Cost += TTI.getScalarizationOverhead( 6369 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6370 /*Insert=*/false, /*Extract=*/true); 6371 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6372 6373 if (useEmulatedMaskMemRefHack(I, VF)) 6374 // Artificially setting to a high enough value to practically disable 6375 // vectorization with such operations. 
6376 Cost = 3000000; 6377 } 6378 6379 return Cost; 6380 } 6381 6382 InstructionCost 6383 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6384 ElementCount VF) { 6385 Type *ValTy = getLoadStoreType(I); 6386 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6387 Value *Ptr = getLoadStorePointerOperand(I); 6388 unsigned AS = getLoadStoreAddressSpace(I); 6389 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6390 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6391 6392 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6393 "Stride should be 1 or -1 for consecutive memory access"); 6394 const Align Alignment = getLoadStoreAlignment(I); 6395 InstructionCost Cost = 0; 6396 if (Legal->isMaskRequired(I)) 6397 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6398 CostKind); 6399 else 6400 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6401 CostKind, I); 6402 6403 bool Reverse = ConsecutiveStride < 0; 6404 if (Reverse) 6405 Cost += 6406 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6407 return Cost; 6408 } 6409 6410 InstructionCost 6411 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6412 ElementCount VF) { 6413 assert(Legal->isUniformMemOp(*I)); 6414 6415 Type *ValTy = getLoadStoreType(I); 6416 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6417 const Align Alignment = getLoadStoreAlignment(I); 6418 unsigned AS = getLoadStoreAddressSpace(I); 6419 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6420 if (isa<LoadInst>(I)) { 6421 return TTI.getAddressComputationCost(ValTy) + 6422 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6423 CostKind) + 6424 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6425 } 6426 StoreInst *SI = cast<StoreInst>(I); 6427 6428 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6429 return TTI.getAddressComputationCost(ValTy) + 6430 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6431 CostKind) + 6432 (isLoopInvariantStoreValue 6433 ? 0 6434 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6435 VF.getKnownMinValue() - 1)); 6436 } 6437 6438 InstructionCost 6439 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6440 ElementCount VF) { 6441 Type *ValTy = getLoadStoreType(I); 6442 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6443 const Align Alignment = getLoadStoreAlignment(I); 6444 const Value *Ptr = getLoadStorePointerOperand(I); 6445 6446 return TTI.getAddressComputationCost(VectorTy) + 6447 TTI.getGatherScatterOpCost( 6448 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6449 TargetTransformInfo::TCK_RecipThroughput, I); 6450 } 6451 6452 InstructionCost 6453 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6454 ElementCount VF) { 6455 // TODO: Once we have support for interleaving with scalable vectors 6456 // we can calculate the cost properly here. 
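  // For illustration: a fully-populated load group of factor 2 (e.g. accesses
  // to A[2*i] and A[2*i+1]) costed at VF=4 is modelled as one wide load of
  // 8 elements plus the shuffles needed to split out the two members, all
  // folded into a single TTI::getInterleavedMemoryOpCost query on the wide
  // vector type built below.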
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  unsigned InterleaveFactor = Group->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (Group->getMember(IF))
      Indices.push_back(IF);

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
      AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    Cost +=
        Group->getNumMembers() *
        TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
  }
  return Cost;
}

Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
    Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
  using namespace llvm::PatternMatch;
  // Early exit if there are no in-loop reductions.
  if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
    return None;
  auto *VectorTy = cast<VectorType>(Ty);

  // We are looking for a pattern of, and finding the minimal acceptable cost:
  //   reduce(mul(ext(A), ext(B))) or
  //   reduce(mul(A, B)) or
  //   reduce(ext(A)) or
  //   reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower, we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not, we return an invalid cost specifying that the original cost
  // method should be used.
  Instruction *RetI = I;
  if (match(RetI, m_ZExtOrSExt(m_Value()))) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }
  if (match(RetI, m_Mul(m_Value(), m_Value())) &&
      RetI->user_back()->getOpcode() == Instruction::Add) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }

  // Test if the found instruction is a reduction, and if not return an invalid
  // cost specifying the parent to use the original cost modelling.
  if (!InLoopReductionImmediateChains.count(RetI))
    return None;

  // Find the reduction this chain is a part of and calculate the basic cost of
  // the reduction on its own.
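  // As an illustration of the kind of chain matched here (hypothetical IR for
  // an in-loop add reduction):
  //   %a.ext = sext i16 %a to i32
  //   %b.ext = sext i16 %b to i32
  //   %mul = mul i32 %a.ext, %b.ext
  //   %sum = add i32 %sum.phi, %mul
  // If TTI reports a cheaper extended multiply-add reduction for the pattern,
  // the whole chain is costed as that reduction and the mul/ext members are
  // given a cost of 0.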
6535 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6536 Instruction *ReductionPhi = LastChain; 6537 while (!isa<PHINode>(ReductionPhi)) 6538 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6539 6540 const RecurrenceDescriptor &RdxDesc = 6541 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6542 6543 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6544 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6545 6546 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6547 // normal fmul instruction to the cost of the fadd reduction. 6548 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6549 BaseCost += 6550 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6551 6552 // If we're using ordered reductions then we can just return the base cost 6553 // here, since getArithmeticReductionCost calculates the full ordered 6554 // reduction cost when FP reassociation is not allowed. 6555 if (useOrderedReductions(RdxDesc)) 6556 return BaseCost; 6557 6558 // Get the operand that was not the reduction chain and match it to one of the 6559 // patterns, returning the better cost if it is found. 6560 Instruction *RedOp = RetI->getOperand(1) == LastChain 6561 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6562 : dyn_cast<Instruction>(RetI->getOperand(1)); 6563 6564 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6565 6566 Instruction *Op0, *Op1; 6567 if (RedOp && 6568 match(RedOp, 6569 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6570 match(Op0, m_ZExtOrSExt(m_Value())) && 6571 Op0->getOpcode() == Op1->getOpcode() && 6572 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6573 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6574 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6575 6576 // Matched reduce(ext(mul(ext(A), ext(B))) 6577 // Note that the extend opcodes need to all match, or if A==B they will have 6578 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6579 // which is equally fine. 6580 bool IsUnsigned = isa<ZExtInst>(Op0); 6581 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6582 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6583 6584 InstructionCost ExtCost = 6585 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6586 TTI::CastContextHint::None, CostKind, Op0); 6587 InstructionCost MulCost = 6588 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6589 InstructionCost Ext2Cost = 6590 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6591 TTI::CastContextHint::None, CostKind, RedOp); 6592 6593 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6594 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6595 CostKind); 6596 6597 if (RedCost.isValid() && 6598 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6599 return I == RetI ? 
RedCost : 0; 6600 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6601 !TheLoop->isLoopInvariant(RedOp)) { 6602 // Matched reduce(ext(A)) 6603 bool IsUnsigned = isa<ZExtInst>(RedOp); 6604 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6605 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6606 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6607 CostKind); 6608 6609 InstructionCost ExtCost = 6610 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6611 TTI::CastContextHint::None, CostKind, RedOp); 6612 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6613 return I == RetI ? RedCost : 0; 6614 } else if (RedOp && 6615 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6616 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6617 Op0->getOpcode() == Op1->getOpcode() && 6618 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6619 bool IsUnsigned = isa<ZExtInst>(Op0); 6620 Type *Op0Ty = Op0->getOperand(0)->getType(); 6621 Type *Op1Ty = Op1->getOperand(0)->getType(); 6622 Type *LargestOpTy = 6623 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6624 : Op0Ty; 6625 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6626 6627 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 6628 // different sizes. We take the largest type as the ext to reduce, and add 6629 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6630 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6631 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6632 TTI::CastContextHint::None, CostKind, Op0); 6633 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6634 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6635 TTI::CastContextHint::None, CostKind, Op1); 6636 InstructionCost MulCost = 6637 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6638 6639 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6640 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6641 CostKind); 6642 InstructionCost ExtraExtCost = 0; 6643 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6644 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6645 ExtraExtCost = TTI.getCastInstrCost( 6646 ExtraExtOp->getOpcode(), ExtType, 6647 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6648 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6649 } 6650 6651 if (RedCost.isValid() && 6652 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6653 return I == RetI ? RedCost : 0; 6654 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6655 // Matched reduce(mul()) 6656 InstructionCost MulCost = 6657 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6658 6659 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6660 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 6661 CostKind); 6662 6663 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6664 return I == RetI ? RedCost : 0; 6665 } 6666 } 6667 6668 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 6669 } 6670 6671 InstructionCost 6672 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6673 ElementCount VF) { 6674 // Calculate scalar cost only. Vectorization cost should be ready at this 6675 // moment. 
6676 if (VF.isScalar()) { 6677 Type *ValTy = getLoadStoreType(I); 6678 const Align Alignment = getLoadStoreAlignment(I); 6679 unsigned AS = getLoadStoreAddressSpace(I); 6680 6681 return TTI.getAddressComputationCost(ValTy) + 6682 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6683 TTI::TCK_RecipThroughput, I); 6684 } 6685 return getWideningCost(I, VF); 6686 } 6687 6688 LoopVectorizationCostModel::VectorizationCostTy 6689 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6690 ElementCount VF) { 6691 // If we know that this instruction will remain uniform, check the cost of 6692 // the scalar version. 6693 if (isUniformAfterVectorization(I, VF)) 6694 VF = ElementCount::getFixed(1); 6695 6696 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6697 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6698 6699 // Forced scalars do not have any scalarization overhead. 6700 auto ForcedScalar = ForcedScalars.find(VF); 6701 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6702 auto InstSet = ForcedScalar->second; 6703 if (InstSet.count(I)) 6704 return VectorizationCostTy( 6705 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6706 VF.getKnownMinValue()), 6707 false); 6708 } 6709 6710 Type *VectorTy; 6711 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6712 6713 bool TypeNotScalarized = false; 6714 if (VF.isVector() && VectorTy->isVectorTy()) { 6715 unsigned NumParts = TTI.getNumberOfParts(VectorTy); 6716 if (NumParts) 6717 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 6718 else 6719 C = InstructionCost::getInvalid(); 6720 } 6721 return VectorizationCostTy(C, TypeNotScalarized); 6722 } 6723 6724 InstructionCost 6725 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6726 ElementCount VF) const { 6727 6728 // There is no mechanism yet to create a scalable scalarization loop, 6729 // so this is currently Invalid. 6730 if (VF.isScalable()) 6731 return InstructionCost::getInvalid(); 6732 6733 if (VF.isScalar()) 6734 return 0; 6735 6736 InstructionCost Cost = 0; 6737 Type *RetTy = ToVectorTy(I->getType(), VF); 6738 if (!RetTy->isVoidTy() && 6739 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6740 Cost += TTI.getScalarizationOverhead( 6741 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 6742 false); 6743 6744 // Some targets keep addresses scalar. 6745 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6746 return Cost; 6747 6748 // Some targets support efficient element stores. 6749 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6750 return Cost; 6751 6752 // Collect operands to consider. 6753 CallInst *CI = dyn_cast<CallInst>(I); 6754 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 6755 6756 // Skip operands that do not require extraction/scalarization and do not incur 6757 // any overhead. 6758 SmallVector<Type *> Tys; 6759 for (auto *V : filterExtractingOperands(Ops, VF)) 6760 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 6761 return Cost + TTI.getOperandsScalarizationOverhead( 6762 filterExtractingOperands(Ops, VF), Tys); 6763 } 6764 6765 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6766 if (VF.isScalar()) 6767 return; 6768 NumPredStores = 0; 6769 for (BasicBlock *BB : TheLoop->blocks()) { 6770 // For each instruction in the old loop. 
6771 for (Instruction &I : *BB) { 6772 Value *Ptr = getLoadStorePointerOperand(&I); 6773 if (!Ptr) 6774 continue; 6775 6776 // TODO: We should generate better code and update the cost model for 6777 // predicated uniform stores. Today they are treated as any other 6778 // predicated store (see added test cases in 6779 // invariant-store-vectorization.ll). 6780 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 6781 NumPredStores++; 6782 6783 if (Legal->isUniformMemOp(I)) { 6784 // TODO: Avoid replicating loads and stores instead of 6785 // relying on instcombine to remove them. 6786 // Load: Scalar load + broadcast 6787 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6788 InstructionCost Cost; 6789 if (isa<StoreInst>(&I) && VF.isScalable() && 6790 isLegalGatherOrScatter(&I, VF)) { 6791 Cost = getGatherScatterCost(&I, VF); 6792 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 6793 } else { 6794 Cost = getUniformMemOpCost(&I, VF); 6795 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6796 } 6797 continue; 6798 } 6799 6800 // We assume that widening is the best solution when possible. 6801 if (memoryInstructionCanBeWidened(&I, VF)) { 6802 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6803 int ConsecutiveStride = Legal->isConsecutivePtr( 6804 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 6805 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6806 "Expected consecutive stride."); 6807 InstWidening Decision = 6808 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6809 setWideningDecision(&I, VF, Decision, Cost); 6810 continue; 6811 } 6812 6813 // Choose between Interleaving, Gather/Scatter or Scalarization. 6814 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 6815 unsigned NumAccesses = 1; 6816 if (isAccessInterleaved(&I)) { 6817 auto Group = getInterleavedAccessGroup(&I); 6818 assert(Group && "Fail to get an interleaved access group."); 6819 6820 // Make one decision for the whole group. 6821 if (getWideningDecision(&I, VF) != CM_Unknown) 6822 continue; 6823 6824 NumAccesses = Group->getNumMembers(); 6825 if (interleavedAccessCanBeWidened(&I, VF)) 6826 InterleaveCost = getInterleaveGroupCost(&I, VF); 6827 } 6828 6829 InstructionCost GatherScatterCost = 6830 isLegalGatherOrScatter(&I, VF) 6831 ? getGatherScatterCost(&I, VF) * NumAccesses 6832 : InstructionCost::getInvalid(); 6833 6834 InstructionCost ScalarizationCost = 6835 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6836 6837 // Choose better solution for the current VF, 6838 // write down this decision and use it during vectorization. 6839 InstructionCost Cost; 6840 InstWidening Decision; 6841 if (InterleaveCost <= GatherScatterCost && 6842 InterleaveCost < ScalarizationCost) { 6843 Decision = CM_Interleave; 6844 Cost = InterleaveCost; 6845 } else if (GatherScatterCost < ScalarizationCost) { 6846 Decision = CM_GatherScatter; 6847 Cost = GatherScatterCost; 6848 } else { 6849 Decision = CM_Scalarize; 6850 Cost = ScalarizationCost; 6851 } 6852 // If the instructions belongs to an interleave group, the whole group 6853 // receives the same decision. The whole group receives the cost, but 6854 // the cost will actually be assigned to one instruction. 
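      // Illustrative example: accesses to A[2*i] and A[2*i+1] form an
      // interleave group of factor 2; both members get the decision chosen
      // here, while the group's cost is recorded on a single member.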
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  append_range(Worklist, AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, CM_Scalarize,
            (VF.getKnownMinValue() *
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(
                Member, VF, CM_Scalarize,
                (VF.getKnownMinValue() *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}

InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
                                               Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  auto SE = PSE.getSE();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
                                                ElementCount VF) -> bool {
    if (VF.isScalar())
      return true;

    auto Scalarized = InstsToScalarize.find(VF);
    assert(Scalarized != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return !Scalarized->second.count(I) &&
           llvm::all_of(I->users(), [&](User *U) {
             auto *UI = cast<Instruction>(U);
             return !Scalarized->second.count(UI);
           });
  };
  (void) hasSingleCopyAfterVectorization;

  if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // it means we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           hasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = ToVectorTy(RetTy, VF);

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF.isVector() && BI->isConditional() &&
        (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
         PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // It is not possible to scalarize a scalable vector with predicated
      // instructions.
      if (VF.isScalable())
        return InstructionCost::getInvalid();
      // Return cost for branches around scalarized and predicated blocks.
      auto *Vec_i1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (
          TTI.getScalarizationOverhead(
              Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
          (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br, CostKind);
    else
      // This branch will be eliminated by if-conversion.
6998 return 0; 6999 // Note: We currently assume zero cost for an unconditional branch inside 7000 // a predicated block since it will become a fall-through, although we 7001 // may decide in the future to call TTI for all branches. 7002 } 7003 case Instruction::PHI: { 7004 auto *Phi = cast<PHINode>(I); 7005 7006 // First-order recurrences are replaced by vector shuffles inside the loop. 7007 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7008 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7009 return TTI.getShuffleCost( 7010 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7011 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7012 7013 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7014 // converted into select instructions. We require N - 1 selects per phi 7015 // node, where N is the number of incoming values. 7016 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7017 return (Phi->getNumIncomingValues() - 1) * 7018 TTI.getCmpSelInstrCost( 7019 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7020 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7021 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7022 7023 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7024 } 7025 case Instruction::UDiv: 7026 case Instruction::SDiv: 7027 case Instruction::URem: 7028 case Instruction::SRem: 7029 // If we have a predicated instruction, it may not be executed for each 7030 // vector lane. Get the scalarization cost and scale this amount by the 7031 // probability of executing the predicated block. If the instruction is not 7032 // predicated, we fall through to the next case. 7033 if (VF.isVector() && isScalarWithPredication(I, VF)) { 7034 InstructionCost Cost = 0; 7035 7036 // These instructions have a non-void type, so account for the phi nodes 7037 // that we will create. This cost is likely to be zero. The phi node 7038 // cost, if any, should be scaled by the block probability because it 7039 // models a copy at the end of each predicated block. 7040 Cost += VF.getKnownMinValue() * 7041 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7042 7043 // The cost of the non-predicated instruction. 7044 Cost += VF.getKnownMinValue() * 7045 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7046 7047 // The cost of insertelement and extractelement instructions needed for 7048 // scalarization. 7049 Cost += getScalarizationOverhead(I, VF); 7050 7051 // Scale the cost by the probability of executing the predicated blocks. 7052 // This assumes the predicated block for each vector lane is equally 7053 // likely. 7054 return Cost / getReciprocalPredBlockProb(); 7055 } 7056 LLVM_FALLTHROUGH; 7057 case Instruction::Add: 7058 case Instruction::FAdd: 7059 case Instruction::Sub: 7060 case Instruction::FSub: 7061 case Instruction::Mul: 7062 case Instruction::FMul: 7063 case Instruction::FDiv: 7064 case Instruction::FRem: 7065 case Instruction::Shl: 7066 case Instruction::LShr: 7067 case Instruction::AShr: 7068 case Instruction::And: 7069 case Instruction::Or: 7070 case Instruction::Xor: { 7071 // Since we will replace the stride by 1 the multiplication should go away. 
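    // Illustrative example: if the loop is versioned on a symbolic stride
    // %Stride == 1, an index computation such as 'mul i64 %i, %Stride'
    // simplifies to %i in the vectorized loop, so it is costed as free here.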
7072 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7073 return 0; 7074 7075 // Detect reduction patterns 7076 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7077 return *RedCost; 7078 7079 // Certain instructions can be cheaper to vectorize if they have a constant 7080 // second vector operand. One example of this are shifts on x86. 7081 Value *Op2 = I->getOperand(1); 7082 TargetTransformInfo::OperandValueProperties Op2VP; 7083 TargetTransformInfo::OperandValueKind Op2VK = 7084 TTI.getOperandInfo(Op2, Op2VP); 7085 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7086 Op2VK = TargetTransformInfo::OK_UniformValue; 7087 7088 SmallVector<const Value *, 4> Operands(I->operand_values()); 7089 return TTI.getArithmeticInstrCost( 7090 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7091 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7092 } 7093 case Instruction::FNeg: { 7094 return TTI.getArithmeticInstrCost( 7095 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7096 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7097 TargetTransformInfo::OP_None, I->getOperand(0), I); 7098 } 7099 case Instruction::Select: { 7100 SelectInst *SI = cast<SelectInst>(I); 7101 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7102 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7103 7104 const Value *Op0, *Op1; 7105 using namespace llvm::PatternMatch; 7106 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7107 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7108 // select x, y, false --> x & y 7109 // select x, true, y --> x | y 7110 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7111 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7112 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7113 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7114 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7115 Op1->getType()->getScalarSizeInBits() == 1); 7116 7117 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7118 return TTI.getArithmeticInstrCost( 7119 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7120 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7121 } 7122 7123 Type *CondTy = SI->getCondition()->getType(); 7124 if (!ScalarCond) 7125 CondTy = VectorType::get(CondTy, VF); 7126 7127 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7128 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7129 Pred = Cmp->getPredicate(); 7130 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7131 CostKind, I); 7132 } 7133 case Instruction::ICmp: 7134 case Instruction::FCmp: { 7135 Type *ValTy = I->getOperand(0)->getType(); 7136 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7137 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7138 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7139 VectorTy = ToVectorTy(ValTy, VF); 7140 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7141 cast<CmpInst>(I)->getPredicate(), CostKind, 7142 I); 7143 } 7144 case Instruction::Store: 7145 case Instruction::Load: { 7146 ElementCount Width = VF; 7147 if (Width.isVector()) { 7148 InstWidening Decision = getWideningDecision(I, Width); 7149 assert(Decision != CM_Unknown && 7150 "CM decision should be taken at this point"); 7151 if (Decision == CM_Scalarize) { 7152 if (VF.isScalable() && isa<StoreInst>(I)) 7153 // We can't scalarize a scalable vector store (even a uniform one 7154 // currently), return an invalid cost so as to prevent vectorization. 7155 return InstructionCost::getInvalid(); 7156 Width = ElementCount::getFixed(1); 7157 } 7158 } 7159 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7160 return getMemoryInstructionCost(I, VF); 7161 } 7162 case Instruction::BitCast: 7163 if (I->getType()->isPointerTy()) 7164 return 0; 7165 LLVM_FALLTHROUGH; 7166 case Instruction::ZExt: 7167 case Instruction::SExt: 7168 case Instruction::FPToUI: 7169 case Instruction::FPToSI: 7170 case Instruction::FPExt: 7171 case Instruction::PtrToInt: 7172 case Instruction::IntToPtr: 7173 case Instruction::SIToFP: 7174 case Instruction::UIToFP: 7175 case Instruction::Trunc: 7176 case Instruction::FPTrunc: { 7177 // Computes the CastContextHint from a Load/Store instruction. 7178 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7179 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7180 "Expected a load or a store!"); 7181 7182 if (VF.isScalar() || !TheLoop->contains(I)) 7183 return TTI::CastContextHint::Normal; 7184 7185 switch (getWideningDecision(I, VF)) { 7186 case LoopVectorizationCostModel::CM_GatherScatter: 7187 return TTI::CastContextHint::GatherScatter; 7188 case LoopVectorizationCostModel::CM_Interleave: 7189 return TTI::CastContextHint::Interleave; 7190 case LoopVectorizationCostModel::CM_Scalarize: 7191 case LoopVectorizationCostModel::CM_Widen: 7192 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7193 : TTI::CastContextHint::Normal; 7194 case LoopVectorizationCostModel::CM_Widen_Reverse: 7195 return TTI::CastContextHint::Reversed; 7196 case LoopVectorizationCostModel::CM_Unknown: 7197 llvm_unreachable("Instr did not go through cost modelling?"); 7198 } 7199 7200 llvm_unreachable("Unhandled case!"); 7201 }; 7202 7203 unsigned Opcode = I->getOpcode(); 7204 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7205 // For Trunc, the context is the only user, which must be a StoreInst. 
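    // Illustrative example:
    //   %t = trunc i32 %x to i16
    //   store i16 %t, i16* %p
    // takes its cast-context hint from the widening decision of the store,
    // while
    //   %l = load i8, i8* %q
    //   %e = zext i8 %l to i32
    // takes it from the widening decision of the load.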
7206 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7207 if (I->hasOneUse()) 7208 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7209 CCH = ComputeCCH(Store); 7210 } 7211 // For Z/Sext, the context is the operand, which must be a LoadInst. 7212 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7213 Opcode == Instruction::FPExt) { 7214 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7215 CCH = ComputeCCH(Load); 7216 } 7217 7218 // We optimize the truncation of induction variables having constant 7219 // integer steps. The cost of these truncations is the same as the scalar 7220 // operation. 7221 if (isOptimizableIVTruncate(I, VF)) { 7222 auto *Trunc = cast<TruncInst>(I); 7223 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7224 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7225 } 7226 7227 // Detect reduction patterns 7228 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7229 return *RedCost; 7230 7231 Type *SrcScalarTy = I->getOperand(0)->getType(); 7232 Type *SrcVecTy = 7233 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7234 if (canTruncateToMinimalBitwidth(I, VF)) { 7235 // This cast is going to be shrunk. This may remove the cast or it might 7236 // turn it into slightly different cast. For example, if MinBW == 16, 7237 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7238 // 7239 // Calculate the modified src and dest types. 7240 Type *MinVecTy = VectorTy; 7241 if (Opcode == Instruction::Trunc) { 7242 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7243 VectorTy = 7244 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7245 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7246 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7247 VectorTy = 7248 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7249 } 7250 } 7251 7252 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7253 } 7254 case Instruction::Call: { 7255 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7256 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7257 return *RedCost; 7258 bool NeedToScalarize; 7259 CallInst *CI = cast<CallInst>(I); 7260 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7261 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7262 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7263 return std::min(CallCost, IntrinsicCost); 7264 } 7265 return CallCost; 7266 } 7267 case Instruction::ExtractValue: 7268 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7269 case Instruction::Alloca: 7270 // We cannot easily widen alloca to a scalable alloca, as 7271 // the result would need to be a vector of pointers. 7272 if (VF.isScalable()) 7273 return InstructionCost::getInvalid(); 7274 LLVM_FALLTHROUGH; 7275 default: 7276 // This opcode is unknown. Assume that it is the same as 'mul'. 7277 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7278 } // end of switch. 
7279 } 7280 7281 char LoopVectorize::ID = 0; 7282 7283 static const char lv_name[] = "Loop Vectorization"; 7284 7285 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7286 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7287 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7288 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7289 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7290 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7291 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7292 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7293 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7294 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7295 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7296 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7297 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7298 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7299 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7300 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7301 7302 namespace llvm { 7303 7304 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7305 7306 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7307 bool VectorizeOnlyWhenForced) { 7308 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7309 } 7310 7311 } // end namespace llvm 7312 7313 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7314 // Check if the pointer operand of a load or store instruction is 7315 // consecutive. 7316 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7317 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7318 return false; 7319 } 7320 7321 void LoopVectorizationCostModel::collectValuesToIgnore() { 7322 // Ignore ephemeral values. 7323 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7324 7325 // Find all stores to invariant variables. Since they are going to sink 7326 // outside the loop we do not need calculate cost for them. 7327 for (BasicBlock *BB : TheLoop->blocks()) 7328 for (Instruction &I : *BB) { 7329 StoreInst *SI; 7330 if ((SI = dyn_cast<StoreInst>(&I)) && 7331 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 7332 ValuesToIgnore.insert(&I); 7333 } 7334 7335 // Ignore type-promoting instructions we identified during reduction 7336 // detection. 7337 for (auto &Reduction : Legal->getReductionVars()) { 7338 const RecurrenceDescriptor &RedDes = Reduction.second; 7339 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7340 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7341 } 7342 // Ignore type-casting instructions we identified during induction 7343 // detection. 7344 for (auto &Induction : Legal->getInductionVars()) { 7345 const InductionDescriptor &IndDes = Induction.second; 7346 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7347 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7348 } 7349 } 7350 7351 void LoopVectorizationCostModel::collectInLoopReductions() { 7352 for (auto &Reduction : Legal->getReductionVars()) { 7353 PHINode *Phi = Reduction.first; 7354 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7355 7356 // We don't collect reductions that are type promoted (yet). 7357 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7358 continue; 7359 7360 // If the target would prefer this reduction to happen "in-loop", then we 7361 // want to record it as such. 
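    // Illustrative example: an in-loop integer add reduction keeps a scalar
    // accumulator and performs, per vector iteration, something like
    //   %sum.next = add i32 %sum, <reduce.add of the current lane values>
    // inside the loop, instead of keeping a vector accumulator that is only
    // reduced to a scalar after the loop.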
7362 unsigned Opcode = RdxDesc.getOpcode(); 7363 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7364 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7365 TargetTransformInfo::ReductionFlags())) 7366 continue; 7367 7368 // Check that we can correctly put the reductions into the loop, by 7369 // finding the chain of operations that leads from the phi to the loop 7370 // exit value. 7371 SmallVector<Instruction *, 4> ReductionOperations = 7372 RdxDesc.getReductionOpChain(Phi, TheLoop); 7373 bool InLoop = !ReductionOperations.empty(); 7374 if (InLoop) { 7375 InLoopReductionChains[Phi] = ReductionOperations; 7376 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7377 Instruction *LastChain = Phi; 7378 for (auto *I : ReductionOperations) { 7379 InLoopReductionImmediateChains[I] = LastChain; 7380 LastChain = I; 7381 } 7382 } 7383 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7384 << " reduction for phi: " << *Phi << "\n"); 7385 } 7386 } 7387 7388 // TODO: we could return a pair of values that specify the max VF and 7389 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7390 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7391 // doesn't have a cost model that can choose which plan to execute if 7392 // more than one is generated. 7393 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7394 LoopVectorizationCostModel &CM) { 7395 unsigned WidestType; 7396 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7397 return WidestVectorRegBits / WidestType; 7398 } 7399 7400 VectorizationFactor 7401 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7402 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7403 ElementCount VF = UserVF; 7404 // Outer loop handling: They may require CFG and instruction level 7405 // transformations before even evaluating whether vectorization is profitable. 7406 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7407 // the vectorization pipeline. 7408 if (!OrigLoop->isInnermost()) { 7409 // If the user doesn't provide a vectorization factor, determine a 7410 // reasonable one. 7411 if (UserVF.isZero()) { 7412 VF = ElementCount::getFixed(determineVPlanVF( 7413 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7414 .getFixedSize(), 7415 CM)); 7416 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7417 7418 // Make sure we have a VF > 1 for stress testing. 7419 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7420 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7421 << "overriding computed VF.\n"); 7422 VF = ElementCount::getFixed(4); 7423 } 7424 } 7425 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7426 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7427 "VF needs to be a power of two"); 7428 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7429 << "VF " << VF << " to build VPlans.\n"); 7430 buildVPlans(VF, VF); 7431 7432 // For VPlan build stress testing, we bail out after VPlan construction. 7433 if (VPlanBuildStressTest) 7434 return VectorizationFactor::Disabled(); 7435 7436 return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; 7437 } 7438 7439 LLVM_DEBUG( 7440 dbgs() << "LV: Not vectorizing. 
Inner loops aren't supported in the " 7441 "VPlan-native path.\n"); 7442 return VectorizationFactor::Disabled(); 7443 } 7444 7445 bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const { 7446 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7447 return (NumRuntimePointerChecks > 7448 VectorizerParams::RuntimeMemoryCheckThreshold && 7449 !Hints.allowReordering()) || 7450 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7451 } 7452 7453 Optional<VectorizationFactor> 7454 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7455 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7456 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7457 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7458 return None; 7459 7460 // Invalidate interleave groups if all blocks of loop will be predicated. 7461 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7462 !useMaskedInterleavedAccesses(*TTI)) { 7463 LLVM_DEBUG( 7464 dbgs() 7465 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7466 "which requires masked-interleaved support.\n"); 7467 if (CM.InterleaveInfo.invalidateGroups()) 7468 // Invalidating interleave groups also requires invalidating all decisions 7469 // based on them, which includes widening decisions and uniform and scalar 7470 // values. 7471 CM.invalidateCostModelingDecisions(); 7472 } 7473 7474 ElementCount MaxUserVF = 7475 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7476 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7477 if (!UserVF.isZero() && UserVFIsLegal) { 7478 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7479 "VF needs to be a power of two"); 7480 // Collect the instructions (and their associated costs) that will be more 7481 // profitable to scalarize. 7482 if (CM.selectUserVectorizationFactor(UserVF)) { 7483 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7484 CM.collectInLoopReductions(); 7485 buildVPlansWithVPRecipes(UserVF, UserVF); 7486 LLVM_DEBUG(printPlans(dbgs())); 7487 return {{UserVF, 0, 0}}; 7488 } else 7489 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7490 "InvalidCost", ORE, OrigLoop); 7491 } 7492 7493 // Populate the set of Vectorization Factor Candidates. 7494 ElementCountSet VFCandidates; 7495 for (auto VF = ElementCount::getFixed(1); 7496 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7497 VFCandidates.insert(VF); 7498 for (auto VF = ElementCount::getScalable(1); 7499 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7500 VFCandidates.insert(VF); 7501 7502 for (const auto &VF : VFCandidates) { 7503 // Collect Uniform and Scalar instructions after vectorization with VF. 7504 CM.collectUniformsAndScalars(VF); 7505 7506 // Collect the instructions (and their associated costs) that will be more 7507 // profitable to scalarize. 7508 if (VF.isVector()) 7509 CM.collectInstsToScalarize(VF); 7510 } 7511 7512 CM.collectInLoopReductions(); 7513 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7514 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7515 7516 LLVM_DEBUG(printPlans(dbgs())); 7517 if (!MaxFactors.hasVector()) 7518 return VectorizationFactor::Disabled(); 7519 7520 // Select the optimal vectorization factor. 
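  // At this point VFCandidates holds the scalar VF together with all
  // power-of-two fixed and scalable VFs up to the maxima computed above.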
7521 return CM.selectVectorizationFactor(VFCandidates); 7522 } 7523 7524 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7525 assert(count_if(VPlans, 7526 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7527 1 && 7528 "Best VF has not a single VPlan."); 7529 7530 for (const VPlanPtr &Plan : VPlans) { 7531 if (Plan->hasVF(VF)) 7532 return *Plan.get(); 7533 } 7534 llvm_unreachable("No plan found!"); 7535 } 7536 7537 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7538 SmallVector<Metadata *, 4> MDs; 7539 // Reserve first location for self reference to the LoopID metadata node. 7540 MDs.push_back(nullptr); 7541 bool IsUnrollMetadata = false; 7542 MDNode *LoopID = L->getLoopID(); 7543 if (LoopID) { 7544 // First find existing loop unrolling disable metadata. 7545 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7546 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7547 if (MD) { 7548 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7549 IsUnrollMetadata = 7550 S && S->getString().startswith("llvm.loop.unroll.disable"); 7551 } 7552 MDs.push_back(LoopID->getOperand(i)); 7553 } 7554 } 7555 7556 if (!IsUnrollMetadata) { 7557 // Add runtime unroll disable metadata. 7558 LLVMContext &Context = L->getHeader()->getContext(); 7559 SmallVector<Metadata *, 1> DisableOperands; 7560 DisableOperands.push_back( 7561 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7562 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7563 MDs.push_back(DisableNode); 7564 MDNode *NewLoopID = MDNode::get(Context, MDs); 7565 // Set operand 0 to refer to the loop id itself. 7566 NewLoopID->replaceOperandWith(0, NewLoopID); 7567 L->setLoopID(NewLoopID); 7568 } 7569 } 7570 7571 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7572 VPlan &BestVPlan, 7573 InnerLoopVectorizer &ILV, 7574 DominatorTree *DT) { 7575 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7576 << '\n'); 7577 7578 // Perform the actual loop transformation. 7579 7580 // 1. Set up the skeleton for vectorization, including vector pre-header and 7581 // middle block. The vector loop is created during VPlan execution. 7582 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7583 Value *CanonicalIVStartValue; 7584 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7585 ILV.createVectorizedLoopSkeleton(); 7586 ILV.collectPoisonGeneratingRecipes(State); 7587 7588 ILV.printDebugTracesAtStart(); 7589 7590 //===------------------------------------------------===// 7591 // 7592 // Notice: any optimization or new instruction that go 7593 // into the code below should also be implemented in 7594 // the cost-model. 7595 // 7596 //===------------------------------------------------===// 7597 7598 // 2. Copy and widen instructions from the old loop into the new loop. 7599 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), 7600 ILV.getOrCreateVectorTripCount(nullptr), 7601 CanonicalIVStartValue, State); 7602 BestVPlan.execute(&State); 7603 7604 // Keep all loop hints from the original loop on the vector loop (we'll 7605 // replace the vectorizer-specific hints below). 
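  // Illustrative example: a loop annotated with metadata roughly of the form
  //   !{!"llvm.loop.vectorize.followup_vectorized", !{!"llvm.loop.unroll.disable"}}
  // asks for the listed properties to be attached to the vectorized loop;
  // makeFollowupLoopID below extracts such follow-up attributes.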
7606 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7607 7608 Optional<MDNode *> VectorizedLoopID = 7609 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7610 LLVMLoopVectorizeFollowupVectorized}); 7611 7612 VPBasicBlock *HeaderVPBB = 7613 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); 7614 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); 7615 if (VectorizedLoopID.hasValue()) 7616 L->setLoopID(VectorizedLoopID.getValue()); 7617 else { 7618 // Keep all loop hints from the original loop on the vector loop (we'll 7619 // replace the vectorizer-specific hints below). 7620 if (MDNode *LID = OrigLoop->getLoopID()) 7621 L->setLoopID(LID); 7622 7623 LoopVectorizeHints Hints(L, true, *ORE); 7624 Hints.setAlreadyVectorized(); 7625 } 7626 // Disable runtime unrolling when vectorizing the epilogue loop. 7627 if (CanonicalIVStartValue) 7628 AddRuntimeUnrollDisableMetaData(L); 7629 7630 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7631 // predication, updating analyses. 7632 ILV.fixVectorizedLoop(State, BestVPlan); 7633 7634 ILV.printDebugTracesAtEnd(); 7635 } 7636 7637 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7638 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7639 for (const auto &Plan : VPlans) 7640 if (PrintVPlansInDotFormat) 7641 Plan->printDOT(O); 7642 else 7643 Plan->print(O); 7644 } 7645 #endif 7646 7647 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7648 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7649 7650 // We create new control-flow for the vectorized loop, so the original exit 7651 // conditions will be dead after vectorization if it's only used by the 7652 // terminator 7653 SmallVector<BasicBlock*> ExitingBlocks; 7654 OrigLoop->getExitingBlocks(ExitingBlocks); 7655 for (auto *BB : ExitingBlocks) { 7656 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 7657 if (!Cmp || !Cmp->hasOneUse()) 7658 continue; 7659 7660 // TODO: we should introduce a getUniqueExitingBlocks on Loop 7661 if (!DeadInstructions.insert(Cmp).second) 7662 continue; 7663 7664 // The operands of the icmp is often a dead trunc, used by IndUpdate. 7665 // TODO: can recurse through operands in general 7666 for (Value *Op : Cmp->operands()) { 7667 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7668 DeadInstructions.insert(cast<Instruction>(Op)); 7669 } 7670 } 7671 7672 // We create new "steps" for induction variable updates to which the original 7673 // induction variables map. An original update instruction will be dead if 7674 // all its users except the induction variable are dead. 7675 auto *Latch = OrigLoop->getLoopLatch(); 7676 for (auto &Induction : Legal->getInductionVars()) { 7677 PHINode *Ind = Induction.first; 7678 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7679 7680 // If the tail is to be folded by masking, the primary induction variable, 7681 // if exists, isn't dead: it will be used for masking. Don't kill it. 
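    // Illustrative example: with tail folding the header mask is formed from
    // something like 'icmp ule <canonical IV>, <backedge-taken count>' (or an
    // active.lane.mask intrinsic), so the IV update must stay live even if it
    // has no other users.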
7682 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7683 continue; 7684 7685 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7686 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7687 })) 7688 DeadInstructions.insert(IndUpdate); 7689 } 7690 } 7691 7692 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7693 7694 //===--------------------------------------------------------------------===// 7695 // EpilogueVectorizerMainLoop 7696 //===--------------------------------------------------------------------===// 7697 7698 /// This function is partially responsible for generating the control flow 7699 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7700 std::pair<BasicBlock *, Value *> 7701 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7702 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7703 7704 // Workaround! Compute the trip count of the original loop and cache it 7705 // before we start modifying the CFG. This code has a systemic problem 7706 // wherein it tries to run analysis over partially constructed IR; this is 7707 // wrong, and not simply for SCEV. The trip count of the original loop 7708 // simply happens to be prone to hitting this in practice. In theory, we 7709 // can hit the same issue for any SCEV, or ValueTracking query done during 7710 // mutation. See PR49900. 7711 getOrCreateTripCount(OrigLoop->getLoopPreheader()); 7712 createVectorLoopSkeleton(""); 7713 7714 // Generate the code to check the minimum iteration count of the vector 7715 // epilogue (see below). 7716 EPI.EpilogueIterationCountCheck = 7717 emitIterationCountCheck(LoopScalarPreHeader, true); 7718 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7719 7720 // Generate the code to check any assumptions that we've made for SCEV 7721 // expressions. 7722 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7723 7724 // Generate the code that checks at runtime if arrays overlap. We put the 7725 // checks into a separate block to make the more common case of few elements 7726 // faster. 7727 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7728 7729 // Generate the iteration count check for the main loop, *after* the check 7730 // for the epilogue loop, so that the path-length is shorter for the case 7731 // that goes directly through the vector epilogue. The longer-path length for 7732 // the main loop is compensated for, by the gain from vectorizing the larger 7733 // trip count. Note: the branch will get updated later on when we vectorize 7734 // the epilogue. 7735 EPI.MainLoopIterationCountCheck = 7736 emitIterationCountCheck(LoopScalarPreHeader, false); 7737 7738 // Generate the induction variable. 7739 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7740 7741 // Skip induction resume value creation here because they will be created in 7742 // the second pass. If we created them here, they wouldn't be used anyway, 7743 // because the vplan in the second pass still contains the inductions from the 7744 // original loop. 
7745 7746 return {completeLoopSkeleton(OrigLoopID), nullptr}; 7747 } 7748 7749 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7750 LLVM_DEBUG({ 7751 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7752 << "Main Loop VF:" << EPI.MainLoopVF 7753 << ", Main Loop UF:" << EPI.MainLoopUF 7754 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7755 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7756 }); 7757 } 7758 7759 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7760 DEBUG_WITH_TYPE(VerboseDebug, { 7761 dbgs() << "intermediate fn:\n" 7762 << *OrigLoop->getHeader()->getParent() << "\n"; 7763 }); 7764 } 7765 7766 BasicBlock * 7767 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7768 bool ForEpilogue) { 7769 assert(Bypass && "Expected valid bypass basic block."); 7770 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7771 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7772 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 7773 // Reuse existing vector loop preheader for TC checks. 7774 // Note that new preheader block is generated for vector loop. 7775 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7776 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7777 7778 // Generate code to check if the loop's trip count is less than VF * UF of the 7779 // main vector loop. 7780 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 7781 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7782 7783 Value *CheckMinIters = Builder.CreateICmp( 7784 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7785 "min.iters.check"); 7786 7787 if (!ForEpilogue) 7788 TCCheckBlock->setName("vector.main.loop.iter.check"); 7789 7790 // Create new preheader for vector loop. 7791 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7792 DT, LI, nullptr, "vector.ph"); 7793 7794 if (ForEpilogue) { 7795 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7796 DT->getNode(Bypass)->getIDom()) && 7797 "TC check is expected to dominate Bypass"); 7798 7799 // Update dominator for Bypass & LoopExit. 7800 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7801 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7802 // For loops with multiple exits, there's no edge from the middle block 7803 // to exit blocks (as the epilogue must run) and thus no need to update 7804 // the immediate dominator of the exit blocks. 7805 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7806 7807 LoopBypassBlocks.push_back(TCCheckBlock); 7808 7809 // Save the trip count so we don't have to regenerate it in the 7810 // vec.epilog.iter.check. This is safe to do because the trip count 7811 // generated here dominates the vector epilog iter check. 7812 EPI.TripCount = Count; 7813 } 7814 7815 ReplaceInstWithInst( 7816 TCCheckBlock->getTerminator(), 7817 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7818 7819 return TCCheckBlock; 7820 } 7821 7822 //===--------------------------------------------------------------------===// 7823 // EpilogueVectorizerEpilogueLoop 7824 //===--------------------------------------------------------------------===// 7825 7826 /// This function is partially responsible for generating the control flow 7827 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
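/// In rough terms, the combined result of the two passes is: a trip-count
/// check against the epilogue VF, a trip-count check against the main VF, the
/// main vector loop, a check of the remaining iterations against the epilogue
/// VF, the vector epilogue loop, and finally the scalar remainder loop.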
7828 std::pair<BasicBlock *, Value *> 7829 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7830 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7831 createVectorLoopSkeleton("vec.epilog."); 7832 7833 // Now, compare the remaining count and if there aren't enough iterations to 7834 // execute the vectorized epilogue skip to the scalar part. 7835 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7836 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7837 LoopVectorPreHeader = 7838 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7839 LI, nullptr, "vec.epilog.ph"); 7840 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, 7841 VecEpilogueIterationCountCheck); 7842 7843 // Adjust the control flow taking the state info from the main loop 7844 // vectorization into account. 7845 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7846 "expected this to be saved from the previous pass."); 7847 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7848 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7849 7850 DT->changeImmediateDominator(LoopVectorPreHeader, 7851 EPI.MainLoopIterationCountCheck); 7852 7853 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7854 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7855 7856 if (EPI.SCEVSafetyCheck) 7857 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7858 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7859 if (EPI.MemSafetyCheck) 7860 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7861 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7862 7863 DT->changeImmediateDominator( 7864 VecEpilogueIterationCountCheck, 7865 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7866 7867 DT->changeImmediateDominator(LoopScalarPreHeader, 7868 EPI.EpilogueIterationCountCheck); 7869 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7870 // If there is an epilogue which must run, there's no edge from the 7871 // middle block to exit blocks and thus no need to update the immediate 7872 // dominator of the exit blocks. 7873 DT->changeImmediateDominator(LoopExitBlock, 7874 EPI.EpilogueIterationCountCheck); 7875 7876 // Keep track of bypass blocks, as they feed start values to the induction 7877 // phis in the scalar loop preheader. 7878 if (EPI.SCEVSafetyCheck) 7879 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7880 if (EPI.MemSafetyCheck) 7881 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7882 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7883 7884 // The vec.epilog.iter.check block may contain Phi nodes from reductions which 7885 // merge control-flow from the latch block and the middle block. Update the 7886 // incoming values here and move the Phi into the preheader. 
  SmallVector<PHINode *, 4> PhisInBlock;
  for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
    PhisInBlock.push_back(&Phi);

  for (PHINode *Phi : PhisInBlock) {
    Phi->replaceIncomingBlockWith(
        VecEpilogueIterationCountCheck->getSinglePredecessor(),
        VecEpilogueIterationCountCheck);
    Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
    if (EPI.SCEVSafetyCheck)
      Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
    if (EPI.MemSafetyCheck)
      Phi->removeIncomingValue(EPI.MemSafetyCheck);
    Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
  }

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to the iteration
  // count check, the resume value for the induction variable comes from the
  // trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues({VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  return {completeLoopSkeleton(OrigLoopID), EPResumeVal};
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of
  // the vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
7942 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7943 7944 Value *CheckMinIters = 7945 Builder.CreateICmp(P, Count, 7946 createStepForVF(Builder, Count->getType(), 7947 EPI.EpilogueVF, EPI.EpilogueUF), 7948 "min.epilog.iters.check"); 7949 7950 ReplaceInstWithInst( 7951 Insert->getTerminator(), 7952 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7953 7954 LoopBypassBlocks.push_back(Insert); 7955 return Insert; 7956 } 7957 7958 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7959 LLVM_DEBUG({ 7960 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 7961 << "Epilogue Loop VF:" << EPI.EpilogueVF 7962 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7963 }); 7964 } 7965 7966 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 7967 DEBUG_WITH_TYPE(VerboseDebug, { 7968 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 7969 }); 7970 } 7971 7972 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7973 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7974 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 7975 bool PredicateAtRangeStart = Predicate(Range.Start); 7976 7977 for (ElementCount TmpVF = Range.Start * 2; 7978 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 7979 if (Predicate(TmpVF) != PredicateAtRangeStart) { 7980 Range.End = TmpVF; 7981 break; 7982 } 7983 7984 return PredicateAtRangeStart; 7985 } 7986 7987 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7988 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7989 /// of VF's starting at a given VF and extending it as much as possible. Each 7990 /// vectorization decision can potentially shorten this sub-range during 7991 /// buildVPlan(). 7992 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 7993 ElementCount MaxVF) { 7994 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 7995 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 7996 VFRange SubRange = {VF, MaxVFPlusOne}; 7997 VPlans.push_back(buildVPlan(SubRange)); 7998 VF = SubRange.End; 7999 } 8000 } 8001 8002 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8003 VPlanPtr &Plan) { 8004 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8005 8006 // Look for cached value. 8007 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8008 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8009 if (ECEntryIt != EdgeMaskCache.end()) 8010 return ECEntryIt->second; 8011 8012 VPValue *SrcMask = createBlockInMask(Src, Plan); 8013 8014 // The terminator has to be a branch inst! 8015 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8016 assert(BI && "Unexpected terminator found"); 8017 8018 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8019 return EdgeMaskCache[Edge] = SrcMask; 8020 8021 // If source is an exiting block, we know the exit edge is dynamically dead 8022 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8023 // adding uses of an otherwise potentially dead instruction. 
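  // Illustrative example: for the latch's exit edge the condition is never
  // taken inside the vector loop body (the vector loop only runs complete
  // iterations before branching to the middle block), so SrcMask already
  // covers it.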
8024 if (OrigLoop->isLoopExiting(Src)) 8025 return EdgeMaskCache[Edge] = SrcMask; 8026 8027 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8028 assert(EdgeMask && "No Edge Mask found for condition"); 8029 8030 if (BI->getSuccessor(0) != Dst) 8031 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8032 8033 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8034 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8035 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8036 // The select version does not introduce new UB if SrcMask is false and 8037 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8038 VPValue *False = Plan->getOrAddVPValue( 8039 ConstantInt::getFalse(BI->getCondition()->getType())); 8040 EdgeMask = 8041 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8042 } 8043 8044 return EdgeMaskCache[Edge] = EdgeMask; 8045 } 8046 8047 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8048 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8049 8050 // Look for cached value. 8051 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8052 if (BCEntryIt != BlockMaskCache.end()) 8053 return BCEntryIt->second; 8054 8055 // All-one mask is modelled as no-mask following the convention for masked 8056 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8057 VPValue *BlockMask = nullptr; 8058 8059 if (OrigLoop->getHeader() == BB) { 8060 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8061 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8062 8063 // Introduce the early-exit compare IV <= BTC to form header block mask. 8064 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8065 // constructing the desired canonical IV in the header block as its first 8066 // non-phi instructions. 8067 assert(CM.foldTailByMasking() && "must fold the tail"); 8068 VPBasicBlock *HeaderVPBB = 8069 Plan->getVectorLoopRegion()->getEntryBasicBlock(); 8070 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8071 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); 8072 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); 8073 8074 VPBuilder::InsertPointGuard Guard(Builder); 8075 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8076 if (CM.TTI.emitGetActiveLaneMask()) { 8077 VPValue *TC = Plan->getOrCreateTripCount(); 8078 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); 8079 } else { 8080 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8081 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8082 } 8083 return BlockMaskCache[BB] = BlockMask; 8084 } 8085 8086 // This is the block mask. We OR all incoming edges. 8087 for (auto *Predecessor : predecessors(BB)) { 8088 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8089 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8090 return BlockMaskCache[BB] = EdgeMask; 8091 8092 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
  }

  return BlockMaskCache[BB] = BlockMask;
}

VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
                                                ArrayRef<VPValue *> Operands,
                                                VFRange &Range,
                                                VPlanPtr &Plan) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Must be called with either a load or store");

  auto willWiden = [&](ElementCount VF) -> bool {
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    if (Decision == LoopVectorizationCostModel::CM_Interleave)
      return true;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = createBlockInMask(I->getParent(), Plan);

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  LoopVectorizationCostModel::InstWidening Decision =
      CM.getWideningDecision(I, Range.Start);
  bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
  bool Consecutive =
      Reverse || Decision == LoopVectorizationCostModel::CM_Widen;

  if (LoadInst *Load = dyn_cast<LoadInst>(I))
    return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
                                              Consecutive, Reverse);

  StoreInst *Store = cast<StoreInst>(I);
  return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
                                            Mask, Consecutive, Reverse);
}

/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
/// insert a recipe to expand the step for the induction recipe.
static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
    PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start,
    const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM,
    VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) {
  // Returns true if an instruction \p I should be scalarized instead of
  // vectorized for the chosen vectorization factor.
8155 auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { 8156 return CM.isScalarAfterVectorization(I, VF) || 8157 CM.isProfitableToScalarize(I, VF); 8158 }; 8159 8160 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( 8161 [&](ElementCount VF) { 8162 return ShouldScalarizeInstruction(PhiOrTrunc, VF); 8163 }, 8164 Range); 8165 assert(IndDesc.getStartValue() == 8166 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8167 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && 8168 "step must be loop invariant"); 8169 8170 VPValue *Step = 8171 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); 8172 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8173 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI, 8174 !NeedsScalarIVOnly); 8175 } 8176 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8177 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, 8178 !NeedsScalarIVOnly); 8179 } 8180 8181 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( 8182 PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) { 8183 8184 // Check if this is an integer or fp induction. If so, build the recipe that 8185 // produces its scalar and vector values. 8186 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8187 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan, 8188 *PSE.getSE(), *OrigLoop, Range); 8189 8190 // Check if this is pointer induction. If so, build the recipe for it. 8191 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) 8192 return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II, 8193 *PSE.getSE()); 8194 return nullptr; 8195 } 8196 8197 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8198 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) { 8199 // Optimize the special case where the source is a constant integer 8200 // induction variable. Notice that we can only optimize the 'trunc' case 8201 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8202 // (c) other casts depend on pointer size. 8203 8204 // Determine whether \p K is a truncation based on an induction variable that 8205 // can be optimized. 8206 auto isOptimizableIVTruncate = 8207 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8208 return [=](ElementCount VF) -> bool { 8209 return CM.isOptimizableIVTruncate(K, VF); 8210 }; 8211 }; 8212 8213 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8214 isOptimizableIVTruncate(I), Range)) { 8215 8216 auto *Phi = cast<PHINode>(I->getOperand(0)); 8217 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8218 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8219 return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan, 8220 *PSE.getSE(), *OrigLoop, Range); 8221 } 8222 return nullptr; 8223 } 8224 8225 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8226 ArrayRef<VPValue *> Operands, 8227 VPlanPtr &Plan) { 8228 // If all incoming values are equal, the incoming VPValue can be used directly 8229 // instead of creating a new VPBlendRecipe. 8230 VPValue *FirstIncoming = Operands[0]; 8231 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8232 return FirstIncoming == Inc; 8233 })) { 8234 return Operands[0]; 8235 } 8236 8237 unsigned NumIncoming = Phi->getNumIncomingValues(); 8238 // For in-loop reductions, we do not need to create an additional select. 
8239 VPValue *InLoopVal = nullptr; 8240 for (unsigned In = 0; In < NumIncoming; In++) { 8241 PHINode *PhiOp = 8242 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue()); 8243 if (PhiOp && CM.isInLoopReduction(PhiOp)) { 8244 assert(!InLoopVal && "Found more than one in-loop reduction!"); 8245 InLoopVal = Operands[In]; 8246 } 8247 } 8248 8249 assert((!InLoopVal || NumIncoming == 2) && 8250 "Found an in-loop reduction for PHI with unexpected number of " 8251 "incoming values"); 8252 if (InLoopVal) 8253 return Operands[Operands[0] == InLoopVal ? 1 : 0]; 8254 8255 // We know that all PHIs in non-header blocks are converted into selects, so 8256 // we don't have to worry about the insertion order and we can just use the 8257 // builder. At this point we generate the predication tree. There may be 8258 // duplications since this is a simple recursive scan, but future 8259 // optimizations will clean it up. 8260 SmallVector<VPValue *, 2> OperandsWithMask; 8261 8262 for (unsigned In = 0; In < NumIncoming; In++) { 8263 VPValue *EdgeMask = 8264 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8265 assert((EdgeMask || NumIncoming == 1) && 8266 "Multiple predecessors with one having a full mask"); 8267 OperandsWithMask.push_back(Operands[In]); 8268 if (EdgeMask) 8269 OperandsWithMask.push_back(EdgeMask); 8270 } 8271 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8272 } 8273 8274 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8275 ArrayRef<VPValue *> Operands, 8276 VFRange &Range) const { 8277 8278 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8279 [this, CI](ElementCount VF) { 8280 return CM.isScalarWithPredication(CI, VF); 8281 }, 8282 Range); 8283 8284 if (IsPredicated) 8285 return nullptr; 8286 8287 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8288 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8289 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8290 ID == Intrinsic::pseudoprobe || 8291 ID == Intrinsic::experimental_noalias_scope_decl)) 8292 return nullptr; 8293 8294 auto willWiden = [&](ElementCount VF) -> bool { 8295 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8296 // The following case may be scalarized depending on the VF. 8297 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8298 // version of the instruction. 8299 // Is it beneficial to perform intrinsic call compared to lib call? 8300 bool NeedToScalarize = false; 8301 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8302 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; 8303 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8304 return UseVectorIntrinsic || !NeedToScalarize; 8305 }; 8306 8307 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8308 return nullptr; 8309 8310 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8311 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8312 } 8313 8314 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8315 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8316 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8317 // Instruction should be widened, unless it is scalar after vectorization, 8318 // scalarization is profitable or it is predicated. 
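// For illustration (a hypothetical case, not taken from a specific test): a
// udiv in a conditional block that the cost model marks as
// scalar-with-predication for every VF in Range is rejected by the check below
// and instead reaches handleReplication(), which wraps it in a predicated
// replicate region.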
8319 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8320 return CM.isScalarAfterVectorization(I, VF) || 8321 CM.isProfitableToScalarize(I, VF) || 8322 CM.isScalarWithPredication(I, VF); 8323 }; 8324 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8325 Range); 8326 } 8327 8328 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8329 ArrayRef<VPValue *> Operands) const { 8330 auto IsVectorizableOpcode = [](unsigned Opcode) { 8331 switch (Opcode) { 8332 case Instruction::Add: 8333 case Instruction::And: 8334 case Instruction::AShr: 8335 case Instruction::BitCast: 8336 case Instruction::FAdd: 8337 case Instruction::FCmp: 8338 case Instruction::FDiv: 8339 case Instruction::FMul: 8340 case Instruction::FNeg: 8341 case Instruction::FPExt: 8342 case Instruction::FPToSI: 8343 case Instruction::FPToUI: 8344 case Instruction::FPTrunc: 8345 case Instruction::FRem: 8346 case Instruction::FSub: 8347 case Instruction::ICmp: 8348 case Instruction::IntToPtr: 8349 case Instruction::LShr: 8350 case Instruction::Mul: 8351 case Instruction::Or: 8352 case Instruction::PtrToInt: 8353 case Instruction::SDiv: 8354 case Instruction::Select: 8355 case Instruction::SExt: 8356 case Instruction::Shl: 8357 case Instruction::SIToFP: 8358 case Instruction::SRem: 8359 case Instruction::Sub: 8360 case Instruction::Trunc: 8361 case Instruction::UDiv: 8362 case Instruction::UIToFP: 8363 case Instruction::URem: 8364 case Instruction::Xor: 8365 case Instruction::ZExt: 8366 case Instruction::Freeze: 8367 return true; 8368 } 8369 return false; 8370 }; 8371 8372 if (!IsVectorizableOpcode(I->getOpcode())) 8373 return nullptr; 8374 8375 // Success: widen this instruction. 8376 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8377 } 8378 8379 void VPRecipeBuilder::fixHeaderPhis() { 8380 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8381 for (VPHeaderPHIRecipe *R : PhisToFix) { 8382 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8383 VPRecipeBase *IncR = 8384 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8385 R->addOperand(IncR->getVPSingleValue()); 8386 } 8387 } 8388 8389 VPBasicBlock *VPRecipeBuilder::handleReplication( 8390 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8391 VPlanPtr &Plan) { 8392 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8393 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8394 Range); 8395 8396 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8397 [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, 8398 Range); 8399 8400 // Even if the instruction is not marked as uniform, there are certain 8401 // intrinsic calls that can be effectively treated as such, so we check for 8402 // them here. Conservatively, we only do this for scalable vectors, since 8403 // for fixed-width VFs we can always fall back on full scalarization. 8404 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8405 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8406 case Intrinsic::assume: 8407 case Intrinsic::lifetime_start: 8408 case Intrinsic::lifetime_end: 8409 // For scalable vectors if one of the operands is variant then we still 8410 // want to mark as uniform, which will generate one instruction for just 8411 // the first lane of the vector. We can't scalarize the call in the same 8412 // way as for fixed-width vectors because we don't know how many lanes 8413 // there are. 
8414 //
8415 // The reasons for doing it this way for scalable vectors are:
8416 // 1. For the assume intrinsic generating the instruction for the first
8417 // lane is still better than not generating any at all. For
8418 // example, the input may be a splat across all lanes.
8419 // 2. For the lifetime start/end intrinsics the pointer operand only
8420 // does anything useful when the input comes from a stack object,
8421 // which suggests it should always be uniform. For non-stack objects
8422 // the effect is to poison the object, which still allows us to
8423 // remove the call.
8424 IsUniform = true;
8425 break;
8426 default:
8427 break;
8428 }
8429 }
8430
8431 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8432 IsUniform, IsPredicated);
8433 setRecipe(I, Recipe);
8434 Plan->addVPValue(I, Recipe);
8435
8436 // Find if I uses a predicated instruction. If so, it will use its scalar
8437 // value. Avoid hoisting the insert-element which packs the scalar value into
8438 // a vector value, as that happens iff all users use the vector value.
8439 for (VPValue *Op : Recipe->operands()) {
8440 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8441 if (!PredR)
8442 continue;
8443 auto *RepR =
8444 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8445 assert(RepR->isPredicated() &&
8446 "expected Replicate recipe to be predicated");
8447 RepR->setAlsoPack(false);
8448 }
8449
8450 // Finalize the recipe for Instr, handling the unpredicated case first.
8451 if (!IsPredicated) {
8452 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8453 VPBB->appendRecipe(Recipe);
8454 return VPBB;
8455 }
8456 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8457
8458 VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8459 assert(SingleSucc && "VPBB must have a single successor when handling "
8460 "predicated replication.");
8461 VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8462 // Record predicated instructions for above packing optimizations.
8463 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8464 VPBlockUtils::insertBlockAfter(Region, VPBB);
8465 auto *RegSucc = new VPBasicBlock();
8466 VPBlockUtils::insertBlockAfter(RegSucc, Region);
8467 VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8468 return RegSucc;
8469 }
8470
8471 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8472 VPRecipeBase *PredRecipe,
8473 VPlanPtr &Plan) {
8474 // Instructions marked for predication are replicated and placed under an
8475 // if-then construct to prevent side-effects.
8476
8477 // Generate recipes to compute the block mask for this region.
8478 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8479
8480 // Build the triangular if-then region.
8481 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8482 assert(Instr->getParent() && "Predicated instruction not in any basic block");
8483 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8484 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8485 auto *PHIRecipe = Instr->getType()->isVoidTy()
8486 ?
nullptr 8487 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8488 if (PHIRecipe) { 8489 Plan->removeVPValueFor(Instr); 8490 Plan->addVPValue(Instr, PHIRecipe); 8491 } 8492 auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8493 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8494 VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true); 8495 8496 // Note: first set Entry as region entry and then connect successors starting 8497 // from it in order, to propagate the "parent" of each VPBasicBlock. 8498 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry); 8499 VPBlockUtils::connectBlocks(Pred, Exiting); 8500 8501 return Region; 8502 } 8503 8504 VPRecipeOrVPValueTy 8505 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8506 ArrayRef<VPValue *> Operands, 8507 VFRange &Range, VPlanPtr &Plan) { 8508 // First, check for specific widening recipes that deal with inductions, Phi 8509 // nodes, calls and memory operations. 8510 VPRecipeBase *Recipe; 8511 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8512 if (Phi->getParent() != OrigLoop->getHeader()) 8513 return tryToBlend(Phi, Operands, Plan); 8514 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) 8515 return toVPRecipeResult(Recipe); 8516 8517 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8518 assert((Legal->isReductionVariable(Phi) || 8519 Legal->isFirstOrderRecurrence(Phi)) && 8520 "can only widen reductions and first-order recurrences here"); 8521 VPValue *StartV = Operands[0]; 8522 if (Legal->isReductionVariable(Phi)) { 8523 const RecurrenceDescriptor &RdxDesc = 8524 Legal->getReductionVars().find(Phi)->second; 8525 assert(RdxDesc.getRecurrenceStartValue() == 8526 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8527 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8528 CM.isInLoopReduction(Phi), 8529 CM.useOrderedReductions(RdxDesc)); 8530 } else { 8531 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8532 } 8533 8534 // Record the incoming value from the backedge, so we can add the incoming 8535 // value from the backedge after all recipes have been created. 8536 recordRecipeOf(cast<Instruction>( 8537 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8538 PhisToFix.push_back(PhiRecipe); 8539 return toVPRecipeResult(PhiRecipe); 8540 } 8541 8542 if (isa<TruncInst>(Instr) && 8543 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8544 Range, *Plan))) 8545 return toVPRecipeResult(Recipe); 8546 8547 // All widen recipes below deal only with VF > 1. 
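// For example (an illustrative scenario, not a specific test): if the current
// Range is {Start=1, End=8}, the VF.isScalar() check below evaluates to true
// at VF=1 and false at VF=2, so the range is clamped to {1, 2} and nullptr is
// returned; the widen recipes are then considered for the next sub-range
// starting at VF=2.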
8548 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8549 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8550 return nullptr;
8551
8552 if (auto *CI = dyn_cast<CallInst>(Instr))
8553 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8554
8555 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8556 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8557
8558 if (!shouldWiden(Instr, Range))
8559 return nullptr;
8560
8561 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8562 return toVPRecipeResult(new VPWidenGEPRecipe(
8563 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8564
8565 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8566 bool InvariantCond =
8567 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8568 return toVPRecipeResult(new VPWidenSelectRecipe(
8569 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8570 }
8571
8572 return toVPRecipeResult(tryToWiden(Instr, Operands));
8573 }
8574
8575 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8576 ElementCount MaxVF) {
8577 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8578
8579 // Collect instructions from the original loop that will become trivially dead
8580 // in the vectorized loop. We don't need to vectorize these instructions. For
8581 // example, original induction update instructions can become dead because we
8582 // separately emit induction "steps" when generating code for the new loop.
8583 // Similarly, we create a new latch condition when setting up the structure
8584 // of the new loop, so the old one can become dead.
8585 SmallPtrSet<Instruction *, 4> DeadInstructions;
8586 collectTriviallyDeadInstructions(DeadInstructions);
8587
8588 // Add assume instructions we need to drop to DeadInstructions, to prevent
8589 // them from being added to the VPlan.
8590 // TODO: We only need to drop assumes in blocks that get flattened. If the
8591 // control flow is preserved, we should keep them.
8592 auto &ConditionalAssumes = Legal->getConditionalAssumes();
8593 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8594
8595 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8596 // Dead instructions do not need sinking. Remove them from SinkAfter.
8597 for (Instruction *I : DeadInstructions)
8598 SinkAfter.erase(I);
8599
8600 // Cannot sink instructions after dead instructions (there won't be any
8601 // recipes for them). Instead, find the first non-dead previous instruction.
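// A hypothetical example for the adjustment below: if SinkAfter maps %a to an
// induction update %iv.next that was just added to DeadInstructions, the loop
// walks backwards from %iv.next until it finds a live instruction (at least
// the one feeding the first-order recurrence phi) and re-targets the sink to
// that instruction instead.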
8602 for (auto &P : Legal->getSinkAfter()) { 8603 Instruction *SinkTarget = P.second; 8604 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 8605 (void)FirstInst; 8606 while (DeadInstructions.contains(SinkTarget)) { 8607 assert( 8608 SinkTarget != FirstInst && 8609 "Must find a live instruction (at least the one feeding the " 8610 "first-order recurrence PHI) before reaching beginning of the block"); 8611 SinkTarget = SinkTarget->getPrevNode(); 8612 assert(SinkTarget != P.first && 8613 "sink source equals target, no sinking required"); 8614 } 8615 P.second = SinkTarget; 8616 } 8617 8618 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8619 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8620 VFRange SubRange = {VF, MaxVFPlusOne}; 8621 VPlans.push_back( 8622 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8623 VF = SubRange.End; 8624 } 8625 } 8626 8627 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a 8628 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a 8629 // BranchOnCount VPInstruction to the latch. 8630 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, 8631 bool HasNUW) { 8632 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8633 auto *StartV = Plan.getOrAddVPValue(StartIdx); 8634 8635 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8636 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8637 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8638 Header->insert(CanonicalIVPHI, Header->begin()); 8639 8640 auto *CanonicalIVIncrement = 8641 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW 8642 : VPInstruction::CanonicalIVIncrement, 8643 {CanonicalIVPHI}, DL); 8644 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8645 8646 VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); 8647 EB->appendRecipe(CanonicalIVIncrement); 8648 8649 auto *BranchOnCount = 8650 new VPInstruction(VPInstruction::BranchOnCount, 8651 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8652 EB->appendRecipe(BranchOnCount); 8653 } 8654 8655 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the 8656 // original exit block. 8657 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, 8658 VPBasicBlock *MiddleVPBB, Loop *OrigLoop, 8659 VPlan &Plan) { 8660 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); 8661 BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); 8662 // Only handle single-exit loops with unique exit blocks for now. 8663 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) 8664 return; 8665 8666 // Introduce VPUsers modeling the exit values. 8667 for (PHINode &ExitPhi : ExitBB->phis()) { 8668 Value *IncomingValue = 8669 ExitPhi.getIncomingValueForBlock(ExitingBB); 8670 VPValue *V = Plan.getOrAddVPValue(IncomingValue, true); 8671 Plan.addLiveOut(&ExitPhi, V); 8672 } 8673 } 8674 8675 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8676 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8677 const MapVector<Instruction *, Instruction *> &SinkAfter) { 8678 8679 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8680 8681 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8682 8683 // --------------------------------------------------------------------------- 8684 // Pre-construction: record ingredients whose recipes we'll need to further 8685 // process after constructing the initial VPlan. 
8686 // --------------------------------------------------------------------------- 8687 8688 // Mark instructions we'll need to sink later and their targets as 8689 // ingredients whose recipe we'll need to record. 8690 for (auto &Entry : SinkAfter) { 8691 RecipeBuilder.recordRecipeOf(Entry.first); 8692 RecipeBuilder.recordRecipeOf(Entry.second); 8693 } 8694 for (auto &Reduction : CM.getInLoopReductionChains()) { 8695 PHINode *Phi = Reduction.first; 8696 RecurKind Kind = 8697 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); 8698 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8699 8700 RecipeBuilder.recordRecipeOf(Phi); 8701 for (auto &R : ReductionOperations) { 8702 RecipeBuilder.recordRecipeOf(R); 8703 // For min/max reductions, where we have a pair of icmp/select, we also 8704 // need to record the ICmp recipe, so it can be removed later. 8705 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 8706 "Only min/max recurrences allowed for inloop reductions"); 8707 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 8708 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8709 } 8710 } 8711 8712 // For each interleave group which is relevant for this (possibly trimmed) 8713 // Range, add it to the set of groups to be later applied to the VPlan and add 8714 // placeholders for its members' Recipes which we'll be replacing with a 8715 // single VPInterleaveRecipe. 8716 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8717 auto applyIG = [IG, this](ElementCount VF) -> bool { 8718 return (VF.isVector() && // Query is illegal for VF == 1 8719 CM.getWideningDecision(IG->getInsertPos(), VF) == 8720 LoopVectorizationCostModel::CM_Interleave); 8721 }; 8722 if (!getDecisionAndClampRange(applyIG, Range)) 8723 continue; 8724 InterleaveGroups.insert(IG); 8725 for (unsigned i = 0; i < IG->getFactor(); i++) 8726 if (Instruction *Member = IG->getMember(i)) 8727 RecipeBuilder.recordRecipeOf(Member); 8728 }; 8729 8730 // --------------------------------------------------------------------------- 8731 // Build initial VPlan: Scan the body of the loop in a topological order to 8732 // visit each basic block after having visited its predecessor basic blocks. 8733 // --------------------------------------------------------------------------- 8734 8735 // Create initial VPlan skeleton, starting with a block for the pre-header, 8736 // followed by a region for the vector loop, followed by the middle block. The 8737 // skeleton vector loop region contains a header and latch block. 8738 VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); 8739 auto Plan = std::make_unique<VPlan>(Preheader); 8740 8741 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); 8742 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 8743 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 8744 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); 8745 VPBlockUtils::insertBlockAfter(TopRegion, Preheader); 8746 VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); 8747 VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); 8748 8749 Instruction *DLInst = 8750 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 8751 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), 8752 DLInst ? 
DLInst->getDebugLoc() : DebugLoc(), 8753 !CM.foldTailByMasking()); 8754 8755 // Scan the body of the loop in a topological order to visit each basic block 8756 // after having visited its predecessor basic blocks. 8757 LoopBlocksDFS DFS(OrigLoop); 8758 DFS.perform(LI); 8759 8760 VPBasicBlock *VPBB = HeaderVPBB; 8761 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 8762 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8763 // Relevant instructions from basic block BB will be grouped into VPRecipe 8764 // ingredients and fill a new VPBasicBlock. 8765 unsigned VPBBsForBB = 0; 8766 if (VPBB != HeaderVPBB) 8767 VPBB->setName(BB->getName()); 8768 Builder.setInsertPoint(VPBB); 8769 8770 // Introduce each ingredient into VPlan. 8771 // TODO: Model and preserve debug intrinsics in VPlan. 8772 for (Instruction &I : BB->instructionsWithoutDebug()) { 8773 Instruction *Instr = &I; 8774 8775 // First filter out irrelevant instructions, to ensure no recipes are 8776 // built for them. 8777 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 8778 continue; 8779 8780 SmallVector<VPValue *, 4> Operands; 8781 auto *Phi = dyn_cast<PHINode>(Instr); 8782 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 8783 Operands.push_back(Plan->getOrAddVPValue( 8784 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 8785 } else { 8786 auto OpRange = Plan->mapToVPValues(Instr->operands()); 8787 Operands = {OpRange.begin(), OpRange.end()}; 8788 } 8789 8790 // Invariant stores inside loop will be deleted and a single store 8791 // with the final reduction value will be added to the exit block 8792 StoreInst *SI; 8793 if ((SI = dyn_cast<StoreInst>(&I)) && 8794 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 8795 continue; 8796 8797 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 8798 Instr, Operands, Range, Plan)) { 8799 // If Instr can be simplified to an existing VPValue, use it. 8800 if (RecipeOrValue.is<VPValue *>()) { 8801 auto *VPV = RecipeOrValue.get<VPValue *>(); 8802 Plan->addVPValue(Instr, VPV); 8803 // If the re-used value is a recipe, register the recipe for the 8804 // instruction, in case the recipe for Instr needs to be recorded. 8805 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 8806 RecipeBuilder.setRecipe(Instr, R); 8807 continue; 8808 } 8809 // Otherwise, add the new recipe. 8810 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 8811 for (auto *Def : Recipe->definedValues()) { 8812 auto *UV = Def->getUnderlyingValue(); 8813 Plan->addVPValue(UV, Def); 8814 } 8815 8816 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 8817 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 8818 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 8819 // of the header block. That can happen for truncates of induction 8820 // variables. Those recipes are moved to the phi section of the header 8821 // block after applying SinkAfter, which relies on the original 8822 // position of the trunc. 8823 assert(isa<TruncInst>(Instr)); 8824 InductionsToMove.push_back( 8825 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 8826 } 8827 RecipeBuilder.setRecipe(Instr, Recipe); 8828 VPBB->appendRecipe(Recipe); 8829 continue; 8830 } 8831 8832 // Otherwise, if all widening options failed, Instruction is to be 8833 // replicated. This may create a successor for VPBB. 
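// As a rough sketch (block and value names are illustrative, not exact VPlan
// output), a predicated replicated sdiv ends up in a triangle region such as:
//   pred.sdiv.entry:
//     BRANCH-ON-MASK vp<%mask>
//   pred.sdiv.if:
//     REPLICATE ir<%d> = sdiv ir<%a>, ir<%b>
//   pred.sdiv.continue:
//     PHI-PREDICATED-INSTRUCTION vp<%d.phi> = ir<%d>
// created by createReplicateRegion via handleReplication.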
8834 VPBasicBlock *NextVPBB = 8835 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 8836 if (NextVPBB != VPBB) { 8837 VPBB = NextVPBB; 8838 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 8839 : ""); 8840 } 8841 } 8842 8843 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 8844 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 8845 } 8846 8847 HeaderVPBB->setName("vector.body"); 8848 8849 // Fold the last, empty block into its predecessor. 8850 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 8851 assert(VPBB && "expected to fold last (empty) block"); 8852 // After here, VPBB should not be used. 8853 VPBB = nullptr; 8854 8855 addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); 8856 8857 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 8858 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 8859 "entry block must be set to a VPRegionBlock having a non-empty entry " 8860 "VPBasicBlock"); 8861 RecipeBuilder.fixHeaderPhis(); 8862 8863 // --------------------------------------------------------------------------- 8864 // Transform initial VPlan: Apply previously taken decisions, in order, to 8865 // bring the VPlan to its final state. 8866 // --------------------------------------------------------------------------- 8867 8868 // Apply Sink-After legal constraints. 8869 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 8870 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 8871 if (Region && Region->isReplicator()) { 8872 assert(Region->getNumSuccessors() == 1 && 8873 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 8874 assert(R->getParent()->size() == 1 && 8875 "A recipe in an original replicator region must be the only " 8876 "recipe in its block"); 8877 return Region; 8878 } 8879 return nullptr; 8880 }; 8881 for (auto &Entry : SinkAfter) { 8882 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 8883 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 8884 8885 auto *TargetRegion = GetReplicateRegion(Target); 8886 auto *SinkRegion = GetReplicateRegion(Sink); 8887 if (!SinkRegion) { 8888 // If the sink source is not a replicate region, sink the recipe directly. 8889 if (TargetRegion) { 8890 // The target is in a replication region, make sure to move Sink to 8891 // the block after it, not into the replication region itself. 8892 VPBasicBlock *NextBlock = 8893 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 8894 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8895 } else 8896 Sink->moveAfter(Target); 8897 continue; 8898 } 8899 8900 // The sink source is in a replicate region. Unhook the region from the CFG. 8901 auto *SinkPred = SinkRegion->getSinglePredecessor(); 8902 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 8903 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 8904 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 8905 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 8906 8907 if (TargetRegion) { 8908 // The target recipe is also in a replicate region, move the sink region 8909 // after the target region. 
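// Illustrative before/after of the re-wiring below:
//   before: ... -> TargetRegion -> TargetSucc -> ...   (SinkRegion unhooked)
//   after:  ... -> TargetRegion -> SinkRegion -> TargetSucc -> ...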
8910 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 8911 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 8912 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 8913 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 8914 } else { 8915 // The sink source is in a replicate region, we need to move the whole 8916 // replicate region, which should only contain a single recipe in the 8917 // main block. 8918 auto *SplitBlock = 8919 Target->getParent()->splitAt(std::next(Target->getIterator())); 8920 8921 auto *SplitPred = SplitBlock->getSinglePredecessor(); 8922 8923 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 8924 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 8925 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 8926 } 8927 } 8928 8929 VPlanTransforms::removeRedundantCanonicalIVs(*Plan); 8930 VPlanTransforms::removeRedundantInductionCasts(*Plan); 8931 8932 // Now that sink-after is done, move induction recipes for optimized truncates 8933 // to the phi section of the header block. 8934 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 8935 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 8936 8937 // Adjust the recipes for any inloop reductions. 8938 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan, 8939 RecipeBuilder, Range.Start); 8940 8941 // Introduce a recipe to combine the incoming and previous values of a 8942 // first-order recurrence. 8943 for (VPRecipeBase &R : 8944 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 8945 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 8946 if (!RecurPhi) 8947 continue; 8948 8949 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 8950 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 8951 auto *Region = GetReplicateRegion(PrevRecipe); 8952 if (Region) 8953 InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor()); 8954 if (!InsertBlock) { 8955 InsertBlock = new VPBasicBlock(Region->getName() + ".succ"); 8956 VPBlockUtils::insertBlockAfter(InsertBlock, Region); 8957 } 8958 if (Region || PrevRecipe->isPhi()) 8959 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 8960 else 8961 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 8962 8963 auto *RecurSplice = cast<VPInstruction>( 8964 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 8965 {RecurPhi, RecurPhi->getBackedgeValue()})); 8966 8967 RecurPhi->replaceAllUsesWith(RecurSplice); 8968 // Set the first operand of RecurSplice to RecurPhi again, after replacing 8969 // all users. 8970 RecurSplice->setOperand(0, RecurPhi); 8971 } 8972 8973 // Interleave memory: for each Interleave Group we marked earlier as relevant 8974 // for this VPlan, replace the Recipes widening its memory instructions with a 8975 // single VPInterleaveRecipe at its insertion point. 
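// For example (illustrative IR), for an interleave group with factor 2 made of
//   %even = load i32, ptr %gep.0
//   %odd  = load i32, ptr %gep.1
// the two widened-load recipes are replaced by a single interleave-group
// recipe at the group's insert position, and every user of %even / %odd in the
// plan is rewired to the corresponding value defined by that recipe.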
8976 for (auto IG : InterleaveGroups) { 8977 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8978 RecipeBuilder.getRecipe(IG->getInsertPos())); 8979 SmallVector<VPValue *, 4> StoredValues; 8980 for (unsigned i = 0; i < IG->getFactor(); ++i) 8981 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 8982 auto *StoreR = 8983 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 8984 StoredValues.push_back(StoreR->getStoredValue()); 8985 } 8986 8987 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8988 Recipe->getMask()); 8989 VPIG->insertBefore(Recipe); 8990 unsigned J = 0; 8991 for (unsigned i = 0; i < IG->getFactor(); ++i) 8992 if (Instruction *Member = IG->getMember(i)) { 8993 if (!Member->getType()->isVoidTy()) { 8994 VPValue *OriginalV = Plan->getVPValue(Member); 8995 Plan->removeVPValueFor(Member); 8996 Plan->addVPValue(Member, VPIG->getVPValue(J)); 8997 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8998 J++; 8999 } 9000 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9001 } 9002 } 9003 9004 std::string PlanName; 9005 raw_string_ostream RSO(PlanName); 9006 ElementCount VF = Range.Start; 9007 Plan->addVF(VF); 9008 RSO << "Initial VPlan for VF={" << VF; 9009 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9010 Plan->addVF(VF); 9011 RSO << "," << VF; 9012 } 9013 RSO << "},UF>=1"; 9014 RSO.flush(); 9015 Plan->setName(PlanName); 9016 9017 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9018 // in ways that accessing values using original IR values is incorrect. 9019 Plan->disableValue2VPValue(); 9020 9021 VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); 9022 VPlanTransforms::sinkScalarOperands(*Plan); 9023 VPlanTransforms::mergeReplicateRegions(*Plan); 9024 VPlanTransforms::removeDeadRecipes(*Plan); 9025 VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); 9026 9027 // Fold Exit block into its predecessor if possible. 9028 // TODO: Fold block earlier once all VPlan transforms properly maintain a 9029 // VPBasicBlock as exit. 9030 VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting()); 9031 9032 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 9033 return Plan; 9034 } 9035 9036 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9037 // Outer loop handling: They may require CFG and instruction level 9038 // transformations before even evaluating whether vectorization is profitable. 9039 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9040 // the vectorization pipeline. 9041 assert(!OrigLoop->isInnermost()); 9042 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9043 9044 // Create new empty VPlan 9045 auto Plan = std::make_unique<VPlan>(); 9046 9047 // Build hierarchical CFG 9048 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9049 HCFGBuilder.buildHierarchicalCFG(); 9050 9051 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9052 VF *= 2) 9053 Plan->addVF(VF); 9054 9055 SmallPtrSet<Instruction *, 1> DeadInstructions; 9056 VPlanTransforms::VPInstructionsToVPRecipes( 9057 OrigLoop, Plan, 9058 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 9059 DeadInstructions, *PSE.getSE()); 9060 9061 // Remove the existing terminator of the exiting block of the top-most region. 9062 // A BranchOnCount will be added instead when adding the canonical IV recipes. 
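// Roughly (an illustrative VPlan dump, not exact output), after
// addCanonicalIVRecipes the exiting block of the top region ends with:
//   EMIT vp<%index.next> = CanonicalIVIncrement vp<%index>
//   EMIT branch-on-count vp<%index.next>, vp<%vector.trip.count>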
9063 auto *Term =
9064 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9065 Term->eraseFromParent();
9066
9067 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9068 true);
9069 return Plan;
9070 }
9071
9072 // Adjust the recipes for reductions. For in-loop reductions the chain of
9073 // instructions leading from the loop exit instr to the phi needs to be converted
9074 // to reductions, with one operand being vector and the other being the scalar
9075 // reduction chain. For other reductions, a select is introduced between the phi
9076 // and live-out recipes when folding the tail.
9077 void LoopVectorizationPlanner::adjustRecipesForReductions(
9078 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9079 ElementCount MinVF) {
9080 for (auto &Reduction : CM.getInLoopReductionChains()) {
9081 PHINode *Phi = Reduction.first;
9082 const RecurrenceDescriptor &RdxDesc =
9083 Legal->getReductionVars().find(Phi)->second;
9084 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9085
9086 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9087 continue;
9088
9089 // ReductionOperations are ordered top-down from the phi's use to the
9090 // LoopExitValue. We keep track of the previous item (the Chain) to tell
9091 // which of the two operands will remain scalar and which will be reduced.
9092 // For minmax the chain will be the select instructions.
9093 Instruction *Chain = Phi;
9094 for (Instruction *R : ReductionOperations) {
9095 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9096 RecurKind Kind = RdxDesc.getRecurrenceKind();
9097
9098 VPValue *ChainOp = Plan->getVPValue(Chain);
9099 unsigned FirstOpId;
9100 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9101 "Only min/max recurrences allowed for inloop reductions");
9102 // Recognize a call to the llvm.fmuladd intrinsic.
9103 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9104 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9105 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9106 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9107 assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9108 "Expected to replace a VPWidenSelectSC");
9109 FirstOpId = 1;
9110 } else {
9111 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9112 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9113 "Expected to replace a VPWidenSC");
9114 FirstOpId = 0;
9115 }
9116 unsigned VecOpId =
9117 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9118 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9119
9120 auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
9121 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9122 : nullptr;
9123
9124 if (IsFMulAdd) {
9125 // If the instruction is a call to the llvm.fmuladd intrinsic then we
9126 // need to create an fmul recipe to use as the vector operand for the
9127 // fadd reduction.
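// E.g. (illustrative): for %s = call float @llvm.fmuladd.f32(%a, %b, %sum)
// an FMul VPInstruction computing %a * %b is created here, and it becomes the
// vector operand of the VPReductionRecipe that performs the fadd accumulation
// into the scalar reduction chain.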
9128 VPInstruction *FMulRecipe = new VPInstruction( 9129 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9130 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9131 WidenRecipe->getParent()->insert(FMulRecipe, 9132 WidenRecipe->getIterator()); 9133 VecOp = FMulRecipe; 9134 } 9135 VPReductionRecipe *RedRecipe = 9136 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9137 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9138 Plan->removeVPValueFor(R); 9139 Plan->addVPValue(R, RedRecipe); 9140 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9141 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9142 WidenRecipe->eraseFromParent(); 9143 9144 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9145 VPRecipeBase *CompareRecipe = 9146 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9147 assert(isa<VPWidenRecipe>(CompareRecipe) && 9148 "Expected to replace a VPWidenSC"); 9149 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9150 "Expected no remaining users"); 9151 CompareRecipe->eraseFromParent(); 9152 } 9153 Chain = R; 9154 } 9155 } 9156 9157 // If tail is folded by masking, introduce selects between the phi 9158 // and the live-out instruction of each reduction, at the beginning of the 9159 // dedicated latch block. 9160 if (CM.foldTailByMasking()) { 9161 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9162 for (VPRecipeBase &R : 9163 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9164 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9165 if (!PhiR || PhiR->isInLoop()) 9166 continue; 9167 VPValue *Cond = 9168 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9169 VPValue *Red = PhiR->getBackedgeValue(); 9170 assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB && 9171 "reduction recipe must be defined before latch"); 9172 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9173 } 9174 } 9175 } 9176 9177 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9178 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9179 VPSlotTracker &SlotTracker) const { 9180 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9181 IG->getInsertPos()->printAsOperand(O, false); 9182 O << ", "; 9183 getAddr()->printAsOperand(O, SlotTracker); 9184 VPValue *Mask = getMask(); 9185 if (Mask) { 9186 O << ", "; 9187 Mask->printAsOperand(O, SlotTracker); 9188 } 9189 9190 unsigned OpIdx = 0; 9191 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9192 if (!IG->getMember(i)) 9193 continue; 9194 if (getNumStoreOperands() > 0) { 9195 O << "\n" << Indent << " store "; 9196 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9197 O << " to index " << i; 9198 } else { 9199 O << "\n" << Indent << " "; 9200 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9201 O << " = load from index " << i; 9202 } 9203 ++OpIdx; 9204 } 9205 } 9206 #endif 9207 9208 void VPWidenCallRecipe::execute(VPTransformState &State) { 9209 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9210 *this, State); 9211 } 9212 9213 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9214 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9215 State.ILV->setDebugLocFromInst(&I); 9216 9217 // The condition can be loop invariant but still defined inside the 9218 // loop. This means that we can't just use the original 'cond' value. 9219 // We have to take the 'vectorized' value and pick the first lane. 
9220 // Instcombine will make this a no-op. 9221 auto *InvarCond = 9222 InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; 9223 9224 for (unsigned Part = 0; Part < State.UF; ++Part) { 9225 Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part); 9226 Value *Op0 = State.get(getOperand(1), Part); 9227 Value *Op1 = State.get(getOperand(2), Part); 9228 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 9229 State.set(this, Sel, Part); 9230 State.ILV->addMetadata(Sel, &I); 9231 } 9232 } 9233 9234 void VPWidenRecipe::execute(VPTransformState &State) { 9235 auto &I = *cast<Instruction>(getUnderlyingValue()); 9236 auto &Builder = State.Builder; 9237 switch (I.getOpcode()) { 9238 case Instruction::Call: 9239 case Instruction::Br: 9240 case Instruction::PHI: 9241 case Instruction::GetElementPtr: 9242 case Instruction::Select: 9243 llvm_unreachable("This instruction is handled by a different recipe."); 9244 case Instruction::UDiv: 9245 case Instruction::SDiv: 9246 case Instruction::SRem: 9247 case Instruction::URem: 9248 case Instruction::Add: 9249 case Instruction::FAdd: 9250 case Instruction::Sub: 9251 case Instruction::FSub: 9252 case Instruction::FNeg: 9253 case Instruction::Mul: 9254 case Instruction::FMul: 9255 case Instruction::FDiv: 9256 case Instruction::FRem: 9257 case Instruction::Shl: 9258 case Instruction::LShr: 9259 case Instruction::AShr: 9260 case Instruction::And: 9261 case Instruction::Or: 9262 case Instruction::Xor: { 9263 // Just widen unops and binops. 9264 State.ILV->setDebugLocFromInst(&I); 9265 9266 for (unsigned Part = 0; Part < State.UF; ++Part) { 9267 SmallVector<Value *, 2> Ops; 9268 for (VPValue *VPOp : operands()) 9269 Ops.push_back(State.get(VPOp, Part)); 9270 9271 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 9272 9273 if (auto *VecOp = dyn_cast<Instruction>(V)) { 9274 VecOp->copyIRFlags(&I); 9275 9276 // If the instruction is vectorized and was in a basic block that needed 9277 // predication, we can't propagate poison-generating flags (nuw/nsw, 9278 // exact, etc.). The control flow has been linearized and the 9279 // instruction is no longer guarded by the predicate, which could make 9280 // the flag properties to no longer hold. 9281 if (State.MayGeneratePoisonRecipes.contains(this)) 9282 VecOp->dropPoisonGeneratingFlags(); 9283 } 9284 9285 // Use this vector value for all users of the original instruction. 9286 State.set(this, V, Part); 9287 State.ILV->addMetadata(V, &I); 9288 } 9289 9290 break; 9291 } 9292 case Instruction::Freeze: { 9293 State.ILV->setDebugLocFromInst(&I); 9294 9295 for (unsigned Part = 0; Part < State.UF; ++Part) { 9296 Value *Op = State.get(getOperand(0), Part); 9297 9298 Value *Freeze = Builder.CreateFreeze(Op); 9299 State.set(this, Freeze, Part); 9300 } 9301 break; 9302 } 9303 case Instruction::ICmp: 9304 case Instruction::FCmp: { 9305 // Widen compares. Generate vector compares. 9306 bool FCmp = (I.getOpcode() == Instruction::FCmp); 9307 auto *Cmp = cast<CmpInst>(&I); 9308 State.ILV->setDebugLocFromInst(Cmp); 9309 for (unsigned Part = 0; Part < State.UF; ++Part) { 9310 Value *A = State.get(getOperand(0), Part); 9311 Value *B = State.get(getOperand(1), Part); 9312 Value *C = nullptr; 9313 if (FCmp) { 9314 // Propagate fast math flags. 
9315 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9316 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9317 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9318 } else { 9319 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9320 } 9321 State.set(this, C, Part); 9322 State.ILV->addMetadata(C, &I); 9323 } 9324 9325 break; 9326 } 9327 9328 case Instruction::ZExt: 9329 case Instruction::SExt: 9330 case Instruction::FPToUI: 9331 case Instruction::FPToSI: 9332 case Instruction::FPExt: 9333 case Instruction::PtrToInt: 9334 case Instruction::IntToPtr: 9335 case Instruction::SIToFP: 9336 case Instruction::UIToFP: 9337 case Instruction::Trunc: 9338 case Instruction::FPTrunc: 9339 case Instruction::BitCast: { 9340 auto *CI = cast<CastInst>(&I); 9341 State.ILV->setDebugLocFromInst(CI); 9342 9343 /// Vectorize casts. 9344 Type *DestTy = (State.VF.isScalar()) 9345 ? CI->getType() 9346 : VectorType::get(CI->getType(), State.VF); 9347 9348 for (unsigned Part = 0; Part < State.UF; ++Part) { 9349 Value *A = State.get(getOperand(0), Part); 9350 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9351 State.set(this, Cast, Part); 9352 State.ILV->addMetadata(Cast, &I); 9353 } 9354 break; 9355 } 9356 default: 9357 // This instruction is not vectorized by simple widening. 9358 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9359 llvm_unreachable("Unhandled instruction!"); 9360 } // end of switch. 9361 } 9362 9363 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9364 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9365 // Construct a vector GEP by widening the operands of the scalar GEP as 9366 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9367 // results in a vector of pointers when at least one operand of the GEP 9368 // is vector-typed. Thus, to keep the representation compact, we only use 9369 // vector-typed operands for loop-varying values. 9370 9371 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9372 // If we are vectorizing, but the GEP has only loop-invariant operands, 9373 // the GEP we build (by only using vector-typed operands for 9374 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9375 // produce a vector of pointers, we need to either arbitrarily pick an 9376 // operand to broadcast, or broadcast a clone of the original GEP. 9377 // Here, we broadcast a clone of the original. 9378 // 9379 // TODO: If at some point we decide to scalarize instructions having 9380 // loop-invariant operands, this special case will no longer be 9381 // required. We would add the scalarization decision to 9382 // collectLoopScalars() and teach getVectorValue() to broadcast 9383 // the lane-zero scalar value. 9384 auto *Clone = State.Builder.Insert(GEP->clone()); 9385 for (unsigned Part = 0; Part < State.UF; ++Part) { 9386 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9387 State.set(this, EntryPart, Part); 9388 State.ILV->addMetadata(EntryPart, GEP); 9389 } 9390 } else { 9391 // If the GEP has at least one loop-varying operand, we are sure to 9392 // produce a vector of pointers. But if we are only unrolling, we want 9393 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9394 // produce with the code below will be scalar (if VF == 1) or vector 9395 // (otherwise). Note that for the unroll-only case, we still maintain 9396 // values in the vector mapping with initVector, as we do for other 9397 // instructions. 
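// Illustrative example (names are made up): for
//   %g = getelementptr inbounds float, ptr %base, i64 %iv
// with a loop-invariant %base, the loop below keeps %base scalar and widens
// only the index, producing per unroll part something like
//   %vec.gep = getelementptr inbounds float, ptr %base, <VF x i64> %vec.iv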
9398 for (unsigned Part = 0; Part < State.UF; ++Part) { 9399 // The pointer operand of the new GEP. If it's loop-invariant, we 9400 // won't broadcast it. 9401 auto *Ptr = IsPtrLoopInvariant 9402 ? State.get(getOperand(0), VPIteration(0, 0)) 9403 : State.get(getOperand(0), Part); 9404 9405 // Collect all the indices for the new GEP. If any index is 9406 // loop-invariant, we won't broadcast it. 9407 SmallVector<Value *, 4> Indices; 9408 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 9409 VPValue *Operand = getOperand(I); 9410 if (IsIndexLoopInvariant[I - 1]) 9411 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 9412 else 9413 Indices.push_back(State.get(Operand, Part)); 9414 } 9415 9416 // If the GEP instruction is vectorized and was in a basic block that 9417 // needed predication, we can't propagate the poison-generating 'inbounds' 9418 // flag. The control flow has been linearized and the GEP is no longer 9419 // guarded by the predicate, which could make the 'inbounds' properties to 9420 // no longer hold. 9421 bool IsInBounds = 9422 GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; 9423 9424 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 9425 // but it should be a vector, otherwise. 9426 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, 9427 Indices, "", IsInBounds); 9428 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 9429 "NewGEP is not a pointer vector"); 9430 State.set(this, NewGEP, Part); 9431 State.ILV->addMetadata(NewGEP, GEP); 9432 } 9433 } 9434 } 9435 9436 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9437 assert(!State.Instance && "Int or FP induction being replicated."); 9438 9439 Value *Start = getStartValue()->getLiveInIRValue(); 9440 const InductionDescriptor &ID = getInductionDescriptor(); 9441 TruncInst *Trunc = getTruncInst(); 9442 IRBuilderBase &Builder = State.Builder; 9443 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 9444 assert(State.VF.isVector() && "must have vector VF"); 9445 9446 // The value from the original loop to which we are mapping the new induction 9447 // variable. 9448 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 9449 9450 // Fast-math-flags propagate from the original induction instruction. 9451 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9452 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 9453 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 9454 9455 // Now do the actual transformations, and start with fetching the step value. 
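// As a sketch of the result for an integer IV with VF=4, UF=1 and step 1
// (illustrative IR, not exact output):
//   vector.ph:
//     %induction = add <4 x i64> %splat.start, <i64 0, i64 1, i64 2, i64 3>
//   vector.body:
//     %vec.ind = phi <4 x i64> [ %induction, %vector.ph ], [ %step.add, ... ]
//     %step.add = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>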
9456 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9457 9458 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 9459 "Expected either an induction phi-node or a truncate of it!"); 9460 9461 // Construct the initial value of the vector IV in the vector loop preheader 9462 auto CurrIP = Builder.saveIP(); 9463 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9464 Builder.SetInsertPoint(VectorPH->getTerminator()); 9465 if (isa<TruncInst>(EntryVal)) { 9466 assert(Start->getType()->isIntegerTy() && 9467 "Truncation requires an integer type"); 9468 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 9469 Step = Builder.CreateTrunc(Step, TruncType); 9470 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 9471 } 9472 9473 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 9474 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 9475 Value *SteppedStart = getStepVector( 9476 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); 9477 9478 // We create vector phi nodes for both integer and floating-point induction 9479 // variables. Here, we determine the kind of arithmetic we will perform. 9480 Instruction::BinaryOps AddOp; 9481 Instruction::BinaryOps MulOp; 9482 if (Step->getType()->isIntegerTy()) { 9483 AddOp = Instruction::Add; 9484 MulOp = Instruction::Mul; 9485 } else { 9486 AddOp = ID.getInductionOpcode(); 9487 MulOp = Instruction::FMul; 9488 } 9489 9490 // Multiply the vectorization factor by the step using integer or 9491 // floating-point arithmetic as appropriate. 9492 Type *StepType = Step->getType(); 9493 Value *RuntimeVF; 9494 if (Step->getType()->isFloatingPointTy()) 9495 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 9496 else 9497 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 9498 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 9499 9500 // Create a vector splat to use in the induction update. 9501 // 9502 // FIXME: If the step is non-constant, we create the vector splat with 9503 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 9504 // handle a constant vector splat. 9505 Value *SplatVF = isa<Constant>(Mul) 9506 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 9507 : Builder.CreateVectorSplat(State.VF, Mul); 9508 Builder.restoreIP(CurrIP); 9509 9510 // We may need to add the step a number of times, depending on the unroll 9511 // factor. The last of those goes into the PHI. 9512 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 9513 &*State.CFG.PrevBB->getFirstInsertionPt()); 9514 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 9515 Instruction *LastInduction = VecInd; 9516 for (unsigned Part = 0; Part < State.UF; ++Part) { 9517 State.set(this, LastInduction, Part); 9518 9519 if (isa<TruncInst>(EntryVal)) 9520 State.ILV->addMetadata(LastInduction, EntryVal); 9521 9522 LastInduction = cast<Instruction>( 9523 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 9524 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 9525 } 9526 9527 LastInduction->setName("vec.ind.next"); 9528 VecInd->addIncoming(SteppedStart, VectorPH); 9529 // Add induction update using an incorrect block temporarily. The phi node 9530 // will be fixed after VPlan execution. Note that at this point the latch 9531 // block cannot be used, as it does not exist yet. 9532 // TODO: Model increment value in VPlan, by turning the recipe into a 9533 // multi-def and a subclass of VPHeaderPHIRecipe. 
9534 VecInd->addIncoming(LastInduction, VectorPH); 9535 } 9536 9537 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 9538 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 9539 "Not a pointer induction according to InductionDescriptor!"); 9540 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 9541 "Unexpected type."); 9542 9543 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 9544 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 9545 9546 if (onlyScalarsGenerated(State.VF)) { 9547 // This is the normalized GEP that starts counting at zero. 9548 Value *PtrInd = State.Builder.CreateSExtOrTrunc( 9549 CanonicalIV, IndDesc.getStep()->getType()); 9550 // Determine the number of scalars we need to generate for each unroll 9551 // iteration. If the instruction is uniform, we only need to generate the 9552 // first lane. Otherwise, we generate all VF values. 9553 bool IsUniform = vputils::onlyFirstLaneUsed(this); 9554 assert((IsUniform || !State.VF.isScalable()) && 9555 "Cannot scalarize a scalable VF"); 9556 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 9557 9558 for (unsigned Part = 0; Part < State.UF; ++Part) { 9559 Value *PartStart = 9560 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); 9561 9562 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 9563 Value *Idx = State.Builder.CreateAdd( 9564 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 9565 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); 9566 9567 Value *Step = CreateStepValue(IndDesc.getStep(), SE, 9568 State.CFG.PrevBB->getTerminator()); 9569 Value *SclrGep = emitTransformedIndex( 9570 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); 9571 SclrGep->setName("next.gep"); 9572 State.set(this, SclrGep, VPIteration(Part, Lane)); 9573 } 9574 } 9575 return; 9576 } 9577 9578 assert(isa<SCEVConstant>(IndDesc.getStep()) && 9579 "Induction step not a SCEV constant!"); 9580 Type *PhiType = IndDesc.getStep()->getType(); 9581 9582 // Build a pointer phi 9583 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 9584 Type *ScStValueType = ScalarStartValue->getType(); 9585 PHINode *NewPointerPhi = 9586 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 9587 9588 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9589 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 9590 9591 // A pointer induction, performed by using a gep 9592 const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout(); 9593 Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); 9594 9595 const SCEV *ScalarStep = IndDesc.getStep(); 9596 SCEVExpander Exp(SE, DL, "induction"); 9597 Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 9598 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 9599 Value *NumUnrolledElems = 9600 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 9601 Value *InductionGEP = GetElementPtrInst::Create( 9602 IndDesc.getElementType(), NewPointerPhi, 9603 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 9604 InductionLoc); 9605 // Add induction update using an incorrect block temporarily. The phi node 9606 // will be fixed after VPlan execution. Note that at this point the latch 9607 // block cannot be used, as it does not exist yet. 9608 // TODO: Model increment value in VPlan, by turning the recipe into a 9609 // multi-def and a subclass of VPHeaderPHIRecipe. 
9610 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 9611 9612 // Create UF many actual address geps that use the pointer 9613 // phi as base and a vectorized version of the step value 9614 // (<step*0, ..., step*N>) as offset. 9615 for (unsigned Part = 0; Part < State.UF; ++Part) { 9616 Type *VecPhiType = VectorType::get(PhiType, State.VF); 9617 Value *StartOffsetScalar = 9618 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 9619 Value *StartOffset = 9620 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 9621 // Create a vector of consecutive numbers from zero to VF. 9622 StartOffset = State.Builder.CreateAdd( 9623 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 9624 9625 Value *GEP = State.Builder.CreateGEP( 9626 IndDesc.getElementType(), NewPointerPhi, 9627 State.Builder.CreateMul( 9628 StartOffset, 9629 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), 9630 "vector.gep")); 9631 State.set(this, GEP, Part); 9632 } 9633 } 9634 9635 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 9636 assert(!State.Instance && "VPScalarIVStepsRecipe being replicated."); 9637 9638 // Fast-math-flags propagate from the original induction instruction. 9639 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9640 if (IndDesc.getInductionBinOp() && 9641 isa<FPMathOperator>(IndDesc.getInductionBinOp())) 9642 State.Builder.setFastMathFlags( 9643 IndDesc.getInductionBinOp()->getFastMathFlags()); 9644 9645 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9646 auto CreateScalarIV = [&](Value *&Step) -> Value * { 9647 Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0)); 9648 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); 9649 if (!isCanonical() || CanonicalIV->getType() != Ty) { 9650 ScalarIV = 9651 Ty->isIntegerTy() 9652 ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty) 9653 : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty); 9654 ScalarIV = emitTransformedIndex(State.Builder, ScalarIV, 9655 getStartValue()->getLiveInIRValue(), Step, 9656 IndDesc); 9657 ScalarIV->setName("offset.idx"); 9658 } 9659 if (TruncToTy) { 9660 assert(Step->getType()->isIntegerTy() && 9661 "Truncation requires an integer step"); 9662 ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy); 9663 Step = State.Builder.CreateTrunc(Step, TruncToTy); 9664 } 9665 return ScalarIV; 9666 }; 9667 9668 Value *ScalarIV = CreateScalarIV(Step); 9669 if (State.VF.isVector()) { 9670 buildScalarSteps(ScalarIV, Step, IndDesc, this, State); 9671 return; 9672 } 9673 9674 for (unsigned Part = 0; Part < State.UF; ++Part) { 9675 assert(!State.VF.isScalable() && "scalable vectors not yet supported."); 9676 Value *EntryPart; 9677 if (Step->getType()->isFloatingPointTy()) { 9678 Value *StartIdx = 9679 getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part); 9680 // Floating-point operations inherit FMF via the builder's flags. 
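// Illustrative note: this path is only reached with a scalar VF, so for
// unroll part P the next two statements compute
//   EntryPart = ScalarIV <fadd/fsub> (float)(VF * P) * Step
// i.e. the scalar induction value at the start of part P (with VF == 1 the
// start index is simply P).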
9681 Value *MulOp = State.Builder.CreateFMul(StartIdx, Step); 9682 EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(), 9683 ScalarIV, MulOp); 9684 } else { 9685 Value *StartIdx = 9686 getRuntimeVF(State.Builder, Step->getType(), State.VF * Part); 9687 EntryPart = State.Builder.CreateAdd( 9688 ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction"); 9689 } 9690 State.set(this, EntryPart, Part); 9691 } 9692 } 9693 9694 void VPBlendRecipe::execute(VPTransformState &State) { 9695 State.ILV->setDebugLocFromInst(Phi); 9696 // We know that all PHIs in non-header blocks are converted into 9697 // selects, so we don't have to worry about the insertion order and we 9698 // can just use the builder. 9699 // At this point we generate the predication tree. There may be 9700 // duplications since this is a simple recursive scan, but future 9701 // optimizations will clean it up. 9702 9703 unsigned NumIncoming = getNumIncomingValues(); 9704 9705 // Generate a sequence of selects of the form: 9706 // SELECT(Mask3, In3, 9707 // SELECT(Mask2, In2, 9708 // SELECT(Mask1, In1, 9709 // In0))) 9710 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9711 // are essentially undef are taken from In0. 9712 InnerLoopVectorizer::VectorParts Entry(State.UF); 9713 for (unsigned In = 0; In < NumIncoming; ++In) { 9714 for (unsigned Part = 0; Part < State.UF; ++Part) { 9715 // We might have single edge PHIs (blocks) - use an identity 9716 // 'select' for the first PHI operand. 9717 Value *In0 = State.get(getIncomingValue(In), Part); 9718 if (In == 0) 9719 Entry[Part] = In0; // Initialize with the first incoming value. 9720 else { 9721 // Select between the current value and the previous incoming edge 9722 // based on the incoming mask. 9723 Value *Cond = State.get(getMask(In), Part); 9724 Entry[Part] = 9725 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9726 } 9727 } 9728 } 9729 for (unsigned Part = 0; Part < State.UF; ++Part) 9730 State.set(this, Entry[Part], Part); 9731 } 9732 9733 void VPInterleaveRecipe::execute(VPTransformState &State) { 9734 assert(!State.Instance && "Interleave group being replicated."); 9735 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9736 getStoredValues(), getMask()); 9737 } 9738 9739 void VPReductionRecipe::execute(VPTransformState &State) { 9740 assert(!State.Instance && "Reduction being replicated."); 9741 Value *PrevInChain = State.get(getChainOp(), 0); 9742 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9743 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9744 // Propagate the fast-math flags carried by the underlying instruction. 
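// Rough sketch (names made up) of what the per-part loop below emits for an
// ordered (in-loop) FP add reduction with UF = 2:
//   rdx.part0 = call float @llvm.vector.reduce.fadd(float chain, <VF x float> vec.op0)
//   rdx.part1 = call float @llvm.vector.reduce.fadd(float rdx.part0, <VF x float> vec.op1)
// Unordered reductions instead reduce each part's vector independently and
// combine the result with that part's chain value via the reduction binop or
// a min/max.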
9745 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9746 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9747 for (unsigned Part = 0; Part < State.UF; ++Part) {
9748 Value *NewVecOp = State.get(getVecOp(), Part);
9749 if (VPValue *Cond = getCondOp()) {
9750 Value *NewCond = State.get(Cond, Part);
9751 VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9752 Value *Iden = RdxDesc->getRecurrenceIdentity(
9753 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9754 Value *IdenVec =
9755 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9756 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9757 NewVecOp = Select;
9758 }
9759 Value *NewRed;
9760 Value *NextInChain;
9761 if (IsOrdered) {
9762 if (State.VF.isVector())
9763 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9764 PrevInChain);
9765 else
9766 NewRed = State.Builder.CreateBinOp(
9767 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9768 NewVecOp);
9769 PrevInChain = NewRed;
9770 } else {
9771 PrevInChain = State.get(getChainOp(), Part);
9772 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9773 }
9774 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9775 NextInChain =
9776 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9777 NewRed, PrevInChain);
9778 } else if (IsOrdered)
9779 NextInChain = NewRed;
9780 else
9781 NextInChain = State.Builder.CreateBinOp(
9782 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9783 PrevInChain);
9784 State.set(this, NextInChain, Part);
9785 }
9786 }
9787 
9788 void VPReplicateRecipe::execute(VPTransformState &State) {
9789 if (State.Instance) { // Generate a single instance.
9790 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9791 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9792 IsPredicated, State);
9793 // Insert scalar instance, packing it into a vector.
9794 if (AlsoPack && State.VF.isVector()) {
9795 // If we're constructing lane 0, initialize to start from poison.
9796 if (State.Instance->Lane.isFirstLane()) {
9797 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9798 Value *Poison = PoisonValue::get(
9799 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9800 State.set(this, Poison, State.Instance->Part);
9801 }
9802 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9803 }
9804 return;
9805 }
9806 
9807 // Generate scalar instances for all VF lanes of all UF parts, unless the
9808 // instruction is uniform, in which case generate only the first lane for each
9809 // of the UF parts.
9810 unsigned EndLane = IsUniform ?
1 : State.VF.getKnownMinValue(); 9811 assert((!State.VF.isScalable() || IsUniform) && 9812 "Can't scalarize a scalable vector"); 9813 for (unsigned Part = 0; Part < State.UF; ++Part) 9814 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9815 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, 9816 VPIteration(Part, Lane), IsPredicated, 9817 State); 9818 } 9819 9820 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 9821 assert(State.Instance && "Branch on Mask works only on single instance."); 9822 9823 unsigned Part = State.Instance->Part; 9824 unsigned Lane = State.Instance->Lane.getKnownLane(); 9825 9826 Value *ConditionBit = nullptr; 9827 VPValue *BlockInMask = getMask(); 9828 if (BlockInMask) { 9829 ConditionBit = State.get(BlockInMask, Part); 9830 if (ConditionBit->getType()->isVectorTy()) 9831 ConditionBit = State.Builder.CreateExtractElement( 9832 ConditionBit, State.Builder.getInt32(Lane)); 9833 } else // Block in mask is all-one. 9834 ConditionBit = State.Builder.getTrue(); 9835 9836 // Replace the temporary unreachable terminator with a new conditional branch, 9837 // whose two destinations will be set later when they are created. 9838 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 9839 assert(isa<UnreachableInst>(CurrentTerminator) && 9840 "Expected to replace unreachable terminator with conditional branch."); 9841 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 9842 CondBr->setSuccessor(0, nullptr); 9843 ReplaceInstWithInst(CurrentTerminator, CondBr); 9844 } 9845 9846 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 9847 assert(State.Instance && "Predicated instruction PHI works per instance."); 9848 Instruction *ScalarPredInst = 9849 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 9850 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 9851 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 9852 assert(PredicatingBB && "Predicated block has no single predecessor."); 9853 assert(isa<VPReplicateRecipe>(getOperand(0)) && 9854 "operand must be VPReplicateRecipe"); 9855 9856 // By current pack/unpack logic we need to generate only a single phi node: if 9857 // a vector value for the predicated instruction exists at this point it means 9858 // the instruction has vector users only, and a phi for the vector value is 9859 // needed. In this case the recipe of the predicated instruction is marked to 9860 // also do that packing, thereby "hoisting" the insert-element sequence. 9861 // Otherwise, a phi node for the scalar value is needed. 9862 unsigned Part = State.Instance->Part; 9863 if (State.hasVectorValue(getOperand(0), Part)) { 9864 Value *VectorValue = State.get(getOperand(0), Part); 9865 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9866 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9867 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9868 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9869 if (State.hasVectorValue(this, Part)) 9870 State.reset(this, VPhi, Part); 9871 else 9872 State.set(this, VPhi, Part); 9873 // NOTE: Currently we need to update the value of the operand, so the next 9874 // predicated iteration inserts its generated value in the correct vector. 
9875 State.reset(getOperand(0), VPhi, Part); 9876 } else { 9877 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9878 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9879 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9880 PredicatingBB); 9881 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9882 if (State.hasScalarValue(this, *State.Instance)) 9883 State.reset(this, Phi, *State.Instance); 9884 else 9885 State.set(this, Phi, *State.Instance); 9886 // NOTE: Currently we need to update the value of the operand, so the next 9887 // predicated iteration inserts its generated value in the correct vector. 9888 State.reset(getOperand(0), Phi, *State.Instance); 9889 } 9890 } 9891 9892 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9893 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9894 9895 // Attempt to issue a wide load. 9896 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9897 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9898 9899 assert((LI || SI) && "Invalid Load/Store instruction"); 9900 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9901 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9902 9903 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9904 9905 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9906 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9907 bool CreateGatherScatter = !Consecutive; 9908 9909 auto &Builder = State.Builder; 9910 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9911 bool isMaskRequired = getMask(); 9912 if (isMaskRequired) 9913 for (unsigned Part = 0; Part < State.UF; ++Part) 9914 BlockInMaskParts[Part] = State.get(getMask(), Part); 9915 9916 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9917 // Calculate the pointer for the specific unroll-part. 9918 GetElementPtrInst *PartPtr = nullptr; 9919 9920 bool InBounds = false; 9921 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9922 InBounds = gep->isInBounds(); 9923 if (Reverse) { 9924 // If the address is consecutive but reversed, then the 9925 // wide store needs to start at the last vector element. 9926 // RunTimeVF = VScale * VF.getKnownMinValue() 9927 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9928 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 9929 // NumElt = -Part * RunTimeVF 9930 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 9931 // LastLane = 1 - RunTimeVF 9932 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 9933 PartPtr = 9934 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 9935 PartPtr->setIsInBounds(InBounds); 9936 PartPtr = cast<GetElementPtrInst>( 9937 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 9938 PartPtr->setIsInBounds(InBounds); 9939 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
9940 BlockInMaskParts[Part] = 9941 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9942 } else { 9943 Value *Increment = 9944 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 9945 PartPtr = cast<GetElementPtrInst>( 9946 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 9947 PartPtr->setIsInBounds(InBounds); 9948 } 9949 9950 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 9951 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 9952 }; 9953 9954 // Handle Stores: 9955 if (SI) { 9956 State.ILV->setDebugLocFromInst(SI); 9957 9958 for (unsigned Part = 0; Part < State.UF; ++Part) { 9959 Instruction *NewSI = nullptr; 9960 Value *StoredVal = State.get(StoredValue, Part); 9961 if (CreateGatherScatter) { 9962 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9963 Value *VectorGep = State.get(getAddr(), Part); 9964 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9965 MaskPart); 9966 } else { 9967 if (Reverse) { 9968 // If we store to reverse consecutive memory locations, then we need 9969 // to reverse the order of elements in the stored value. 9970 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9971 // We don't want to update the value in the map as it might be used in 9972 // another expression. So don't call resetVectorValue(StoredVal). 9973 } 9974 auto *VecPtr = 9975 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9976 if (isMaskRequired) 9977 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9978 BlockInMaskParts[Part]); 9979 else 9980 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9981 } 9982 State.ILV->addMetadata(NewSI, SI); 9983 } 9984 return; 9985 } 9986 9987 // Handle loads. 9988 assert(LI && "Must have a load instruction"); 9989 State.ILV->setDebugLocFromInst(LI); 9990 for (unsigned Part = 0; Part < State.UF; ++Part) { 9991 Value *NewLI; 9992 if (CreateGatherScatter) { 9993 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9994 Value *VectorGep = State.get(getAddr(), Part); 9995 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 9996 nullptr, "wide.masked.gather"); 9997 State.ILV->addMetadata(NewLI, LI); 9998 } else { 9999 auto *VecPtr = 10000 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10001 if (isMaskRequired) 10002 NewLI = Builder.CreateMaskedLoad( 10003 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 10004 PoisonValue::get(DataTy), "wide.masked.load"); 10005 else 10006 NewLI = 10007 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 10008 10009 // Add metadata to the load, but setVectorValue to the reverse shuffle. 10010 State.ILV->addMetadata(NewLI, LI); 10011 if (Reverse) 10012 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 10013 } 10014 10015 State.set(getVPSingleValue(), NewLI, Part); 10016 } 10017 } 10018 10019 // Determine how to lower the scalar epilogue, which depends on 1) optimising 10020 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 10021 // predication, and 4) a TTI hook that analyses whether the loop is suitable 10022 // for predication. 
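// For example (an illustration of the precedence implemented below, not an
// exhaustive list): a function carrying the optsize attribute always gets
// CM_ScalarEpilogueNotAllowedOptSize, even if predication was requested via
// hints; otherwise an explicit PreferPredicateOverEpilogue setting wins over
// the loop hints, which in turn win over the TTI preference.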
10023 static ScalarEpilogueLowering getScalarEpilogueLowering( 10024 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 10025 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10026 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 10027 LoopVectorizationLegality &LVL) { 10028 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10029 // don't look at hints or options, and don't request a scalar epilogue. 10030 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10031 // LoopAccessInfo (due to code dependency and not being able to reliably get 10032 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10033 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10034 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10035 // back to the old way and vectorize with versioning when forced. See D81345.) 10036 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10037 PGSOQueryType::IRPass) && 10038 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10039 return CM_ScalarEpilogueNotAllowedOptSize; 10040 10041 // 2) If set, obey the directives 10042 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10043 switch (PreferPredicateOverEpilogue) { 10044 case PreferPredicateTy::ScalarEpilogue: 10045 return CM_ScalarEpilogueAllowed; 10046 case PreferPredicateTy::PredicateElseScalarEpilogue: 10047 return CM_ScalarEpilogueNotNeededUsePredicate; 10048 case PreferPredicateTy::PredicateOrDontVectorize: 10049 return CM_ScalarEpilogueNotAllowedUsePredicate; 10050 }; 10051 } 10052 10053 // 3) If set, obey the hints 10054 switch (Hints.getPredicate()) { 10055 case LoopVectorizeHints::FK_Enabled: 10056 return CM_ScalarEpilogueNotNeededUsePredicate; 10057 case LoopVectorizeHints::FK_Disabled: 10058 return CM_ScalarEpilogueAllowed; 10059 }; 10060 10061 // 4) if the TTI hook indicates this is profitable, request predication. 10062 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10063 LVL.getLAI())) 10064 return CM_ScalarEpilogueNotNeededUsePredicate; 10065 10066 return CM_ScalarEpilogueAllowed; 10067 } 10068 10069 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10070 // If Values have been set for this Def return the one relevant for \p Part. 10071 if (hasVectorValue(Def, Part)) 10072 return Data.PerPartOutput[Def][Part]; 10073 10074 if (!hasScalarValue(Def, {Part, 0})) { 10075 Value *IRV = Def->getLiveInIRValue(); 10076 Value *B = ILV->getBroadcastInstrs(IRV); 10077 set(Def, B, Part); 10078 return B; 10079 } 10080 10081 Value *ScalarValue = get(Def, {Part, 0}); 10082 // If we aren't vectorizing, we can just copy the scalar map values over 10083 // to the vector map. 10084 if (VF.isScalar()) { 10085 set(Def, ScalarValue, Part); 10086 return ScalarValue; 10087 } 10088 10089 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10090 bool IsUniform = RepR && RepR->isUniform(); 10091 10092 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 10093 // Check if there is a scalar value for the selected lane. 10094 if (!hasScalarValue(Def, {Part, LastLane})) { 10095 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 
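// In that case only lane 0 has been generated for this part, so treat the
// value as uniform and fall back to broadcasting lane 0 below.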
10096 assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) || 10097 isa<VPScalarIVStepsRecipe>(Def->getDef())) && 10098 "unexpected recipe found to be invariant"); 10099 IsUniform = true; 10100 LastLane = 0; 10101 } 10102 10103 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10104 // Set the insert point after the last scalarized instruction or after the 10105 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10106 // will directly follow the scalar definitions. 10107 auto OldIP = Builder.saveIP(); 10108 auto NewIP = 10109 isa<PHINode>(LastInst) 10110 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10111 : std::next(BasicBlock::iterator(LastInst)); 10112 Builder.SetInsertPoint(&*NewIP); 10113 10114 // However, if we are vectorizing, we need to construct the vector values. 10115 // If the value is known to be uniform after vectorization, we can just 10116 // broadcast the scalar value corresponding to lane zero for each unroll 10117 // iteration. Otherwise, we construct the vector values using 10118 // insertelement instructions. Since the resulting vectors are stored in 10119 // State, we will only generate the insertelements once. 10120 Value *VectorValue = nullptr; 10121 if (IsUniform) { 10122 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10123 set(Def, VectorValue, Part); 10124 } else { 10125 // Initialize packing with insertelements to start from undef. 10126 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10127 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10128 set(Def, Undef, Part); 10129 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10130 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10131 VectorValue = get(Def, Part); 10132 } 10133 Builder.restoreIP(OldIP); 10134 return VectorValue; 10135 } 10136 10137 // Process the loop in the VPlan-native vectorization path. This path builds 10138 // VPlan upfront in the vectorization pipeline, which allows to apply 10139 // VPlan-to-VPlan transformations from the very beginning without modifying the 10140 // input LLVM IR. 10141 static bool processLoopInVPlanNativePath( 10142 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10143 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10144 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10145 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10146 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10147 LoopVectorizationRequirements &Requirements) { 10148 10149 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10150 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10151 return false; 10152 } 10153 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10154 Function *F = L->getHeader()->getParent(); 10155 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10156 10157 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10158 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10159 10160 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10161 &Hints, IAI); 10162 // Use the planner for outer loop vectorization. 10163 // TODO: CM is not used at this point inside the planner. Turn CM into an 10164 // optional argument if we don't need it in the future. 10165 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10166 Requirements, ORE); 10167 10168 // Get user vectorization factor. 
10169 ElementCount UserVF = Hints.getWidth(); 10170 10171 CM.collectElementTypesForWidening(); 10172 10173 // Plan how to best vectorize, return the best VF and its cost. 10174 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10175 10176 // If we are stress testing VPlan builds, do not attempt to generate vector 10177 // code. Masked vector code generation support will follow soon. 10178 // Also, do not attempt to vectorize if no vector code will be produced. 10179 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) 10180 return false; 10181 10182 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10183 10184 { 10185 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10186 F->getParent()->getDataLayout()); 10187 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10188 &CM, BFI, PSI, Checks); 10189 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10190 << L->getHeader()->getParent()->getName() << "\"\n"); 10191 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); 10192 } 10193 10194 // Mark the loop as already vectorized to avoid vectorizing again. 10195 Hints.setAlreadyVectorized(); 10196 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10197 return true; 10198 } 10199 10200 // Emit a remark if there are stores to floats that required a floating point 10201 // extension. If the vectorized loop was generated with floating point there 10202 // will be a performance penalty from the conversion overhead and the change in 10203 // the vector width. 10204 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10205 SmallVector<Instruction *, 4> Worklist; 10206 for (BasicBlock *BB : L->getBlocks()) { 10207 for (Instruction &Inst : *BB) { 10208 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10209 if (S->getValueOperand()->getType()->isFloatTy()) 10210 Worklist.push_back(S); 10211 } 10212 } 10213 } 10214 10215 // Traverse the floating point stores upwards searching, for floating point 10216 // conversions. 10217 SmallPtrSet<const Instruction *, 4> Visited; 10218 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10219 while (!Worklist.empty()) { 10220 auto *I = Worklist.pop_back_val(); 10221 if (!L->contains(I)) 10222 continue; 10223 if (!Visited.insert(I).second) 10224 continue; 10225 10226 // Emit a remark if the floating point store required a floating 10227 // point conversion. 10228 // TODO: More work could be done to identify the root cause such as a 10229 // constant or a function return type and point the user to it. 10230 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10231 ORE->emit([&]() { 10232 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10233 I->getDebugLoc(), L->getHeader()) 10234 << "floating point conversion changes vector width. " 10235 << "Mixed floating point precision requires an up/down " 10236 << "cast that will negatively impact performance."; 10237 }); 10238 10239 for (Use &Op : I->operands()) 10240 if (auto *OpI = dyn_cast<Instruction>(Op)) 10241 Worklist.push_back(OpI); 10242 } 10243 } 10244 10245 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10246 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10247 !EnableLoopInterleaving), 10248 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10249 !EnableLoopVectorization) {} 10250 10251 bool LoopVectorizePass::processLoop(Loop *L) { 10252 assert((EnableVPlanNativePath || L->isInnermost()) && 10253 "VPlan-native path is not enabled. 
Only process inner loops."); 10254 10255 #ifndef NDEBUG 10256 const std::string DebugLocStr = getDebugLocString(L); 10257 #endif /* NDEBUG */ 10258 10259 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 10260 << L->getHeader()->getParent()->getName() << "' from " 10261 << DebugLocStr << "\n"); 10262 10263 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10264 10265 LLVM_DEBUG( 10266 dbgs() << "LV: Loop hints:" 10267 << " force=" 10268 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10269 ? "disabled" 10270 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10271 ? "enabled" 10272 : "?")) 10273 << " width=" << Hints.getWidth() 10274 << " interleave=" << Hints.getInterleave() << "\n"); 10275 10276 // Function containing loop 10277 Function *F = L->getHeader()->getParent(); 10278 10279 // Looking at the diagnostic output is the only way to determine if a loop 10280 // was vectorized (other than looking at the IR or machine code), so it 10281 // is important to generate an optimization remark for each loop. Most of 10282 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10283 // generated as OptimizationRemark and OptimizationRemarkMissed are 10284 // less verbose reporting vectorized loops and unvectorized loops that may 10285 // benefit from vectorization, respectively. 10286 10287 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10288 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10289 return false; 10290 } 10291 10292 PredicatedScalarEvolution PSE(*SE, *L); 10293 10294 // Check if it is legal to vectorize the loop. 10295 LoopVectorizationRequirements Requirements; 10296 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10297 &Requirements, &Hints, DB, AC, BFI, PSI); 10298 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10299 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10300 Hints.emitRemarkWithHints(); 10301 return false; 10302 } 10303 10304 // Check the function attributes and profiles to find out if this function 10305 // should be optimized for size. 10306 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10307 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10308 10309 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10310 // here. They may require CFG and instruction level transformations before 10311 // even evaluating whether vectorization is profitable. Since we cannot modify 10312 // the incoming IR, we need to build VPlan upfront in the vectorization 10313 // pipeline. 10314 if (!L->isInnermost()) 10315 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10316 ORE, BFI, PSI, Hints, Requirements); 10317 10318 assert(L->isInnermost() && "Inner loop expected."); 10319 10320 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10321 // count by optimizing for size, to minimize overheads. 10322 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10323 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10324 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. 
" 10325 << "This loop is worth vectorizing only if no scalar " 10326 << "iteration overheads are incurred."); 10327 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10328 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10329 else { 10330 LLVM_DEBUG(dbgs() << "\n"); 10331 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10332 } 10333 } 10334 10335 // Check the function attributes to see if implicit floats are allowed. 10336 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10337 // an integer loop and the vector instructions selected are purely integer 10338 // vector instructions? 10339 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10340 reportVectorizationFailure( 10341 "Can't vectorize when the NoImplicitFloat attribute is used", 10342 "loop not vectorized due to NoImplicitFloat attribute", 10343 "NoImplicitFloat", ORE, L); 10344 Hints.emitRemarkWithHints(); 10345 return false; 10346 } 10347 10348 // Check if the target supports potentially unsafe FP vectorization. 10349 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10350 // for the target we're vectorizing for, to make sure none of the 10351 // additional fp-math flags can help. 10352 if (Hints.isPotentiallyUnsafe() && 10353 TTI->isFPVectorizationPotentiallyUnsafe()) { 10354 reportVectorizationFailure( 10355 "Potentially unsafe FP op prevents vectorization", 10356 "loop not vectorized due to unsafe FP support.", 10357 "UnsafeFP", ORE, L); 10358 Hints.emitRemarkWithHints(); 10359 return false; 10360 } 10361 10362 bool AllowOrderedReductions; 10363 // If the flag is set, use that instead and override the TTI behaviour. 10364 if (ForceOrderedReductions.getNumOccurrences() > 0) 10365 AllowOrderedReductions = ForceOrderedReductions; 10366 else 10367 AllowOrderedReductions = TTI->enableOrderedReductions(); 10368 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10369 ORE->emit([&]() { 10370 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10371 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10372 ExactFPMathInst->getDebugLoc(), 10373 ExactFPMathInst->getParent()) 10374 << "loop not vectorized: cannot prove it is safe to reorder " 10375 "floating-point operations"; 10376 }); 10377 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10378 "reorder floating-point operations\n"); 10379 Hints.emitRemarkWithHints(); 10380 return false; 10381 } 10382 10383 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10384 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10385 10386 // If an override option has been passed in for interleaved accesses, use it. 10387 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10388 UseInterleaved = EnableInterleavedMemAccesses; 10389 10390 // Analyze interleaved memory accesses. 10391 if (UseInterleaved) { 10392 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10393 } 10394 10395 // Use the cost model. 10396 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10397 F, &Hints, IAI); 10398 CM.collectValuesToIgnore(); 10399 CM.collectElementTypesForWidening(); 10400 10401 // Use the planner for vectorization. 10402 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 10403 Requirements, ORE); 10404 10405 // Get user vectorization factor and interleave count. 
10406 ElementCount UserVF = Hints.getWidth(); 10407 unsigned UserIC = Hints.getInterleave(); 10408 10409 // Plan how to best vectorize, return the best VF and its cost. 10410 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10411 10412 VectorizationFactor VF = VectorizationFactor::Disabled(); 10413 unsigned IC = 1; 10414 10415 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10416 F->getParent()->getDataLayout()); 10417 if (MaybeVF) { 10418 if (LVP.requiresTooManyRuntimeChecks()) { 10419 ORE->emit([&]() { 10420 return OptimizationRemarkAnalysisAliasing( 10421 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), 10422 L->getHeader()) 10423 << "loop not vectorized: cannot prove it is safe to reorder " 10424 "memory operations"; 10425 }); 10426 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 10427 Hints.emitRemarkWithHints(); 10428 return false; 10429 } 10430 VF = *MaybeVF; 10431 // Select the interleave count. 10432 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); 10433 10434 unsigned SelectedIC = std::max(IC, UserIC); 10435 // Optimistically generate runtime checks if they are needed. Drop them if 10436 // they turn out to not be profitable. 10437 if (VF.Width.isVector() || SelectedIC > 1) 10438 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); 10439 } 10440 10441 // Identify the diagnostic messages that should be produced. 10442 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10443 bool VectorizeLoop = true, InterleaveLoop = true; 10444 if (VF.Width.isScalar()) { 10445 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10446 VecDiagMsg = std::make_pair( 10447 "VectorizationNotBeneficial", 10448 "the cost-model indicates that vectorization is not beneficial"); 10449 VectorizeLoop = false; 10450 } 10451 10452 if (!MaybeVF && UserIC > 1) { 10453 // Tell the user interleaving was avoided up-front, despite being explicitly 10454 // requested. 10455 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 10456 "interleaving should be avoided up front\n"); 10457 IntDiagMsg = std::make_pair( 10458 "InterleavingAvoided", 10459 "Ignoring UserIC, because interleaving was avoided up front"); 10460 InterleaveLoop = false; 10461 } else if (IC == 1 && UserIC <= 1) { 10462 // Tell the user interleaving is not beneficial. 10463 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 10464 IntDiagMsg = std::make_pair( 10465 "InterleavingNotBeneficial", 10466 "the cost-model indicates that interleaving is not beneficial"); 10467 InterleaveLoop = false; 10468 if (UserIC == 1) { 10469 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; 10470 IntDiagMsg.second += 10471 " and is explicitly disabled or interleave count is set to 1"; 10472 } 10473 } else if (IC > 1 && UserIC == 1) { 10474 // Tell the user interleaving is beneficial, but it explicitly disabled. 10475 LLVM_DEBUG( 10476 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); 10477 IntDiagMsg = std::make_pair( 10478 "InterleavingBeneficialButDisabled", 10479 "the cost-model indicates that interleaving is beneficial " 10480 "but is explicitly disabled or interleave count is set to 1"); 10481 InterleaveLoop = false; 10482 } 10483 10484 // Override IC if user provided an interleave count. 10485 IC = UserIC > 0 ? UserIC : IC; 10486 10487 // Emit diagnostic messages, if any. 
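// For instance, with remarks enabled (e.g. -Rpass=loop-vectorize,
// -Rpass-missed=loop-vectorize or -Rpass-analysis=loop-vectorize in clang),
// a user might see "vectorized loop (vectorization width: 4, interleaved
// count: 2)" or "the cost-model indicates that interleaving is not
// beneficial"; the exact wording and numbers depend on the decisions made
// above (illustrative examples only).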
10488 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10489 if (!VectorizeLoop && !InterleaveLoop) {
10490 // Do not vectorize or interleave the loop.
10491 ORE->emit([&]() {
10492 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10493 L->getStartLoc(), L->getHeader())
10494 << VecDiagMsg.second;
10495 });
10496 ORE->emit([&]() {
10497 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10498 L->getStartLoc(), L->getHeader())
10499 << IntDiagMsg.second;
10500 });
10501 return false;
10502 } else if (!VectorizeLoop && InterleaveLoop) {
10503 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10504 ORE->emit([&]() {
10505 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10506 L->getStartLoc(), L->getHeader())
10507 << VecDiagMsg.second;
10508 });
10509 } else if (VectorizeLoop && !InterleaveLoop) {
10510 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10511 << ") in " << DebugLocStr << '\n');
10512 ORE->emit([&]() {
10513 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10514 L->getStartLoc(), L->getHeader())
10515 << IntDiagMsg.second;
10516 });
10517 } else if (VectorizeLoop && InterleaveLoop) {
10518 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10519 << ") in " << DebugLocStr << '\n');
10520 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10521 }
10522 
10523 bool DisableRuntimeUnroll = false;
10524 MDNode *OrigLoopID = L->getLoopID();
10525 {
10526 using namespace ore;
10527 if (!VectorizeLoop) {
10528 assert(IC > 1 && "interleave count should not be 1 or 0");
10529 // If we decided that it is not legal to vectorize the loop, then
10530 // interleave it.
10531 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10532 &CM, BFI, PSI, Checks);
10533 
10534 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10535 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
10536 
10537 ORE->emit([&]() {
10538 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10539 L->getHeader())
10540 << "interleaved loop (interleaved count: "
10541 << NV("InterleaveCount", IC) << ")";
10542 });
10543 } else {
10544 // If we decided that it is *legal* to vectorize the loop, then do it.
10545 
10546 // Consider vectorizing the epilogue too if it's profitable.
10547 VectorizationFactor EpilogueVF =
10548 CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10549 if (EpilogueVF.Width.isVector()) {
10550 
10551 // The first pass vectorizes the main loop and creates a scalar epilogue
10552 // to be vectorized by executing the plan (potentially with a different
10553 // factor) again shortly afterwards.
10554 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10555 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10556 EPI, &LVL, &CM, BFI, PSI, Checks);
10557 
10558 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10559 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10560 DT);
10561 ++LoopsVectorized;
10562 
10563 // Second pass vectorizes the epilogue and adjusts the control flow
10564 // edges from the first pass.
10565 EPI.MainLoopVF = EPI.EpilogueVF; 10566 EPI.MainLoopUF = EPI.EpilogueUF; 10567 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10568 ORE, EPI, &LVL, &CM, BFI, PSI, 10569 Checks); 10570 10571 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10572 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); 10573 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); 10574 Header->setName("vec.epilog.vector.body"); 10575 10576 // Ensure that the start values for any VPReductionPHIRecipes are 10577 // updated before vectorising the epilogue loop. 10578 for (VPRecipeBase &R : Header->phis()) { 10579 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10580 if (auto *Resume = MainILV.getReductionResumeValue( 10581 ReductionPhi->getRecurrenceDescriptor())) { 10582 VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume); 10583 ReductionPhi->setOperand(0, StartVal); 10584 } 10585 } 10586 } 10587 10588 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10589 DT); 10590 ++LoopsEpilogueVectorized; 10591 10592 if (!MainILV.areSafetyChecksAdded()) 10593 DisableRuntimeUnroll = true; 10594 } else { 10595 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10596 &LVL, &CM, BFI, PSI, Checks); 10597 10598 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10599 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10600 ++LoopsVectorized; 10601 10602 // Add metadata to disable runtime unrolling a scalar loop when there 10603 // are no runtime checks about strides and memory. A scalar loop that is 10604 // rarely used is not worth unrolling. 10605 if (!LB.areSafetyChecksAdded()) 10606 DisableRuntimeUnroll = true; 10607 } 10608 // Report the vectorization decision. 10609 ORE->emit([&]() { 10610 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10611 L->getHeader()) 10612 << "vectorized loop (vectorization width: " 10613 << NV("VectorizationFactor", VF.Width) 10614 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10615 }); 10616 } 10617 10618 if (ORE->allowExtraAnalysis(LV_NAME)) 10619 checkMixedPrecision(L, ORE); 10620 } 10621 10622 Optional<MDNode *> RemainderLoopID = 10623 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10624 LLVMLoopVectorizeFollowupEpilogue}); 10625 if (RemainderLoopID.hasValue()) { 10626 L->setLoopID(RemainderLoopID.getValue()); 10627 } else { 10628 if (DisableRuntimeUnroll) 10629 AddRuntimeUnrollDisableMetaData(L); 10630 10631 // Mark the loop as already vectorized to avoid vectorizing again. 10632 Hints.setAlreadyVectorized(); 10633 } 10634 10635 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10636 return true; 10637 } 10638 10639 LoopVectorizeResult LoopVectorizePass::runImpl( 10640 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10641 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10642 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10643 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10644 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10645 SE = &SE_; 10646 LI = &LI_; 10647 TTI = &TTI_; 10648 DT = &DT_; 10649 BFI = &BFI_; 10650 TLI = TLI_; 10651 AA = &AA_; 10652 AC = &AC_; 10653 GetLAA = &GetLAA_; 10654 DB = &DB_; 10655 ORE = &ORE_; 10656 PSI = PSI_; 10657 10658 // Don't attempt if 10659 // 1. the target claims to have no vector registers, and 10660 // 2. interleaving won't help ILP. 
10661 // 10662 // The second condition is necessary because, even if the target has no 10663 // vector registers, loop vectorization may still enable scalar 10664 // interleaving. 10665 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10666 TTI->getMaxInterleaveFactor(1) < 2) 10667 return LoopVectorizeResult(false, false); 10668 10669 bool Changed = false, CFGChanged = false; 10670 10671 // The vectorizer requires loops to be in simplified form. 10672 // Since simplification may add new inner loops, it has to run before the 10673 // legality and profitability checks. This means running the loop vectorizer 10674 // will simplify all loops, regardless of whether anything end up being 10675 // vectorized. 10676 for (auto &L : *LI) 10677 Changed |= CFGChanged |= 10678 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10679 10680 // Build up a worklist of inner-loops to vectorize. This is necessary as 10681 // the act of vectorizing or partially unrolling a loop creates new loops 10682 // and can invalidate iterators across the loops. 10683 SmallVector<Loop *, 8> Worklist; 10684 10685 for (Loop *L : *LI) 10686 collectSupportedLoops(*L, LI, ORE, Worklist); 10687 10688 LoopsAnalyzed += Worklist.size(); 10689 10690 // Now walk the identified inner loops. 10691 while (!Worklist.empty()) { 10692 Loop *L = Worklist.pop_back_val(); 10693 10694 // For the inner loops we actually process, form LCSSA to simplify the 10695 // transform. 10696 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10697 10698 Changed |= CFGChanged |= processLoop(L); 10699 } 10700 10701 // Process each loop nest in the function. 10702 return LoopVectorizeResult(Changed, CFGChanged); 10703 } 10704 10705 PreservedAnalyses LoopVectorizePass::run(Function &F, 10706 FunctionAnalysisManager &AM) { 10707 auto &LI = AM.getResult<LoopAnalysis>(F); 10708 // There are no loops in the function. Return before computing other expensive 10709 // analyses. 10710 if (LI.empty()) 10711 return PreservedAnalyses::all(); 10712 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 10713 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 10714 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 10715 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 10716 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 10717 auto &AA = AM.getResult<AAManager>(F); 10718 auto &AC = AM.getResult<AssumptionAnalysis>(F); 10719 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 10720 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10721 10722 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 10723 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 10724 [&](Loop &L) -> const LoopAccessInfo & { 10725 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 10726 TLI, TTI, nullptr, nullptr, nullptr}; 10727 return LAM.getResult<LoopAccessAnalysis>(L, AR); 10728 }; 10729 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 10730 ProfileSummaryInfo *PSI = 10731 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 10732 LoopVectorizeResult Result = 10733 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 10734 if (!Result.MadeAnyChange) 10735 return PreservedAnalyses::all(); 10736 PreservedAnalyses PA; 10737 10738 // We currently do not preserve loopinfo/dominator analyses with outer loop 10739 // vectorization. Until this is addressed, mark these analyses as preserved 10740 // only for non-VPlan-native path. 
10741 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10742 if (!EnableVPlanNativePath) {
10743 PA.preserve<LoopAnalysis>();
10744 PA.preserve<DominatorTreeAnalysis>();
10745 }
10746 
10747 if (Result.MadeCFGChange) {
10748 // Making CFG changes likely means a loop got vectorized. Indicate that
10749 // extra simplification passes should be run.
10750 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10751 // be run if runtime checks have been added.
10752 AM.getResult<ShouldRunExtraVectorPasses>(F);
10753 PA.preserve<ShouldRunExtraVectorPasses>();
10754 } else {
10755 PA.preserveSet<CFGAnalyses>();
10756 }
10757 return PA;
10758 }
10759 
10760 void LoopVectorizePass::printPipeline(
10761 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10762 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10763 OS, MapClassName2PassName);
10764 
10765 OS << "<";
10766 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10767 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10768 OS << ">";
10769 }
10770