//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
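//
// As an illustrative sketch (C-like pseudocode, not the IR this pass emits),
// vectorizing the loop below with VF=4 turns four consecutive scalar
// iterations into one 'wide' iteration that steps the index by the vector
// width; iterations that do not fill a full vector run in a scalar epilogue:
//
//   // Scalar loop.                      // Vectorized loop, VF = 4.
//   for (i = 0; i < n; i += 1)           for (i = 0; i + 4 <= n; i += 4)
//     a[i] = b[i] + c[i];                  a[i:i+4] = b[i:i+4] + c[i:i+4];
//                                        // ...scalar epilogue for the rest.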
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly.
// If tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. "
Mostly " 277 "useful for getting consistent testing.")); 278 279 static cl::opt<bool> ForceTargetSupportsScalableVectors( 280 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 281 cl::desc( 282 "Pretend that scalable vectors are supported, even if the target does " 283 "not support them. This flag should only be used for testing.")); 284 285 static cl::opt<unsigned> SmallLoopCost( 286 "small-loop-cost", cl::init(20), cl::Hidden, 287 cl::desc( 288 "The cost of a loop that is considered 'small' by the interleaver.")); 289 290 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 291 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 292 cl::desc("Enable the use of the block frequency analysis to access PGO " 293 "heuristics minimizing code growth in cold regions and being more " 294 "aggressive in hot regions.")); 295 296 // Runtime interleave loops for load/store throughput. 297 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 298 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 299 cl::desc( 300 "Enable runtime interleaving until load/store ports are saturated")); 301 302 /// Interleave small loops with scalar reductions. 303 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 304 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 305 cl::desc("Enable interleaving for loops with small iteration counts that " 306 "contain scalar reductions to expose ILP.")); 307 308 /// The number of stores in a loop that are allowed to need predication. 309 static cl::opt<unsigned> NumberOfStoresToPredicate( 310 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 311 cl::desc("Max number of stores to be predicated behind an if.")); 312 313 static cl::opt<bool> EnableIndVarRegisterHeur( 314 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 315 cl::desc("Count the induction variable only once when interleaving")); 316 317 static cl::opt<bool> EnableCondStoresVectorization( 318 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 319 cl::desc("Enable if predication of stores during vectorization.")); 320 321 static cl::opt<unsigned> MaxNestedScalarReductionIC( 322 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 323 cl::desc("The maximum interleave count to use when interleaving a scalar " 324 "reduction in a nested loop.")); 325 326 static cl::opt<bool> 327 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 328 cl::Hidden, 329 cl::desc("Prefer in-loop vector reductions, " 330 "overriding the targets preference.")); 331 332 static cl::opt<bool> ForceOrderedReductions( 333 "force-ordered-reductions", cl::init(false), cl::Hidden, 334 cl::desc("Enable the vectorisation of loops with in-order (strict) " 335 "FP reductions")); 336 337 static cl::opt<bool> PreferPredicatedReductionSelect( 338 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 339 cl::desc( 340 "Prefer predicating a reduction operation over an after loop select.")); 341 342 cl::opt<bool> EnableVPlanNativePath( 343 "enable-vplan-native-path", cl::init(false), cl::Hidden, 344 cl::desc("Enable VPlan-native vectorization path with " 345 "support for outer loop vectorization.")); 346 347 // This flag enables the stress testing of the VPlan H-CFG construction in the 348 // VPlan-native vectorization path. It must be used in conjuction with 349 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the 350 // verification of the H-CFGs built. 
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
/// 1) Returns exact trip count if it is known.
/// 2) Returns expected trip count according to profile data if any.
/// 3) Returns upper bound estimate if it is known.
/// 4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars.
/// This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops.
  virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p RepRecipe instead
  /// of \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Set the debug location in the class member's Builder using the debug
  /// location in \p V.
  void setDebugLocFromInst(const Value *V);

  /// Fix the non-induction PHIs in \p Plan.
  void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  // Returns the resume value (bc.merge.rdx) for a reduction as
  // generated by fixReduction.
  PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *VectorTripCount, Value *EndValue,
                    BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
                    VPlan &Plan);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
                               VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(BasicBlock *InsertBlock);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  ///  * Contribute to the address computation of a recipe generating a widen
  ///    memory load/store (VPWidenMemoryInstructionRecipe or
  ///    VPInterleaveRecipe).
  ///  * Such a widen memory load/store has at least one underlying Instruction
  ///    that is in a basic block that needs predication and after vectorization
  ///    the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *>
  createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(const Value *V) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When a FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        Builder.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
                          << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      Builder.SetCurrentDebugLocation(DIL);
  } else
    Builder.SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization.
/// Otherwise \p TheLoop is used for the location of the remark. \return the
/// remark object that can be streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction &UnderlyingInstr = WidenRec->getIngredient();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        if (AddrDef && WidenRec->isConsecutive() &&
            Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}

void InnerLoopVectorizer::addMetadata(Instruction *To, Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

PHINode *InnerLoopVectorizer::getReductionResumeValue(
    const RecurrenceDescriptor &RdxDesc) {
  auto It = ReductionResumeValues.find(&RdxDesc);
  assert(It != ReductionResumeValues.end() &&
         "Expected to find a resume value for the reduction.");
  return It->second;
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
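///
/// As an illustrative, made-up example (not numbers any particular target
/// would report): if the scalar loop body costs 8 per iteration and the same
/// body widened to VF = 4 costs 10, the model compares 10 / 4 = 2.5 against 8
/// per original iteration and would consider VF = 4 profitable.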
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form it takes after vectorization depends on cost. This function takes
  /// cost-based decisions for Load/Store instructions and collects them in a
  /// map. This decision map is used for building the lists of loop-uniform and
  /// loop-scalar instructions. The calculated cost is saved with the widening
  /// decision in order to avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
1435 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1436 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1437 1438 // If the truncate is free for the given types, return false. Replacing a 1439 // free truncate with an induction variable would add an induction variable 1440 // update instruction to each iteration of the loop. We exclude from this 1441 // check the primary induction variable since it will need an update 1442 // instruction regardless. 1443 Value *Op = Trunc->getOperand(0); 1444 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1445 return false; 1446 1447 // If the truncated value is not an induction variable, return false. 1448 return Legal->isInductionPhi(Op); 1449 } 1450 1451 /// Collects the instructions to scalarize for each predicated instruction in 1452 /// the loop. 1453 void collectInstsToScalarize(ElementCount VF); 1454 1455 /// Collect Uniform and Scalar values for the given \p VF. 1456 /// The sets depend on CM decision for Load/Store instructions 1457 /// that may be vectorized as interleave, gather-scatter or scalarized. 1458 void collectUniformsAndScalars(ElementCount VF) { 1459 // Do the analysis once. 1460 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1461 return; 1462 setCostBasedWideningDecision(VF); 1463 collectLoopUniforms(VF); 1464 collectLoopScalars(VF); 1465 } 1466 1467 /// Returns true if the target machine supports masked store operation 1468 /// for the given \p DataType and kind of access to \p Ptr. 1469 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1470 return Legal->isConsecutivePtr(DataType, Ptr) && 1471 TTI.isLegalMaskedStore(DataType, Alignment); 1472 } 1473 1474 /// Returns true if the target machine supports masked load operation 1475 /// for the given \p DataType and kind of access to \p Ptr. 1476 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1477 return Legal->isConsecutivePtr(DataType, Ptr) && 1478 TTI.isLegalMaskedLoad(DataType, Alignment); 1479 } 1480 1481 /// Returns true if the target machine can represent \p V as a masked gather 1482 /// or scatter operation. 1483 bool isLegalGatherOrScatter(Value *V, 1484 ElementCount VF = ElementCount::getFixed(1)) { 1485 bool LI = isa<LoadInst>(V); 1486 bool SI = isa<StoreInst>(V); 1487 if (!LI && !SI) 1488 return false; 1489 auto *Ty = getLoadStoreType(V); 1490 Align Align = getLoadStoreAlignment(V); 1491 if (VF.isVector()) 1492 Ty = VectorType::get(Ty, VF); 1493 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1494 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1495 } 1496 1497 /// Returns true if the target machine supports all of the reduction 1498 /// variables found for the given VF. 1499 bool canVectorizeReductions(ElementCount VF) const { 1500 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1501 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1502 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1503 })); 1504 } 1505 1506 /// Returns true if \p I is an instruction that will be scalarized with 1507 /// predication when vectorizing \p I with vectorization factor \p VF. Such 1508 /// instructions include conditional stores and instructions that may divide 1509 /// by zero. 1510 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1511 1512 // Returns true if \p I is an instruction that will be predicated either 1513 // through scalar predication or masked load/store or masked gather/scatter. 
1514 // \p VF is the vectorization factor that will be used to vectorize \p I. 1515 // Superset of instructions that return true for isScalarWithPredication. 1516 bool isPredicatedInst(Instruction *I, ElementCount VF, 1517 bool IsKnownUniform = false) { 1518 // When we know the load is uniform and the original scalar loop was not 1519 // predicated we don't need to mark it as a predicated instruction. Any 1520 // vectorised blocks created when tail-folding are something artificial we 1521 // have introduced and we know there is always at least one active lane. 1522 // That's why we call Legal->blockNeedsPredication here because it doesn't 1523 // query tail-folding. 1524 if (IsKnownUniform && isa<LoadInst>(I) && 1525 !Legal->blockNeedsPredication(I->getParent())) 1526 return false; 1527 if (!blockNeedsPredicationForAnyReason(I->getParent())) 1528 return false; 1529 // Loads and stores that need some form of masked operation are predicated 1530 // instructions. 1531 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1532 return Legal->isMaskRequired(I); 1533 return isScalarWithPredication(I, VF); 1534 } 1535 1536 /// Returns true if \p I is a memory instruction with consecutive memory 1537 /// access that can be widened. 1538 bool 1539 memoryInstructionCanBeWidened(Instruction *I, 1540 ElementCount VF = ElementCount::getFixed(1)); 1541 1542 /// Returns true if \p I is a memory instruction in an interleaved-group 1543 /// of memory accesses that can be vectorized with wide vector loads/stores 1544 /// and shuffles. 1545 bool 1546 interleavedAccessCanBeWidened(Instruction *I, 1547 ElementCount VF = ElementCount::getFixed(1)); 1548 1549 /// Check if \p Instr belongs to any interleaved access group. 1550 bool isAccessInterleaved(Instruction *Instr) { 1551 return InterleaveInfo.isInterleaved(Instr); 1552 } 1553 1554 /// Get the interleaved access group that \p Instr belongs to. 1555 const InterleaveGroup<Instruction> * 1556 getInterleavedAccessGroup(Instruction *Instr) { 1557 return InterleaveInfo.getInterleaveGroup(Instr); 1558 } 1559 1560 /// Returns true if we're required to use a scalar epilogue for at least 1561 /// the final iteration of the original loop. 1562 bool requiresScalarEpilogue(ElementCount VF) const { 1563 if (!isScalarEpilogueAllowed()) 1564 return false; 1565 // If we might exit from anywhere but the latch, must run the exiting 1566 // iteration in scalar form. 1567 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1568 return true; 1569 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); 1570 } 1571 1572 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1573 /// loop hint annotation. 1574 bool isScalarEpilogueAllowed() const { 1575 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1576 } 1577 1578 /// Returns true if all loop blocks should be masked to fold tail loop. 1579 bool foldTailByMasking() const { return FoldTailByMasking; } 1580 1581 /// Returns true if the instructions in this block requires predication 1582 /// for any reason, e.g. because tail folding now requires a predicate 1583 /// or because the block in the original loop was predicated. 1584 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1585 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1586 } 1587 1588 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1589 /// nodes to the chain of instructions representing the reductions. Uses a 1590 /// MapVector to ensure deterministic iteration order. 
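  /// For example (illustrative), for an in-loop sum reduction
  ///   for (i = 0; i < n; ++i) s += a[i];
  /// the entry keyed by the phi of `s` holds the add feeding that phi (and,
  /// for longer chains, every reduction operation between the phi and the
  /// latch) in program order.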
1591 using ReductionChainMap = 1592 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1593 1594 /// Return the chain of instructions representing an inloop reduction. 1595 const ReductionChainMap &getInLoopReductionChains() const { 1596 return InLoopReductionChains; 1597 } 1598 1599 /// Returns true if the Phi is part of an inloop reduction. 1600 bool isInLoopReduction(PHINode *Phi) const { 1601 return InLoopReductionChains.count(Phi); 1602 } 1603 1604 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1605 /// with factor VF. Return the cost of the instruction, including 1606 /// scalarization overhead if it's needed. 1607 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1608 1609 /// Estimate cost of a call instruction CI if it were vectorized with factor 1610 /// VF. Return the cost of the instruction, including scalarization overhead 1611 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1612 /// scalarized - 1613 /// i.e. either vector version isn't available, or is too expensive. 1614 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1615 bool &NeedToScalarize) const; 1616 1617 /// Returns true if the per-lane cost of VectorizationFactor A is lower than 1618 /// that of B. 1619 bool isMoreProfitable(const VectorizationFactor &A, 1620 const VectorizationFactor &B) const; 1621 1622 /// Invalidates decisions already taken by the cost model. 1623 void invalidateCostModelingDecisions() { 1624 WideningDecisions.clear(); 1625 Uniforms.clear(); 1626 Scalars.clear(); 1627 } 1628 1629 private: 1630 unsigned NumPredStores = 0; 1631 1632 /// Convenience function that returns the value of vscale_range iff 1633 /// vscale_range.min == vscale_range.max or otherwise returns the value 1634 /// returned by the corresponding TLI method. 1635 Optional<unsigned> getVScaleForTuning() const; 1636 1637 /// \return An upper bound for the vectorization factors for both 1638 /// fixed and scalable vectorization, where the minimum-known number of 1639 /// elements is a power-of-2 larger than zero. If scalable vectorization is 1640 /// disabled or unsupported, then the scalable part will be equal to 1641 /// ElementCount::getScalable(0). 1642 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, 1643 ElementCount UserVF, 1644 bool FoldTailByMasking); 1645 1646 /// \return the maximized element count based on the targets vector 1647 /// registers and the loop trip-count, but limited to a maximum safe VF. 1648 /// This is a helper function of computeFeasibleMaxVF. 1649 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, 1650 unsigned SmallestType, 1651 unsigned WidestType, 1652 ElementCount MaxSafeVF, 1653 bool FoldTailByMasking); 1654 1655 /// \return the maximum legal scalable VF, based on the safe max number 1656 /// of elements. 1657 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1658 1659 /// The vectorization cost is a combination of the cost itself and a boolean 1660 /// indicating whether any of the contributing operations will actually 1661 /// operate on vector values after type legalization in the backend. If this 1662 /// latter value is false, then all operations will be scalarized (i.e. no 1663 /// vectorization has actually taken place). 1664 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1665 1666 /// Returns the expected execution cost. 
  /// The unit of the cost does not matter because we use the 'cost' units to
  /// compare different vector widths. The cost that is returned is *not*
  /// normalized by the factor width. If \p Invalid is not nullptr, this
  /// function will add a pair(Instruction*, ElementCount) to \p Invalid for
  /// each instruction that has an Invalid cost for the given VF.
  using InstructionVFPair = std::pair<Instruction *, ElementCount>;
  VectorizationCostTy
  expectedCost(ElementCount VF,
               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  Optional<InstructionCost>
  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of loop are to be masked to fold tail of scalar iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// A Map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// vplan. This was added to allow quick lookup to the inloop operations,
  /// without having to loop through InLoopReductionChains.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true.
In general, a 1796 /// scalarized instruction will be represented by VF scalar values in the 1797 /// vectorized loop, each corresponding to an iteration of the original 1798 /// scalar loop. 1799 void collectLoopUniforms(ElementCount VF); 1800 1801 /// Collect the instructions that are scalar after vectorization. An 1802 /// instruction is scalar if it is known to be uniform or will be scalarized 1803 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1804 /// to the list if they are used by a load/store instruction that is marked as 1805 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1806 /// VF values in the vectorized loop, each corresponding to an iteration of 1807 /// the original scalar loop. 1808 void collectLoopScalars(ElementCount VF); 1809 1810 /// Keeps cost model vectorization decision and cost for instructions. 1811 /// Right now it is used for memory instructions only. 1812 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1813 std::pair<InstWidening, InstructionCost>>; 1814 1815 DecisionList WideningDecisions; 1816 1817 /// Returns true if \p V is expected to be vectorized and it needs to be 1818 /// extracted. 1819 bool needsExtract(Value *V, ElementCount VF) const { 1820 Instruction *I = dyn_cast<Instruction>(V); 1821 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1822 TheLoop->isLoopInvariant(I)) 1823 return false; 1824 1825 // Assume we can vectorize V (and hence we need extraction) if the 1826 // scalars are not computed yet. This can happen, because it is called 1827 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1828 // the scalars are collected. That should be a safe assumption in most 1829 // cases, because we check if the operands have vectorizable types 1830 // beforehand in LoopVectorizationLegality. 1831 return Scalars.find(VF) == Scalars.end() || 1832 !isScalarAfterVectorization(I, VF); 1833 }; 1834 1835 /// Returns a range containing only operands needing to be extracted. 1836 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1837 ElementCount VF) const { 1838 return SmallVector<Value *, 4>(make_filter_range( 1839 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1840 } 1841 1842 /// Determines if we have the infrastructure to vectorize loop \p L and its 1843 /// epilogue, assuming the main loop is vectorized by \p VF. 1844 bool isCandidateForEpilogueVectorization(const Loop &L, 1845 const ElementCount VF) const; 1846 1847 /// Returns true if epilogue vectorization is considered profitable, and 1848 /// false otherwise. 1849 /// \p VF is the vectorization factor chosen for the original loop. 1850 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1851 1852 public: 1853 /// The loop that we evaluate. 1854 Loop *TheLoop; 1855 1856 /// Predicated scalar evolution analysis. 1857 PredicatedScalarEvolution &PSE; 1858 1859 /// Loop Info analysis. 1860 LoopInfo *LI; 1861 1862 /// Vectorization legality. 1863 LoopVectorizationLegality *Legal; 1864 1865 /// Vector target information. 1866 const TargetTransformInfo &TTI; 1867 1868 /// Target Library Info. 1869 const TargetLibraryInfo *TLI; 1870 1871 /// Demanded bits analysis. 1872 DemandedBits *DB; 1873 1874 /// Assumption cache. 1875 AssumptionCache *AC; 1876 1877 /// Interface to emit optimization remarks. 1878 OptimizationRemarkEmitter *ORE; 1879 1880 const Function *TheFunction; 1881 1882 /// Loop Vectorize Hint. 
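  /// These are parsed from the llvm.loop.vectorize.* and llvm.loop.interleave.*
  /// loop metadata, e.g. as emitted for
  /// `#pragma clang loop vectorize_width(4) interleave_count(2)` (illustrative
  /// pragma; any equivalent metadata works).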
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;

  /// All element types found in the loop.
  SmallPtrSet<Type *, 16> ElementTypesInLoop;

  /// Profitable vector factors.
  SmallVector<VectorizationFactor, 8> ProfitableVFs;
};
} // end namespace llvm

/// Helper struct to manage generating runtime checks for vectorization.
///
/// The runtime checks are created up-front in temporary blocks to allow better
/// estimating the cost, and are un-linked from the existing IR. After deciding
/// to vectorize, the checks are moved back. If deciding not to vectorize, the
/// temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr, either no SCEV checks have been generated or they have been used.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr, either no memory runtime checks have been generated or
  /// they have been used.
  Value *MemRuntimeCheckCond = nullptr;

  DominatorTree *DT;
  LoopInfo *LI;

  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

public:
  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                    const DataLayout &DL)
      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check") {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and added back during vector code generation. If
  /// there is no vector code generation, the check blocks are removed
  /// completely.
  void Create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!UnionPred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
                                  nullptr, "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          &UnionPred, SCEVCheckBlock->getTerminator());
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      auto *Pred = SCEVCheckBlock ?
SCEVCheckBlock : Preheader; 1963 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1964 "vector.memcheck"); 1965 1966 auto DiffChecks = RtPtrChecking.getDiffChecks(); 1967 if (DiffChecks) { 1968 MemRuntimeCheckCond = addDiffRuntimeChecks( 1969 MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp, 1970 [VF](IRBuilderBase &B, unsigned Bits) { 1971 return getRuntimeVF(B, B.getIntNTy(Bits), VF); 1972 }, 1973 IC); 1974 } else { 1975 MemRuntimeCheckCond = 1976 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1977 RtPtrChecking.getChecks(), MemCheckExp); 1978 } 1979 assert(MemRuntimeCheckCond && 1980 "no RT checks generated although RtPtrChecking " 1981 "claimed checks are required"); 1982 } 1983 1984 if (!MemCheckBlock && !SCEVCheckBlock) 1985 return; 1986 1987 // Unhook the temporary block with the checks, update various places 1988 // accordingly. 1989 if (SCEVCheckBlock) 1990 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1991 if (MemCheckBlock) 1992 MemCheckBlock->replaceAllUsesWith(Preheader); 1993 1994 if (SCEVCheckBlock) { 1995 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1996 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1997 Preheader->getTerminator()->eraseFromParent(); 1998 } 1999 if (MemCheckBlock) { 2000 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2001 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2002 Preheader->getTerminator()->eraseFromParent(); 2003 } 2004 2005 DT->changeImmediateDominator(LoopHeader, Preheader); 2006 if (MemCheckBlock) { 2007 DT->eraseNode(MemCheckBlock); 2008 LI->removeBlock(MemCheckBlock); 2009 } 2010 if (SCEVCheckBlock) { 2011 DT->eraseNode(SCEVCheckBlock); 2012 LI->removeBlock(SCEVCheckBlock); 2013 } 2014 } 2015 2016 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2017 /// unused. 2018 ~GeneratedRTChecks() { 2019 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2020 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2021 if (!SCEVCheckCond) 2022 SCEVCleaner.markResultUsed(); 2023 2024 if (!MemRuntimeCheckCond) 2025 MemCheckCleaner.markResultUsed(); 2026 2027 if (MemRuntimeCheckCond) { 2028 auto &SE = *MemCheckExp.getSE(); 2029 // Memory runtime check generation creates compares that use expanded 2030 // values. Remove them before running the SCEVExpanderCleaners. 2031 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2032 if (MemCheckExp.isInsertedInstruction(&I)) 2033 continue; 2034 SE.forgetValue(&I); 2035 I.eraseFromParent(); 2036 } 2037 } 2038 MemCheckCleaner.cleanup(); 2039 SCEVCleaner.cleanup(); 2040 2041 if (SCEVCheckCond) 2042 SCEVCheckBlock->eraseFromParent(); 2043 if (MemRuntimeCheckCond) 2044 MemCheckBlock->eraseFromParent(); 2045 } 2046 2047 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2048 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2049 /// depending on the generated condition. 2050 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2051 BasicBlock *LoopVectorPreHeader, 2052 BasicBlock *LoopExitBlock) { 2053 if (!SCEVCheckCond) 2054 return nullptr; 2055 2056 Value *Cond = SCEVCheckCond; 2057 // Mark the check as used, to prevent it from being removed during cleanup. 
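    // (The destructor treats a null SCEVCheckCond as "the checks were emitted
    // and used": it keeps the expanded instructions and does not erase
    // SCEVCheckBlock in that case.)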
2058 SCEVCheckCond = nullptr; 2059 if (auto *C = dyn_cast<ConstantInt>(Cond)) 2060 if (C->isZero()) 2061 return nullptr; 2062 2063 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2064 2065 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2066 // Create new preheader for vector loop. 2067 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2068 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2069 2070 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2071 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2072 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2073 SCEVCheckBlock); 2074 2075 DT->addNewBlock(SCEVCheckBlock, Pred); 2076 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2077 2078 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), 2079 BranchInst::Create(Bypass, LoopVectorPreHeader, Cond)); 2080 return SCEVCheckBlock; 2081 } 2082 2083 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2084 /// the branches to branch to the vector preheader or \p Bypass, depending on 2085 /// the generated condition. 2086 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2087 BasicBlock *LoopVectorPreHeader) { 2088 // Check if we generated code that checks in runtime if arrays overlap. 2089 if (!MemRuntimeCheckCond) 2090 return nullptr; 2091 2092 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2093 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2094 MemCheckBlock); 2095 2096 DT->addNewBlock(MemCheckBlock, Pred); 2097 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2098 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2099 2100 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2101 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2102 2103 ReplaceInstWithInst( 2104 MemCheckBlock->getTerminator(), 2105 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2106 MemCheckBlock->getTerminator()->setDebugLoc( 2107 Pred->getTerminator()->getDebugLoc()); 2108 2109 // Mark the check as used, to prevent it from being removed during cleanup. 2110 MemRuntimeCheckCond = nullptr; 2111 return MemCheckBlock; 2112 } 2113 }; 2114 2115 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2116 // vectorization. The loop needs to be annotated with #pragma omp simd 2117 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2118 // vector length information is not provided, vectorization is not considered 2119 // explicit. Interleave hints are not allowed either. These limitations will be 2120 // relaxed in the future. 2121 // Please, note that we are currently forced to abuse the pragma 'clang 2122 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2123 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2124 // provides *explicit vectorization hints* (LV can bypass legal checks and 2125 // assume that vectorization is legal). However, both hints are implemented 2126 // using the same metadata (llvm.loop.vectorize, processed by 2127 // LoopVectorizeHints). This will be fixed in the future when the native IR 2128 // representation for pragma 'omp simd' is introduced. 2129 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2130 OptimizationRemarkEmitter *ORE) { 2131 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2132 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2133 2134 // Only outer loops with an explicit vectorization hint are supported. 
2135 // Unannotated outer loops are ignored. 2136 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2137 return false; 2138 2139 Function *Fn = OuterLp->getHeader()->getParent(); 2140 if (!Hints.allowVectorization(Fn, OuterLp, 2141 true /*VectorizeOnlyWhenForced*/)) { 2142 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2143 return false; 2144 } 2145 2146 if (Hints.getInterleave() > 1) { 2147 // TODO: Interleave support is future work. 2148 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2149 "outer loops.\n"); 2150 Hints.emitRemarkWithHints(); 2151 return false; 2152 } 2153 2154 return true; 2155 } 2156 2157 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2158 OptimizationRemarkEmitter *ORE, 2159 SmallVectorImpl<Loop *> &V) { 2160 // Collect inner loops and outer loops without irreducible control flow. For 2161 // now, only collect outer loops that have explicit vectorization hints. If we 2162 // are stress testing the VPlan H-CFG construction, we collect the outermost 2163 // loop of every loop nest. 2164 if (L.isInnermost() || VPlanBuildStressTest || 2165 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2166 LoopBlocksRPO RPOT(&L); 2167 RPOT.perform(LI); 2168 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2169 V.push_back(&L); 2170 // TODO: Collect inner loops inside marked outer loops in case 2171 // vectorization fails for the outer loop. Do not invoke 2172 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2173 // already known to be reducible. We can use an inherited attribute for 2174 // that. 2175 return; 2176 } 2177 } 2178 for (Loop *InnerL : L) 2179 collectSupportedLoops(*InnerL, LI, ORE, V); 2180 } 2181 2182 namespace { 2183 2184 /// The LoopVectorize Pass. 2185 struct LoopVectorize : public FunctionPass { 2186 /// Pass identification, replacement for typeid 2187 static char ID; 2188 2189 LoopVectorizePass Impl; 2190 2191 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2192 bool VectorizeOnlyWhenForced = false) 2193 : FunctionPass(ID), 2194 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2195 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2196 } 2197 2198 bool runOnFunction(Function &F) override { 2199 if (skipFunction(F)) 2200 return false; 2201 2202 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2203 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2204 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2205 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2206 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2207 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2208 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2209 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2210 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2211 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2212 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2213 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2214 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2215 2216 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2217 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2218 2219 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2220 GetLAA, *ORE, PSI).MadeAnyChange; 2221 } 2222 2223 void getAnalysisUsage(AnalysisUsage &AU) const override { 2224 AU.addRequired<AssumptionCacheTracker>(); 2225 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2226 AU.addRequired<DominatorTreeWrapperPass>(); 2227 AU.addRequired<LoopInfoWrapperPass>(); 2228 AU.addRequired<ScalarEvolutionWrapperPass>(); 2229 AU.addRequired<TargetTransformInfoWrapperPass>(); 2230 AU.addRequired<AAResultsWrapperPass>(); 2231 AU.addRequired<LoopAccessLegacyAnalysis>(); 2232 AU.addRequired<DemandedBitsWrapperPass>(); 2233 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2234 AU.addRequired<InjectTLIMappingsLegacy>(); 2235 2236 // We currently do not preserve loopinfo/dominator analyses with outer loop 2237 // vectorization. Until this is addressed, mark these analyses as preserved 2238 // only for non-VPlan-native path. 2239 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2240 if (!EnableVPlanNativePath) { 2241 AU.addPreserved<LoopInfoWrapperPass>(); 2242 AU.addPreserved<DominatorTreeWrapperPass>(); 2243 } 2244 2245 AU.addPreserved<BasicAAWrapperPass>(); 2246 AU.addPreserved<GlobalsAAWrapperPass>(); 2247 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2248 } 2249 }; 2250 2251 } // end anonymous namespace 2252 2253 //===----------------------------------------------------------------------===// 2254 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2255 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2256 //===----------------------------------------------------------------------===// 2257 2258 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2259 // We need to place the broadcast of invariant variables outside the loop, 2260 // but only if it's proven safe to do so. Else, broadcast will be inside 2261 // vector loop body. 2262 Instruction *Instr = dyn_cast<Instruction>(V); 2263 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2264 (!Instr || 2265 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2266 // Place the code for broadcasting invariant variables in the new preheader. 2267 IRBuilder<>::InsertPointGuard Guard(Builder); 2268 if (SafeToHoist) 2269 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2270 2271 // Broadcast the scalar into all locations in the vector. 2272 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2273 2274 return Shuf; 2275 } 2276 2277 /// This function adds 2278 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2279 /// to each vector element of Val. The sequence starts at StartIndex. 2280 /// \p Opcode is relevant for FP induction variable. 
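/// For example (illustrative), with VF = 4, Val being a <4 x i32> splat of
/// %iv, StartIdx = 0 and Step = 2, the result is
/// <%iv + 0, %iv + 2, %iv + 4, %iv + 6>.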
2281 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2282 Instruction::BinaryOps BinOp, ElementCount VF, 2283 IRBuilderBase &Builder) { 2284 assert(VF.isVector() && "only vector VFs are supported"); 2285 2286 // Create and check the types. 2287 auto *ValVTy = cast<VectorType>(Val->getType()); 2288 ElementCount VLen = ValVTy->getElementCount(); 2289 2290 Type *STy = Val->getType()->getScalarType(); 2291 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2292 "Induction Step must be an integer or FP"); 2293 assert(Step->getType() == STy && "Step has wrong type"); 2294 2295 SmallVector<Constant *, 8> Indices; 2296 2297 // Create a vector of consecutive numbers from zero to VF. 2298 VectorType *InitVecValVTy = ValVTy; 2299 if (STy->isFloatingPointTy()) { 2300 Type *InitVecValSTy = 2301 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2302 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2303 } 2304 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2305 2306 // Splat the StartIdx 2307 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2308 2309 if (STy->isIntegerTy()) { 2310 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2311 Step = Builder.CreateVectorSplat(VLen, Step); 2312 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2313 // FIXME: The newly created binary instructions should contain nsw/nuw 2314 // flags, which can be found from the original scalar operations. 2315 Step = Builder.CreateMul(InitVec, Step); 2316 return Builder.CreateAdd(Val, Step, "induction"); 2317 } 2318 2319 // Floating point induction. 2320 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2321 "Binary Opcode should be specified for FP induction"); 2322 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2323 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2324 2325 Step = Builder.CreateVectorSplat(VLen, Step); 2326 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2327 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2328 } 2329 2330 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 2331 /// variable on which to base the steps, \p Step is the size of the step. 2332 static void buildScalarSteps(Value *ScalarIV, Value *Step, 2333 const InductionDescriptor &ID, VPValue *Def, 2334 VPTransformState &State) { 2335 IRBuilderBase &Builder = State.Builder; 2336 // We shouldn't have to build scalar steps if we aren't vectorizing. 2337 assert(State.VF.isVector() && "VF should be greater than one"); 2338 // Get the value type and ensure it and the step have the same integer type. 2339 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2340 assert(ScalarIVTy == Step->getType() && 2341 "Val and Step should have the same type"); 2342 2343 // We build scalar steps for both integer and floating-point induction 2344 // variables. Here, we determine the kind of arithmetic we will perform. 2345 Instruction::BinaryOps AddOp; 2346 Instruction::BinaryOps MulOp; 2347 if (ScalarIVTy->isIntegerTy()) { 2348 AddOp = Instruction::Add; 2349 MulOp = Instruction::Mul; 2350 } else { 2351 AddOp = ID.getInductionOpcode(); 2352 MulOp = Instruction::FMul; 2353 } 2354 2355 // Determine the number of scalars we need to generate for each unroll 2356 // iteration. 2357 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); 2358 unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 2359 // Compute the scalar steps and save the results in State. 
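  // For example (illustrative), with a fixed VF of 4, UF of 2 and an integer
  // IV, part 0 yields ScalarIV + {0,1,2,3} * Step and part 1 yields
  // ScalarIV + {4,5,6,7} * Step, one scalar value per lane (only lane 0 when
  // just the first lane is demanded).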
2360 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2361 ScalarIVTy->getScalarSizeInBits()); 2362 Type *VecIVTy = nullptr; 2363 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2364 if (!FirstLaneOnly && State.VF.isScalable()) { 2365 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2366 UnitStepVec = 2367 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2368 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2369 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2370 } 2371 2372 for (unsigned Part = 0; Part < State.UF; ++Part) { 2373 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2374 2375 if (!FirstLaneOnly && State.VF.isScalable()) { 2376 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2377 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2378 if (ScalarIVTy->isFloatingPointTy()) 2379 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2380 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2381 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2382 State.set(Def, Add, Part); 2383 // It's useful to record the lane values too for the known minimum number 2384 // of elements so we do those below. This improves the code quality when 2385 // trying to extract the first element, for example. 2386 } 2387 2388 if (ScalarIVTy->isFloatingPointTy()) 2389 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2390 2391 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2392 Value *StartIdx = Builder.CreateBinOp( 2393 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2394 // The step returned by `createStepForVF` is a runtime-evaluated value 2395 // when VF is scalable. Otherwise, it should be folded into a Constant. 2396 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2397 "Expected StartIdx to be folded to a constant when VF is not " 2398 "scalable"); 2399 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2400 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2401 State.set(Def, Add, VPIteration(Part, Lane)); 2402 } 2403 } 2404 } 2405 2406 // Generate code for the induction step. Note that induction steps are 2407 // required to be loop-invariant 2408 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, 2409 Instruction *InsertBefore, 2410 Loop *OrigLoop = nullptr) { 2411 const DataLayout &DL = SE.getDataLayout(); 2412 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && 2413 "Induction step should be loop invariant"); 2414 if (auto *E = dyn_cast<SCEVUnknown>(Step)) 2415 return E->getValue(); 2416 2417 SCEVExpander Exp(SE, DL, "induction"); 2418 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); 2419 } 2420 2421 /// Compute the transformed value of Index at offset StartValue using step 2422 /// StepValue. 2423 /// For integer induction, returns StartValue + Index * StepValue. 2424 /// For pointer induction, returns StartValue[Index * StepValue]. 2425 /// FIXME: The newly created binary instructions should contain nsw/nuw 2426 /// flags, which can be found from the original scalar operations. 2427 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, 2428 Value *StartValue, Value *Step, 2429 const InductionDescriptor &ID) { 2430 assert(Index->getType()->getScalarType() == Step->getType() && 2431 "Index scalar type does not match StepValue type"); 2432 2433 // Note: the IR at this point is broken. 
We cannot use SE to create any new 2434 // SCEV and then expand it, hoping that SCEV's simplification will give us 2435 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2436 // lead to various SCEV crashes. So all we can do is to use builder and rely 2437 // on InstCombine for future simplifications. Here we handle some trivial 2438 // cases only. 2439 auto CreateAdd = [&B](Value *X, Value *Y) { 2440 assert(X->getType() == Y->getType() && "Types don't match!"); 2441 if (auto *CX = dyn_cast<ConstantInt>(X)) 2442 if (CX->isZero()) 2443 return Y; 2444 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2445 if (CY->isZero()) 2446 return X; 2447 return B.CreateAdd(X, Y); 2448 }; 2449 2450 // We allow X to be a vector type, in which case Y will potentially be 2451 // splatted into a vector with the same element count. 2452 auto CreateMul = [&B](Value *X, Value *Y) { 2453 assert(X->getType()->getScalarType() == Y->getType() && 2454 "Types don't match!"); 2455 if (auto *CX = dyn_cast<ConstantInt>(X)) 2456 if (CX->isOne()) 2457 return Y; 2458 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2459 if (CY->isOne()) 2460 return X; 2461 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2462 if (XVTy && !isa<VectorType>(Y->getType())) 2463 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2464 return B.CreateMul(X, Y); 2465 }; 2466 2467 switch (ID.getKind()) { 2468 case InductionDescriptor::IK_IntInduction: { 2469 assert(!isa<VectorType>(Index->getType()) && 2470 "Vector indices not supported for integer inductions yet"); 2471 assert(Index->getType() == StartValue->getType() && 2472 "Index type does not match StartValue type"); 2473 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2474 return B.CreateSub(StartValue, Index); 2475 auto *Offset = CreateMul(Index, Step); 2476 return CreateAdd(StartValue, Offset); 2477 } 2478 case InductionDescriptor::IK_PtrInduction: { 2479 assert(isa<Constant>(Step) && 2480 "Expected constant step for pointer induction"); 2481 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); 2482 } 2483 case InductionDescriptor::IK_FpInduction: { 2484 assert(!isa<VectorType>(Index->getType()) && 2485 "Vector indices not supported for FP inductions yet"); 2486 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2487 auto InductionBinOp = ID.getInductionBinOp(); 2488 assert(InductionBinOp && 2489 (InductionBinOp->getOpcode() == Instruction::FAdd || 2490 InductionBinOp->getOpcode() == Instruction::FSub) && 2491 "Original bin op should be defined for FP induction"); 2492 2493 Value *MulExp = B.CreateFMul(Step, Index); 2494 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2495 "induction"); 2496 } 2497 case InductionDescriptor::IK_NoInduction: 2498 return nullptr; 2499 } 2500 llvm_unreachable("invalid enum"); 2501 } 2502 2503 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2504 const VPIteration &Instance, 2505 VPTransformState &State) { 2506 Value *ScalarInst = State.get(Def, Instance); 2507 Value *VectorValue = State.get(Def, Instance.Part); 2508 VectorValue = Builder.CreateInsertElement( 2509 VectorValue, ScalarInst, 2510 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2511 State.set(Def, VectorValue, Instance.Part); 2512 } 2513 2514 // Return whether we allow using masked interleave-groups (for dealing with 2515 // strided loads/stores that reside in predicated blocks, or for dealing 2516 // with gaps). 
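// For example (illustrative), a factor-3 load group with members for a[3*i]
// and a[3*i+1] but a gap at a[3*i+2] leaves every third lane of the wide load
// unused; with masked interleaved accesses those gap lanes can be disabled
// instead of falling back to a scalar epilogue to avoid reading past the end
// of 'a'.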
2517 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2518 // If an override option has been passed in for interleaved accesses, use it. 2519 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2520 return EnableMaskedInterleavedMemAccesses; 2521 2522 return TTI.enableMaskedInterleavedAccessVectorization(); 2523 } 2524 2525 // Try to vectorize the interleave group that \p Instr belongs to. 2526 // 2527 // E.g. Translate following interleaved load group (factor = 3): 2528 // for (i = 0; i < N; i+=3) { 2529 // R = Pic[i]; // Member of index 0 2530 // G = Pic[i+1]; // Member of index 1 2531 // B = Pic[i+2]; // Member of index 2 2532 // ... // do something to R, G, B 2533 // } 2534 // To: 2535 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2536 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2537 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2538 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2539 // 2540 // Or translate following interleaved store group (factor = 3): 2541 // for (i = 0; i < N; i+=3) { 2542 // ... do something to R, G, B 2543 // Pic[i] = R; // Member of index 0 2544 // Pic[i+1] = G; // Member of index 1 2545 // Pic[i+2] = B; // Member of index 2 2546 // } 2547 // To: 2548 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2549 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2550 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2551 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2552 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2553 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2554 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2555 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2556 VPValue *BlockInMask) { 2557 Instruction *Instr = Group->getInsertPos(); 2558 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2559 2560 // Prepare for the vector type of the interleaved load/store. 2561 Type *ScalarTy = getLoadStoreType(Instr); 2562 unsigned InterleaveFactor = Group->getFactor(); 2563 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2564 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2565 2566 // Prepare for the new pointers. 2567 SmallVector<Value *, 2> AddrParts; 2568 unsigned Index = Group->getIndex(Instr); 2569 2570 // TODO: extend the masked interleaved-group support to reversed access. 2571 assert((!BlockInMask || !Group->isReverse()) && 2572 "Reversed masked interleave-group not supported."); 2573 2574 // If the group is reverse, adjust the index to refer to the last vector lane 2575 // instead of the first. We adjust the index from the first vector lane, 2576 // rather than directly getting the pointer for lane VF - 1, because the 2577 // pointer operand of the interleaved access is supposed to be uniform. For 2578 // uniform instructions, we're only required to generate a value for the 2579 // first vector lane in each unroll iteration. 2580 if (Group->isReverse()) 2581 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2582 2583 for (unsigned Part = 0; Part < UF; Part++) { 2584 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2585 setDebugLocFromInst(AddrPart); 2586 2587 // Notice current instruction could be any index. Need to adjust the address 2588 // to the member of index 0. 2589 // 2590 // E.g. 
a = A[i+1]; // Member of index 1 (Current instruction) 2591 // b = A[i]; // Member of index 0 2592 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2593 // 2594 // E.g. A[i+1] = a; // Member of index 1 2595 // A[i] = b; // Member of index 0 2596 // A[i+2] = c; // Member of index 2 (Current instruction) 2597 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2598 2599 bool InBounds = false; 2600 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2601 InBounds = gep->isInBounds(); 2602 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2603 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2604 2605 // Cast to the vector pointer type. 2606 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2607 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2608 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2609 } 2610 2611 setDebugLocFromInst(Instr); 2612 Value *PoisonVec = PoisonValue::get(VecTy); 2613 2614 Value *MaskForGaps = nullptr; 2615 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2616 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2617 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2618 } 2619 2620 // Vectorize the interleaved load group. 2621 if (isa<LoadInst>(Instr)) { 2622 // For each unroll part, create a wide load for the group. 2623 SmallVector<Value *, 2> NewLoads; 2624 for (unsigned Part = 0; Part < UF; Part++) { 2625 Instruction *NewLoad; 2626 if (BlockInMask || MaskForGaps) { 2627 assert(useMaskedInterleavedAccesses(*TTI) && 2628 "masked interleaved groups are not allowed."); 2629 Value *GroupMask = MaskForGaps; 2630 if (BlockInMask) { 2631 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2632 Value *ShuffledMask = Builder.CreateShuffleVector( 2633 BlockInMaskPart, 2634 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2635 "interleaved.mask"); 2636 GroupMask = MaskForGaps 2637 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2638 MaskForGaps) 2639 : ShuffledMask; 2640 } 2641 NewLoad = 2642 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2643 GroupMask, PoisonVec, "wide.masked.vec"); 2644 } 2645 else 2646 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2647 Group->getAlign(), "wide.vec"); 2648 Group->addMetadata(NewLoad); 2649 NewLoads.push_back(NewLoad); 2650 } 2651 2652 // For each member in the group, shuffle out the appropriate data from the 2653 // wide loads. 2654 unsigned J = 0; 2655 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2656 Instruction *Member = Group->getMember(I); 2657 2658 // Skip the gaps in the group. 2659 if (!Member) 2660 continue; 2661 2662 auto StrideMask = 2663 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2664 for (unsigned Part = 0; Part < UF; Part++) { 2665 Value *StridedVec = Builder.CreateShuffleVector( 2666 NewLoads[Part], StrideMask, "strided.vec"); 2667 2668 // If this member has different type, cast the result type. 
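        // (For example, an i32 load grouped with a float load of the same
        // width; illustrative of the bit- or pointer-cast performed below.)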
2669 if (Member->getType() != ScalarTy) { 2670 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2671 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2672 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2673 } 2674 2675 if (Group->isReverse()) 2676 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2677 2678 State.set(VPDefs[J], StridedVec, Part); 2679 } 2680 ++J; 2681 } 2682 return; 2683 } 2684 2685 // The sub vector type for current instruction. 2686 auto *SubVT = VectorType::get(ScalarTy, VF); 2687 2688 // Vectorize the interleaved store group. 2689 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2690 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2691 "masked interleaved groups are not allowed."); 2692 assert((!MaskForGaps || !VF.isScalable()) && 2693 "masking gaps for scalable vectors is not yet supported."); 2694 for (unsigned Part = 0; Part < UF; Part++) { 2695 // Collect the stored vector from each member. 2696 SmallVector<Value *, 4> StoredVecs; 2697 for (unsigned i = 0; i < InterleaveFactor; i++) { 2698 assert((Group->getMember(i) || MaskForGaps) && 2699 "Fail to get a member from an interleaved store group"); 2700 Instruction *Member = Group->getMember(i); 2701 2702 // Skip the gaps in the group. 2703 if (!Member) { 2704 Value *Undef = PoisonValue::get(SubVT); 2705 StoredVecs.push_back(Undef); 2706 continue; 2707 } 2708 2709 Value *StoredVec = State.get(StoredValues[i], Part); 2710 2711 if (Group->isReverse()) 2712 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2713 2714 // If this member has different type, cast it to a unified type. 2715 2716 if (StoredVec->getType() != SubVT) 2717 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2718 2719 StoredVecs.push_back(StoredVec); 2720 } 2721 2722 // Concatenate all vectors into a wide vector. 2723 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2724 2725 // Interleave the elements in the wide vector. 2726 Value *IVec = Builder.CreateShuffleVector( 2727 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2728 "interleaved.vec"); 2729 2730 Instruction *NewStoreInstr; 2731 if (BlockInMask || MaskForGaps) { 2732 Value *GroupMask = MaskForGaps; 2733 if (BlockInMask) { 2734 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2735 Value *ShuffledMask = Builder.CreateShuffleVector( 2736 BlockInMaskPart, 2737 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2738 "interleaved.mask"); 2739 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2740 ShuffledMask, MaskForGaps) 2741 : ShuffledMask; 2742 } 2743 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2744 Group->getAlign(), GroupMask); 2745 } else 2746 NewStoreInstr = 2747 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2748 2749 Group->addMetadata(NewStoreInstr); 2750 } 2751 } 2752 2753 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2754 VPReplicateRecipe *RepRecipe, 2755 const VPIteration &Instance, 2756 bool IfPredicateInstr, 2757 VPTransformState &State) { 2758 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2759 2760 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2761 // the first lane and part. 2762 if (isa<NoAliasScopeDeclInst>(Instr)) 2763 if (!Instance.isFirstIteration()) 2764 return; 2765 2766 // Does this instruction return a value ? 
2767 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2768 2769 Instruction *Cloned = Instr->clone(); 2770 if (!IsVoidRetTy) 2771 Cloned->setName(Instr->getName() + ".cloned"); 2772 2773 // If the scalarized instruction contributes to the address computation of a 2774 // widen masked load/store which was in a basic block that needed predication 2775 // and is not predicated after vectorization, we can't propagate 2776 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2777 // instruction could feed a poison value to the base address of the widen 2778 // load/store. 2779 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 2780 Cloned->dropPoisonGeneratingFlags(); 2781 2782 if (Instr->getDebugLoc()) 2783 setDebugLocFromInst(Instr); 2784 2785 // Replace the operands of the cloned instructions with their scalar 2786 // equivalents in the new loop. 2787 for (auto &I : enumerate(RepRecipe->operands())) { 2788 auto InputInstance = Instance; 2789 VPValue *Operand = I.value(); 2790 VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand); 2791 if (OperandR && OperandR->isUniform()) 2792 InputInstance.Lane = VPLane::getFirstLane(); 2793 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2794 } 2795 addNewMetadata(Cloned, Instr); 2796 2797 // Place the cloned scalar in the new loop. 2798 State.Builder.Insert(Cloned); 2799 2800 State.set(RepRecipe, Cloned, Instance); 2801 2802 // If we just cloned a new assumption, add it the assumption cache. 2803 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2804 AC->registerAssumption(II); 2805 2806 // End if-block. 2807 if (IfPredicateInstr) 2808 PredicatedInstructions.push_back(Cloned); 2809 } 2810 2811 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) { 2812 if (TripCount) 2813 return TripCount; 2814 2815 assert(InsertBlock); 2816 IRBuilder<> Builder(InsertBlock->getTerminator()); 2817 // Find the loop boundaries. 2818 ScalarEvolution *SE = PSE.getSE(); 2819 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2820 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 2821 "Invalid loop count"); 2822 2823 Type *IdxTy = Legal->getWidestInductionType(); 2824 assert(IdxTy && "No type for induction"); 2825 2826 // The exit count might have the type of i64 while the phi is i32. This can 2827 // happen if we have an induction variable that is sign extended before the 2828 // compare. The only way that we get a backedge taken count is that the 2829 // induction variable was signed and as such will not overflow. In such a case 2830 // truncation is legal. 2831 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2832 IdxTy->getPrimitiveSizeInBits()) 2833 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2834 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2835 2836 // Get the total trip count from the count by adding 1. 2837 const SCEV *ExitCount = SE->getAddExpr( 2838 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2839 2840 const DataLayout &DL = InsertBlock->getModule()->getDataLayout(); 2841 2842 // Expand the trip count and place the new instructions in the preheader. 2843 // Notice that the pre-header does not change, only the loop body. 2844 SCEVExpander Exp(*SE, DL, "induction"); 2845 2846 // Count holds the overall loop count (N). 
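  // (Illustrative: for a loop `for (i = 0; i < n; ++i)` the backedge-taken
  // count is n-1, so the expression expanded below evaluates to (n-1)+1 == n;
  // the truncation/extension above only normalizes the count to IdxTy first.)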
2847 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2848 InsertBlock->getTerminator()); 2849 2850 if (TripCount->getType()->isPointerTy()) 2851 TripCount = 2852 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2853 InsertBlock->getTerminator()); 2854 2855 return TripCount; 2856 } 2857 2858 Value * 2859 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2860 if (VectorTripCount) 2861 return VectorTripCount; 2862 2863 Value *TC = getOrCreateTripCount(InsertBlock); 2864 IRBuilder<> Builder(InsertBlock->getTerminator()); 2865 2866 Type *Ty = TC->getType(); 2867 // This is where we can make the step a runtime constant. 2868 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2869 2870 // If the tail is to be folded by masking, round the number of iterations N 2871 // up to a multiple of Step instead of rounding down. This is done by first 2872 // adding Step-1 and then rounding down. Note that it's ok if this addition 2873 // overflows: the vector induction variable will eventually wrap to zero given 2874 // that it starts at zero and its Step is a power of two; the loop will then 2875 // exit, with the last early-exit vector comparison also producing all-true. 2876 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2877 // is accounted for in emitIterationCountCheck that adds an overflow check. 2878 if (Cost->foldTailByMasking()) { 2879 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2880 "VF*UF must be a power of 2 when folding tail by masking"); 2881 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 2882 TC = Builder.CreateAdd( 2883 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 2884 } 2885 2886 // Now we need to generate the expression for the part of the loop that the 2887 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2888 // iterations are not required for correctness, or N - Step, otherwise. Step 2889 // is equal to the vectorization factor (number of SIMD elements) times the 2890 // unroll factor (number of SIMD instructions). 2891 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2892 2893 // There are cases where we *must* run at least one iteration in the remainder 2894 // loop. See the cost model for when this can happen. If the step evenly 2895 // divides the trip count, we set the remainder to be equal to the step. If 2896 // the step does not evenly divide the trip count, no adjustment is necessary 2897 // since there will already be scalar iterations. Note that the minimum 2898 // iterations check ensures that N >= Step. 2899 if (Cost->requiresScalarEpilogue(VF)) { 2900 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2901 R = Builder.CreateSelect(IsZero, Step, R); 2902 } 2903 2904 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2905 2906 return VectorTripCount; 2907 } 2908 2909 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2910 const DataLayout &DL) { 2911 // Verify that V is a vector type with same number of elements as DstVTy. 
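  // (Illustrative, assuming a 64-bit DataLayout: casting <4 x double> to
  // <4 x i8*> keeps the element count and the 64-bit element size, but FP and
  // pointer elements cannot be bitcast directly, so it is lowered below in two
  // steps via <4 x i64>.)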
2912 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 2913 unsigned VF = DstFVTy->getNumElements(); 2914 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 2915 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2916 Type *SrcElemTy = SrcVecTy->getElementType(); 2917 Type *DstElemTy = DstFVTy->getElementType(); 2918 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2919 "Vector elements must have same size"); 2920 2921 // Do a direct cast if element types are castable. 2922 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2923 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2924 } 2925 // V cannot be directly casted to desired vector type. 2926 // May happen when V is a floating point vector but DstVTy is a vector of 2927 // pointers or vice-versa. Handle this using a two-step bitcast using an 2928 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2929 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2930 "Only one type should be a pointer type"); 2931 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2932 "Only one type should be a floating point type"); 2933 Type *IntTy = 2934 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2935 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2936 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2937 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2938 } 2939 2940 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2941 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 2942 // Reuse existing vector loop preheader for TC checks. 2943 // Note that new preheader block is generated for vector loop. 2944 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2945 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2946 2947 // Generate code to check if the loop's trip count is less than VF * UF, or 2948 // equal to it in case a scalar epilogue is required; this implies that the 2949 // vector trip count is zero. This check also covers the case where adding one 2950 // to the backedge-taken count overflowed leading to an incorrect trip count 2951 // of zero. In this case we will also jump to the scalar loop. 2952 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 2953 : ICmpInst::ICMP_ULT; 2954 2955 // If tail is to be folded, vector loop takes care of all iterations. 2956 Type *CountTy = Count->getType(); 2957 Value *CheckMinIters = Builder.getFalse(); 2958 Value *Step = createStepForVF(Builder, CountTy, VF, UF); 2959 if (!Cost->foldTailByMasking()) 2960 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 2961 else if (VF.isScalable()) { 2962 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2963 // an overflow to zero when updating induction variables and so an 2964 // additional overflow check is required before entering the vector loop. 2965 2966 // Get the maximum unsigned value for the type. 2967 Value *MaxUIntTripCount = 2968 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 2969 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 2970 2971 // Don't execute the vector loop if (UMax - n) < (VF * UF). 2972 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step); 2973 } 2974 // Create new preheader for vector loop. 
2975 LoopVectorPreHeader = 2976 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2977 "vector.ph"); 2978 2979 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2980 DT->getNode(Bypass)->getIDom()) && 2981 "TC check is expected to dominate Bypass"); 2982 2983 // Update dominator for Bypass & LoopExit (if needed). 2984 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2985 if (!Cost->requiresScalarEpilogue(VF)) 2986 // If there is an epilogue which must run, there's no edge from the 2987 // middle block to exit blocks and thus no need to update the immediate 2988 // dominator of the exit blocks. 2989 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2990 2991 ReplaceInstWithInst( 2992 TCCheckBlock->getTerminator(), 2993 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2994 LoopBypassBlocks.push_back(TCCheckBlock); 2995 } 2996 2997 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 2998 2999 BasicBlock *const SCEVCheckBlock = 3000 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); 3001 if (!SCEVCheckBlock) 3002 return nullptr; 3003 3004 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3005 (OptForSizeBasedOnProfile && 3006 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3007 "Cannot SCEV check stride or overflow when optimizing for size"); 3008 3009 3010 // Update dominator only if this is first RT check. 3011 if (LoopBypassBlocks.empty()) { 3012 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3013 if (!Cost->requiresScalarEpilogue(VF)) 3014 // If there is an epilogue which must run, there's no edge from the 3015 // middle block to exit blocks and thus no need to update the immediate 3016 // dominator of the exit blocks. 3017 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3018 } 3019 3020 LoopBypassBlocks.push_back(SCEVCheckBlock); 3021 AddedSafetyChecks = true; 3022 return SCEVCheckBlock; 3023 } 3024 3025 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { 3026 // VPlan-native path does not do any analysis for runtime checks currently. 3027 if (EnableVPlanNativePath) 3028 return nullptr; 3029 3030 BasicBlock *const MemCheckBlock = 3031 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); 3032 3033 // Check if we generated code that checks in runtime if arrays overlap. We put 3034 // the checks into a separate block to make the more common case of few 3035 // elements faster. 3036 if (!MemCheckBlock) 3037 return nullptr; 3038 3039 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3040 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3041 "Cannot emit memory checks when optimizing for size, unless forced " 3042 "to vectorize."); 3043 ORE->emit([&]() { 3044 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3045 OrigLoop->getStartLoc(), 3046 OrigLoop->getHeader()) 3047 << "Code-size may be reduced by not forcing " 3048 "vectorization, or by source-code modifications " 3049 "eliminating the need for runtime checks " 3050 "(e.g., adding 'restrict')."; 3051 }); 3052 } 3053 3054 LoopBypassBlocks.push_back(MemCheckBlock); 3055 3056 AddedSafetyChecks = true; 3057 3058 // Only use noalias metadata when using memory checks guaranteeing no overlap 3059 // across all iterations. 
3060   if (!Legal->getLAI()->getRuntimePointerChecking()->getDiffChecks()) {
3061     // We currently don't use LoopVersioning for the actual loop cloning but we
3062     // still use it to add the noalias metadata.
3063     LVer = std::make_unique<LoopVersioning>(
3064         *Legal->getLAI(),
3065         Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3066         DT, PSE.getSE());
3067     LVer->prepareNoAliasMetadata();
3068   }
3069   return MemCheckBlock;
3070 }
3071
3072 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3073   LoopScalarBody = OrigLoop->getHeader();
3074   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3075   assert(LoopVectorPreHeader && "Invalid loop structure");
3076   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3077   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3078          "multiple exit loop without required epilogue?");
3079
3080   LoopMiddleBlock =
3081       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3082                  LI, nullptr, Twine(Prefix) + "middle.block");
3083   LoopScalarPreHeader =
3084       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3085                  nullptr, Twine(Prefix) + "scalar.ph");
3086
3087   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3088
3089   // Set up the middle block terminator. Two cases:
3090   // 1) If we know that we must execute the scalar epilogue, emit an
3091   //    unconditional branch.
3092   // 2) Otherwise, we must have a single unique exit block (due to how we
3093   //    implement the multiple exit case). In this case, set up a conditional
3094   //    branch from the middle block to the loop scalar preheader, and the
3095   //    exit block. completeLoopSkeleton will update the condition to use an
3096   //    iteration check, if required to decide whether to execute the remainder.
3097   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3098       BranchInst::Create(LoopScalarPreHeader) :
3099       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3100                          Builder.getTrue());
3101   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3102   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3103
3104   // Update dominator for loop exit. During skeleton creation, only the vector
3105   // pre-header and the middle block are created. The vector loop is entirely
3106   // created during VPlan execution.
3107   if (!Cost->requiresScalarEpilogue(VF))
3108     // If there is an epilogue which must run, there's no edge from the
3109     // middle block to exit blocks and thus no need to update the immediate
3110     // dominator of the exit blocks.
3111     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3112 }
3113
3114 void InnerLoopVectorizer::createInductionResumeValues(
3115     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3116   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3117           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3118          "Inconsistent information about additional bypass.");
3119
3120   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3121   assert(VectorTripCount && "Expected valid arguments");
3122   // We are going to resume the execution of the scalar loop.
3123   // Go over all of the induction variables that we found and fix the
3124   // PHIs that are left in the scalar version of the loop.
3125   // The starting values of PHI nodes depend on the counter of the last
3126   // iteration in the vectorized loop.
3127   // If we come from a bypass edge then we need to start from the original
3128   // start value.
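  // Shorthand sketch (illustrative, for a primary induction counting up from
  // 0 by 1):
  //   scalar.ph:
  //     %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %bypass ]
  // where %n.vec is the vector trip count and %bypass stands for any of the
  // runtime-check blocks that branch around the vector loop.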
3129 Instruction *OldInduction = Legal->getPrimaryInduction(); 3130 for (auto &InductionEntry : Legal->getInductionVars()) { 3131 PHINode *OrigPhi = InductionEntry.first; 3132 InductionDescriptor II = InductionEntry.second; 3133 3134 // Create phi nodes to merge from the backedge-taken check block. 3135 PHINode *BCResumeVal = 3136 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3137 LoopScalarPreHeader->getTerminator()); 3138 // Copy original phi DL over to the new one. 3139 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3140 Value *&EndValue = IVEndValues[OrigPhi]; 3141 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3142 if (OrigPhi == OldInduction) { 3143 // We know what the end value is. 3144 EndValue = VectorTripCount; 3145 } else { 3146 IRBuilder<> B(LoopVectorPreHeader->getTerminator()); 3147 3148 // Fast-math-flags propagate from the original induction instruction. 3149 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3150 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3151 3152 Type *StepType = II.getStep()->getType(); 3153 Instruction::CastOps CastOp = 3154 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3155 Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc"); 3156 Value *Step = 3157 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3158 EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); 3159 EndValue->setName("ind.end"); 3160 3161 // Compute the end value for the additional bypass (if applicable). 3162 if (AdditionalBypass.first) { 3163 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3164 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3165 StepType, true); 3166 Value *Step = 3167 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3168 VTC = 3169 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc"); 3170 EndValueFromAdditionalBypass = 3171 emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); 3172 EndValueFromAdditionalBypass->setName("ind.end"); 3173 } 3174 } 3175 // The new PHI merges the original incoming value, in case of a bypass, 3176 // or the value at the end of the vectorized loop. 3177 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3178 3179 // Fix the scalar body counter (PHI node). 3180 // The old induction's phi node in the scalar body needs the truncated 3181 // value. 3182 for (BasicBlock *BB : LoopBypassBlocks) 3183 BCResumeVal->addIncoming(II.getStartValue(), BB); 3184 3185 if (AdditionalBypass.first) 3186 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3187 EndValueFromAdditionalBypass); 3188 3189 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3190 } 3191 } 3192 3193 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) { 3194 // The trip counts should be cached by now. 3195 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 3196 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3197 3198 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3199 3200 // Add a check in the middle block to see if we have completed 3201 // all of the iterations in the first vector loop. Three cases: 3202 // 1) If we require a scalar epilogue, there is no conditional branch as 3203 // we unconditionally branch to the scalar preheader. Do nothing. 3204 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 
3205 // Thus if tail is to be folded, we know we don't need to run the 3206 // remainder and we can use the previous value for the condition (true). 3207 // 3) Otherwise, construct a runtime check. 3208 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3209 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3210 Count, VectorTripCount, "cmp.n", 3211 LoopMiddleBlock->getTerminator()); 3212 3213 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3214 // of the corresponding compare because they may have ended up with 3215 // different line numbers and we want to avoid awkward line stepping while 3216 // debugging. Eg. if the compare has got a line number inside the loop. 3217 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3218 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3219 } 3220 3221 #ifdef EXPENSIVE_CHECKS 3222 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3223 #endif 3224 3225 return LoopVectorPreHeader; 3226 } 3227 3228 std::pair<BasicBlock *, Value *> 3229 InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3230 /* 3231 In this function we generate a new loop. The new loop will contain 3232 the vectorized instructions while the old loop will continue to run the 3233 scalar remainder. 3234 3235 [ ] <-- loop iteration number check. 3236 / | 3237 / v 3238 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3239 | / | 3240 | / v 3241 || [ ] <-- vector pre header. 3242 |/ | 3243 | v 3244 | [ ] \ 3245 | [ ]_| <-- vector loop (created during VPlan execution). 3246 | | 3247 | v 3248 \ -[ ] <--- middle-block. 3249 \/ | 3250 /\ v 3251 | ->[ ] <--- new preheader. 3252 | | 3253 (opt) v <-- edge from middle to exit iff epilogue is not required. 3254 | [ ] \ 3255 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3256 \ | 3257 \ v 3258 >[ ] <-- exit block(s). 3259 ... 3260 */ 3261 3262 // Get the metadata of the original loop before it gets modified. 3263 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3264 3265 // Workaround! Compute the trip count of the original loop and cache it 3266 // before we start modifying the CFG. This code has a systemic problem 3267 // wherein it tries to run analysis over partially constructed IR; this is 3268 // wrong, and not simply for SCEV. The trip count of the original loop 3269 // simply happens to be prone to hitting this in practice. In theory, we 3270 // can hit the same issue for any SCEV, or ValueTracking query done during 3271 // mutation. See PR49900. 3272 getOrCreateTripCount(OrigLoop->getLoopPreheader()); 3273 3274 // Create an empty vector loop, and prepare basic blocks for the runtime 3275 // checks. 3276 createVectorLoopSkeleton(""); 3277 3278 // Now, compare the new count to zero. If it is zero skip the vector loop and 3279 // jump to the scalar loop. This check also covers the case where the 3280 // backedge-taken count is uint##_max: adding one to it will overflow leading 3281 // to an incorrect trip count of zero. In this (rare) case we will also jump 3282 // to the scalar loop. 3283 emitIterationCountCheck(LoopScalarPreHeader); 3284 3285 // Generate the code to check any assumptions that we've made for SCEV 3286 // expressions. 3287 emitSCEVChecks(LoopScalarPreHeader); 3288 3289 // Generate the code that checks in runtime if arrays overlap. We put the 3290 // checks into a separate block to make the more common case of few elements 3291 // faster. 
3292 emitMemRuntimeChecks(LoopScalarPreHeader); 3293 3294 // Emit phis for the new starting index of the scalar loop. 3295 createInductionResumeValues(); 3296 3297 return {completeLoopSkeleton(OrigLoopID), nullptr}; 3298 } 3299 3300 // Fix up external users of the induction variable. At this point, we are 3301 // in LCSSA form, with all external PHIs that use the IV having one input value, 3302 // coming from the remainder loop. We need those PHIs to also have a correct 3303 // value for the IV when arriving directly from the middle block. 3304 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3305 const InductionDescriptor &II, 3306 Value *VectorTripCount, Value *EndValue, 3307 BasicBlock *MiddleBlock, 3308 BasicBlock *VectorHeader, VPlan &Plan) { 3309 // There are two kinds of external IV usages - those that use the value 3310 // computed in the last iteration (the PHI) and those that use the penultimate 3311 // value (the value that feeds into the phi from the loop latch). 3312 // We allow both, but they, obviously, have different values. 3313 3314 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3315 3316 DenseMap<Value *, Value *> MissingVals; 3317 3318 // An external user of the last iteration's value should see the value that 3319 // the remainder loop uses to initialize its own IV. 3320 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3321 for (User *U : PostInc->users()) { 3322 Instruction *UI = cast<Instruction>(U); 3323 if (!OrigLoop->contains(UI)) { 3324 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3325 MissingVals[UI] = EndValue; 3326 } 3327 } 3328 3329 // An external user of the penultimate value need to see EndValue - Step. 3330 // The simplest way to get this is to recompute it from the constituent SCEVs, 3331 // that is Start + (Step * (CRD - 1)). 3332 for (User *U : OrigPhi->users()) { 3333 auto *UI = cast<Instruction>(U); 3334 if (!OrigLoop->contains(UI)) { 3335 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3336 3337 IRBuilder<> B(MiddleBlock->getTerminator()); 3338 3339 // Fast-math-flags propagate from the original induction instruction. 3340 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3341 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3342 3343 Value *CountMinusOne = B.CreateSub( 3344 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); 3345 Value *CMO = 3346 !II.getStep()->getType()->isIntegerTy() 3347 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3348 II.getStep()->getType()) 3349 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3350 CMO->setName("cast.cmo"); 3351 3352 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 3353 VectorHeader->getTerminator()); 3354 Value *Escape = 3355 emitTransformedIndex(B, CMO, II.getStartValue(), Step, II); 3356 Escape->setName("ind.escape"); 3357 MissingVals[UI] = Escape; 3358 } 3359 } 3360 3361 for (auto &I : MissingVals) { 3362 PHINode *PHI = cast<PHINode>(I.first); 3363 // One corner case we have to handle is two IVs "chasing" each-other, 3364 // that is %IV2 = phi [...], [ %IV1, %latch ] 3365 // In this case, if IV1 has an external use, we need to avoid adding both 3366 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3367 // don't already have an incoming value for the middle block. 
3368 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) { 3369 PHI->addIncoming(I.second, MiddleBlock); 3370 Plan.removeLiveOut(PHI); 3371 } 3372 } 3373 } 3374 3375 namespace { 3376 3377 struct CSEDenseMapInfo { 3378 static bool canHandle(const Instruction *I) { 3379 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3380 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3381 } 3382 3383 static inline Instruction *getEmptyKey() { 3384 return DenseMapInfo<Instruction *>::getEmptyKey(); 3385 } 3386 3387 static inline Instruction *getTombstoneKey() { 3388 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3389 } 3390 3391 static unsigned getHashValue(const Instruction *I) { 3392 assert(canHandle(I) && "Unknown instruction!"); 3393 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3394 I->value_op_end())); 3395 } 3396 3397 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3398 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3399 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3400 return LHS == RHS; 3401 return LHS->isIdenticalTo(RHS); 3402 } 3403 }; 3404 3405 } // end anonymous namespace 3406 3407 ///Perform cse of induction variable instructions. 3408 static void cse(BasicBlock *BB) { 3409 // Perform simple cse. 3410 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3411 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3412 if (!CSEDenseMapInfo::canHandle(&In)) 3413 continue; 3414 3415 // Check if we can replace this instruction with any of the 3416 // visited instructions. 3417 if (Instruction *V = CSEMap.lookup(&In)) { 3418 In.replaceAllUsesWith(V); 3419 In.eraseFromParent(); 3420 continue; 3421 } 3422 3423 CSEMap[&In] = &In; 3424 } 3425 } 3426 3427 InstructionCost 3428 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3429 bool &NeedToScalarize) const { 3430 Function *F = CI->getCalledFunction(); 3431 Type *ScalarRetTy = CI->getType(); 3432 SmallVector<Type *, 4> Tys, ScalarTys; 3433 for (auto &ArgOp : CI->args()) 3434 ScalarTys.push_back(ArgOp->getType()); 3435 3436 // Estimate cost of scalarized vector call. The source operands are assumed 3437 // to be vectors, so we need to extract individual elements from there, 3438 // execute VF scalar calls, and then gather the result into the vector return 3439 // value. 3440 InstructionCost ScalarCallCost = 3441 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3442 if (VF.isScalar()) 3443 return ScalarCallCost; 3444 3445 // Compute corresponding vector type for return value and arguments. 3446 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3447 for (Type *ScalarTy : ScalarTys) 3448 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3449 3450 // Compute costs of unpacking argument values for the scalar calls and 3451 // packing the return values to a vector. 3452 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3453 3454 InstructionCost Cost = 3455 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3456 3457 // If we can't emit a vector call for this function, then the currently found 3458 // cost is the cost we need to return. 3459 NeedToScalarize = true; 3460 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3461 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3462 3463 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3464 return Cost; 3465 3466 // If the corresponding vector cost is cheaper, return its cost. 
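  // (Illustrative numbers only: with VF = 4, a ScalarCallCost of 10 and a
  // ScalarizationCost of 12, the scalarized estimate is 4 * 10 + 12 = 52; a
  // vector variant costing, say, 20 is then preferred and NeedToScalarize is
  // cleared.)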
3467 InstructionCost VectorCallCost = 3468 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3469 if (VectorCallCost < Cost) { 3470 NeedToScalarize = false; 3471 Cost = VectorCallCost; 3472 } 3473 return Cost; 3474 } 3475 3476 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3477 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3478 return Elt; 3479 return VectorType::get(Elt, VF); 3480 } 3481 3482 InstructionCost 3483 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3484 ElementCount VF) const { 3485 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3486 assert(ID && "Expected intrinsic call!"); 3487 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3488 FastMathFlags FMF; 3489 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3490 FMF = FPMO->getFastMathFlags(); 3491 3492 SmallVector<const Value *> Arguments(CI->args()); 3493 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3494 SmallVector<Type *> ParamTys; 3495 std::transform(FTy->param_begin(), FTy->param_end(), 3496 std::back_inserter(ParamTys), 3497 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3498 3499 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3500 dyn_cast<IntrinsicInst>(CI)); 3501 return TTI.getIntrinsicInstrCost(CostAttrs, 3502 TargetTransformInfo::TCK_RecipThroughput); 3503 } 3504 3505 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3506 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3507 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3508 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3509 } 3510 3511 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3512 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3513 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3514 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3515 } 3516 3517 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3518 // For every instruction `I` in MinBWs, truncate the operands, create a 3519 // truncated version of `I` and reextend its result. InstCombine runs 3520 // later and will remove any ext/trunc pairs. 3521 SmallPtrSet<Value *, 4> Erased; 3522 for (const auto &KV : Cost->getMinimalBitwidths()) { 3523 // If the value wasn't vectorized, we must maintain the original scalar 3524 // type. The absence of the value from State indicates that it 3525 // wasn't vectorized. 3526 // FIXME: Should not rely on getVPValue at this point. 3527 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3528 if (!State.hasAnyVectorValue(Def)) 3529 continue; 3530 for (unsigned Part = 0; Part < UF; ++Part) { 3531 Value *I = State.get(Def, Part); 3532 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3533 continue; 3534 Type *OriginalTy = I->getType(); 3535 Type *ScalarTruncatedTy = 3536 IntegerType::get(OriginalTy->getContext(), KV.second); 3537 auto *TruncatedTy = VectorType::get( 3538 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3539 if (TruncatedTy == OriginalTy) 3540 continue; 3541 3542 IRBuilder<> B(cast<Instruction>(I)); 3543 auto ShrinkOperand = [&](Value *V) -> Value * { 3544 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3545 if (ZI->getSrcTy() == TruncatedTy) 3546 return ZI->getOperand(0); 3547 return B.CreateZExtOrTrunc(V, TruncatedTy); 3548 }; 3549 3550 // The actual instruction modification depends on the instruction type, 3551 // unfortunately. 
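      // (Illustrative: an i32 add whose minimal bitwidth is 8, with VF = 4,
      // becomes
      //   %x.tr = trunc <4 x i32> %x to <4 x i8>
      //   %y.tr = trunc <4 x i32> %y to <4 x i8>
      //   %s    = add <4 x i8> %x.tr, %y.tr
      //   %res  = zext <4 x i8> %s to <4 x i32>
      // and InstCombine later removes any redundant ext/trunc pairs.)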
3552 Value *NewI = nullptr; 3553 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3554 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3555 ShrinkOperand(BO->getOperand(1))); 3556 3557 // Any wrapping introduced by shrinking this operation shouldn't be 3558 // considered undefined behavior. So, we can't unconditionally copy 3559 // arithmetic wrapping flags to NewI. 3560 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3561 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3562 NewI = 3563 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3564 ShrinkOperand(CI->getOperand(1))); 3565 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3566 NewI = B.CreateSelect(SI->getCondition(), 3567 ShrinkOperand(SI->getTrueValue()), 3568 ShrinkOperand(SI->getFalseValue())); 3569 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3570 switch (CI->getOpcode()) { 3571 default: 3572 llvm_unreachable("Unhandled cast!"); 3573 case Instruction::Trunc: 3574 NewI = ShrinkOperand(CI->getOperand(0)); 3575 break; 3576 case Instruction::SExt: 3577 NewI = B.CreateSExtOrTrunc( 3578 CI->getOperand(0), 3579 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3580 break; 3581 case Instruction::ZExt: 3582 NewI = B.CreateZExtOrTrunc( 3583 CI->getOperand(0), 3584 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3585 break; 3586 } 3587 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3588 auto Elements0 = 3589 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3590 auto *O0 = B.CreateZExtOrTrunc( 3591 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3592 auto Elements1 = 3593 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3594 auto *O1 = B.CreateZExtOrTrunc( 3595 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3596 3597 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3598 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3599 // Don't do anything with the operands, just extend the result. 3600 continue; 3601 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3602 auto Elements = 3603 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3604 auto *O0 = B.CreateZExtOrTrunc( 3605 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3606 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3607 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3608 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3609 auto Elements = 3610 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3611 auto *O0 = B.CreateZExtOrTrunc( 3612 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3613 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3614 } else { 3615 // If we don't know what to do, be conservative and don't do anything. 3616 continue; 3617 } 3618 3619 // Lastly, extend the result. 3620 NewI->takeName(cast<Instruction>(I)); 3621 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3622 I->replaceAllUsesWith(Res); 3623 cast<Instruction>(I)->eraseFromParent(); 3624 Erased.insert(I); 3625 State.reset(Def, Res, Part); 3626 } 3627 } 3628 3629 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3630 for (const auto &KV : Cost->getMinimalBitwidths()) { 3631 // If the value wasn't vectorized, we must maintain the original scalar 3632 // type. The absence of the value from State indicates that it 3633 // wasn't vectorized. 3634 // FIXME: Should not rely on getVPValue at this point. 
3635 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3636 if (!State.hasAnyVectorValue(Def)) 3637 continue; 3638 for (unsigned Part = 0; Part < UF; ++Part) { 3639 Value *I = State.get(Def, Part); 3640 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3641 if (Inst && Inst->use_empty()) { 3642 Value *NewI = Inst->getOperand(0); 3643 Inst->eraseFromParent(); 3644 State.reset(Def, NewI, Part); 3645 } 3646 } 3647 } 3648 } 3649 3650 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, 3651 VPlan &Plan) { 3652 // Insert truncates and extends for any truncated instructions as hints to 3653 // InstCombine. 3654 if (VF.isVector()) 3655 truncateToMinimalBitwidths(State); 3656 3657 // Fix widened non-induction PHIs by setting up the PHI operands. 3658 if (EnableVPlanNativePath) 3659 fixNonInductionPHIs(Plan, State); 3660 3661 // At this point every instruction in the original loop is widened to a 3662 // vector form. Now we need to fix the recurrences in the loop. These PHI 3663 // nodes are currently empty because we did not want to introduce cycles. 3664 // This is the second stage of vectorizing recurrences. 3665 fixCrossIterationPHIs(State); 3666 3667 // Forget the original basic block. 3668 PSE.getSE()->forgetLoop(OrigLoop); 3669 3670 VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); 3671 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); 3672 if (Cost->requiresScalarEpilogue(VF)) { 3673 // No edge from the middle block to the unique exit block has been inserted 3674 // and there is nothing to fix from vector loop; phis should have incoming 3675 // from scalar loop only. 3676 Plan.clearLiveOuts(); 3677 } else { 3678 // If we inserted an edge from the middle block to the unique exit block, 3679 // update uses outside the loop (phis) to account for the newly inserted 3680 // edge. 3681 3682 // Fix-up external users of the induction variables. 3683 for (auto &Entry : Legal->getInductionVars()) 3684 fixupIVUsers(Entry.first, Entry.second, 3685 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), 3686 IVEndValues[Entry.first], LoopMiddleBlock, 3687 VectorLoop->getHeader(), Plan); 3688 } 3689 3690 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated 3691 // in the exit block, so update the builder. 3692 State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI()); 3693 for (auto &KV : Plan.getLiveOuts()) 3694 KV.second->fixPhi(Plan, State); 3695 3696 for (Instruction *PI : PredicatedInstructions) 3697 sinkScalarOperands(&*PI); 3698 3699 // Remove redundant induction instructions. 3700 cse(VectorLoop->getHeader()); 3701 3702 // Set/update profile weights for the vector and remainder loops as original 3703 // loop iterations are now distributed among them. Note that original loop 3704 // represented by LoopScalarBody becomes remainder loop after vectorization. 3705 // 3706 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3707 // end up getting slightly roughened result but that should be OK since 3708 // profile is not inherently precise anyway. Note also possible bypass of 3709 // vector code caused by legality checks is ignored, assigning all the weight 3710 // to the vector loop, optimistically. 3711 // 3712 // For scalable vectorization we can't know at compile time how many iterations 3713 // of the loop are handled in one vector iteration, so instead assume a pessimistic 3714 // vscale of '1'. 
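  // (Illustrative: if the original loop's profile implies ~1000 iterations and
  // VF * UF == 8, the vector loop is assigned weights corresponding to roughly
  // 1000 / 8 iterations, with the scalar remainder keeping a small residual.)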
3715 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop, 3716 LI->getLoopFor(LoopScalarBody), 3717 VF.getKnownMinValue() * UF); 3718 } 3719 3720 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 3721 // In order to support recurrences we need to be able to vectorize Phi nodes. 3722 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3723 // stage #2: We now need to fix the recurrences by adding incoming edges to 3724 // the currently empty PHI nodes. At this point every instruction in the 3725 // original loop is widened to a vector form so we can use them to construct 3726 // the incoming edges. 3727 VPBasicBlock *Header = 3728 State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); 3729 for (VPRecipeBase &R : Header->phis()) { 3730 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 3731 fixReduction(ReductionPhi, State); 3732 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 3733 fixFirstOrderRecurrence(FOR, State); 3734 } 3735 } 3736 3737 void InnerLoopVectorizer::fixFirstOrderRecurrence( 3738 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { 3739 // This is the second phase of vectorizing first-order recurrences. An 3740 // overview of the transformation is described below. Suppose we have the 3741 // following loop. 3742 // 3743 // for (int i = 0; i < n; ++i) 3744 // b[i] = a[i] - a[i - 1]; 3745 // 3746 // There is a first-order recurrence on "a". For this loop, the shorthand 3747 // scalar IR looks like: 3748 // 3749 // scalar.ph: 3750 // s_init = a[-1] 3751 // br scalar.body 3752 // 3753 // scalar.body: 3754 // i = phi [0, scalar.ph], [i+1, scalar.body] 3755 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3756 // s2 = a[i] 3757 // b[i] = s2 - s1 3758 // br cond, scalar.body, ... 3759 // 3760 // In this example, s1 is a recurrence because it's value depends on the 3761 // previous iteration. In the first phase of vectorization, we created a 3762 // vector phi v1 for s1. We now complete the vectorization and produce the 3763 // shorthand vector IR shown below (for VF = 4, UF = 1). 3764 // 3765 // vector.ph: 3766 // v_init = vector(..., ..., ..., a[-1]) 3767 // br vector.body 3768 // 3769 // vector.body 3770 // i = phi [0, vector.ph], [i+4, vector.body] 3771 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3772 // v2 = a[i, i+1, i+2, i+3]; 3773 // v3 = vector(v1(3), v2(0, 1, 2)) 3774 // b[i, i+1, i+2, i+3] = v2 - v3 3775 // br cond, vector.body, middle.block 3776 // 3777 // middle.block: 3778 // x = v2(3) 3779 // br scalar.ph 3780 // 3781 // scalar.ph: 3782 // s_init = phi [x, middle.block], [a[-1], otherwise] 3783 // br scalar.body 3784 // 3785 // After execution completes the vector loop, we extract the next value of 3786 // the recurrence (x) to use as the initial value in the scalar loop. 3787 3788 // Extract the last vector element in the middle block. This will be the 3789 // initial value for the recurrence when jumping to the scalar loop. 
3790   VPValue *PreviousDef = PhiR->getBackedgeValue();
3791   Value *Incoming = State.get(PreviousDef, UF - 1);
3792   auto *ExtractForScalar = Incoming;
3793   auto *IdxTy = Builder.getInt32Ty();
3794   if (VF.isVector()) {
3795     auto *One = ConstantInt::get(IdxTy, 1);
3796     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3797     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3798     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3799     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3800                                                     "vector.recur.extract");
3801   }
3802   // Extract the second last element in the middle block if the
3803   // Phi is used outside the loop. We need to extract the phi itself
3804   // and not the last element (the phi update in the current iteration). This
3805   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3806   // when the scalar loop is not run at all.
3807   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3808   if (VF.isVector()) {
3809     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3810     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3811     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3812         Incoming, Idx, "vector.recur.extract.for.phi");
3813   } else if (UF > 1)
3814     // When the loop is unrolled without vectorizing, initialize
3815     // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
3816     // value of `Incoming`. This is analogous to the vectorized case above:
3817     // extracting the second last element when VF > 1.
3818     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3819
3820   // Fix the initial value of the original recurrence in the scalar loop.
3821   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3822   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3823   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3824   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3825   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3826     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3827     Start->addIncoming(Incoming, BB);
3828   }
3829
3830   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3831   Phi->setName("scalar.recur");
3832
3833   // Finally, fix users of the recurrence outside the loop. The users will need
3834   // either the last value of the scalar recurrence or the last value of the
3835   // vector recurrence we extracted in the middle block. Since the loop is in
3836   // LCSSA form, we just need to find all the phi nodes for the original scalar
3837   // recurrence in the exit block, and then add an edge for the middle block.
3838   // Note that LCSSA does not imply single entry when the original scalar loop
3839   // had multiple exiting edges (as we always run the last iteration in the
3840   // scalar epilogue); in that case, there is no edge from middle to exit
3841   // and thus no phis which need to be updated.
3842   if (!Cost->requiresScalarEpilogue(VF))
3843     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3844       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3845         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3846         State.Plan->removeLiveOut(&LCSSAPhi);
3847       }
3848 }
3849
3850 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3851                                        VPTransformState &State) {
3852   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
3853   // Get its reduction variable descriptor.
3854 assert(Legal->isReductionVariable(OrigPhi) && 3855 "Unable to find the reduction variable"); 3856 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 3857 3858 RecurKind RK = RdxDesc.getRecurrenceKind(); 3859 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3860 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3861 setDebugLocFromInst(ReductionStartValue); 3862 3863 VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); 3864 // This is the vector-clone of the value that leaves the loop. 3865 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 3866 3867 // Wrap flags are in general invalid after vectorization, clear them. 3868 clearReductionWrapFlags(PhiR, State); 3869 3870 // Before each round, move the insertion point right between 3871 // the PHIs and the values we are going to write. 3872 // This allows us to write both PHINodes and the extractelement 3873 // instructions. 3874 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3875 3876 setDebugLocFromInst(LoopExitInst); 3877 3878 Type *PhiTy = OrigPhi->getType(); 3879 3880 VPBasicBlock *LatchVPBB = 3881 PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock(); 3882 BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB]; 3883 // If tail is folded by masking, the vector value to leave the loop should be 3884 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3885 // instead of the former. For an inloop reduction the reduction will already 3886 // be predicated, and does not need to be handled here. 3887 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 3888 for (unsigned Part = 0; Part < UF; ++Part) { 3889 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 3890 SelectInst *Sel = nullptr; 3891 for (User *U : VecLoopExitInst->users()) { 3892 if (isa<SelectInst>(U)) { 3893 assert(!Sel && "Reduction exit feeding two selects"); 3894 Sel = cast<SelectInst>(U); 3895 } else 3896 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3897 } 3898 assert(Sel && "Reduction exit feeds no select"); 3899 State.reset(LoopExitInstDef, Sel, Part); 3900 3901 if (isa<FPMathOperator>(Sel)) 3902 Sel->setFastMathFlags(RdxDesc.getFastMathFlags()); 3903 3904 // If the target can create a predicated operator for the reduction at no 3905 // extra cost in the loop (for example a predicated vadd), it can be 3906 // cheaper for the select to remain in the loop than be sunk out of it, 3907 // and so use the select value for the phi instead of the old 3908 // LoopExitValue. 3909 if (PreferPredicatedReductionSelect || 3910 TTI->preferPredicatedReductionSelect( 3911 RdxDesc.getOpcode(), PhiTy, 3912 TargetTransformInfo::ReductionFlags())) { 3913 auto *VecRdxPhi = 3914 cast<PHINode>(State.get(PhiR, Part)); 3915 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel); 3916 } 3917 } 3918 } 3919 3920 // If the vector reduction can be performed in a smaller type, we truncate 3921 // then extend the loop exit value to enable InstCombine to evaluate the 3922 // entire expression in the smaller type. 
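  // (Illustrative, for an i32 add reduction whose recurrence type is i8 and
  // VF = 4: each unrolled part is rewritten in the vector latch as
  //   %t = trunc <4 x i32> %rdx.part to <4 x i8>
  //   %e = zext <4 x i8> %t to <4 x i32>   ; sext if the reduction is signed
  // and the final reduction in the middle block is then performed on the
  // truncated <4 x i8> values.)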
3923 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 3924 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 3925 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3926 Builder.SetInsertPoint(VectorLoopLatch->getTerminator()); 3927 VectorParts RdxParts(UF); 3928 for (unsigned Part = 0; Part < UF; ++Part) { 3929 RdxParts[Part] = State.get(LoopExitInstDef, Part); 3930 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3931 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3932 : Builder.CreateZExt(Trunc, VecTy); 3933 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 3934 if (U != Trunc) { 3935 U->replaceUsesOfWith(RdxParts[Part], Extnd); 3936 RdxParts[Part] = Extnd; 3937 } 3938 } 3939 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3940 for (unsigned Part = 0; Part < UF; ++Part) { 3941 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3942 State.reset(LoopExitInstDef, RdxParts[Part], Part); 3943 } 3944 } 3945 3946 // Reduce all of the unrolled parts into a single vector. 3947 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 3948 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 3949 3950 // The middle block terminator has already been assigned a DebugLoc here (the 3951 // OrigLoop's single latch terminator). We want the whole middle block to 3952 // appear to execute on this line because: (a) it is all compiler generated, 3953 // (b) these instructions are always executed after evaluating the latch 3954 // conditional branch, and (c) other passes may add new predecessors which 3955 // terminate on this line. This is the easiest way to ensure we don't 3956 // accidentally cause an extra step back into the loop while debugging. 3957 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 3958 if (PhiR->isOrdered()) 3959 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 3960 else { 3961 // Floating-point operations should have some FMF to enable the reduction. 3962 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 3963 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 3964 for (unsigned Part = 1; Part < UF; ++Part) { 3965 Value *RdxPart = State.get(LoopExitInstDef, Part); 3966 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 3967 ReducedPartRdx = Builder.CreateBinOp( 3968 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 3969 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 3970 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 3971 ReducedPartRdx, RdxPart); 3972 else 3973 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 3974 } 3975 } 3976 3977 // Create the reduction after the loop. Note that inloop reductions create the 3978 // target reduction in the loop using a Reduction recipe. 3979 if (VF.isVector() && !PhiR->isInLoop()) { 3980 ReducedPartRdx = 3981 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 3982 // If the reduction can be performed in a smaller type, we need to extend 3983 // the reduction to the wider type before we branch to the original loop. 3984 if (PhiTy != RdxDesc.getRecurrenceType()) 3985 ReducedPartRdx = RdxDesc.isSigned() 3986 ? 
                           Builder.CreateSExt(ReducedPartRdx, PhiTy)
                         : Builder.CreateZExt(ReducedPartRdx, PhiTy);
  }

  PHINode *ResumePhi =
      dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());

  // Create a phi node that merges control-flow from the backedge-taken check
  // block and the middle block.
  PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
                                        LoopScalarPreHeader->getTerminator());

  // If we are fixing reductions in the epilogue loop then we should already
  // have created a bc.merge.rdx Phi after the main vector body. Ensure that
  // we carry over the incoming values correctly.
  for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
    if (Incoming == LoopMiddleBlock)
      BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
    else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
      BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
                              Incoming);
    else
      BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
  }

  // Set the resume value for this reduction.
  ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});

  // If there were stores of the reduction value to a uniform memory address
  // inside the loop, create the final store here.
  if (StoreInst *SI = RdxDesc.IntermediateStore) {
    StoreInst *NewSI =
        Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
    propagateMetadata(NewSI, SI);

    // If the reduction value is used in other places,
    // then let the code below create PHIs for that.
  }

  // Now, we need to fix the users of the reduction variable
  // inside and outside of the scalar remainder loop.

  // We know that the loop is in LCSSA form. We need to update the PHI nodes
  // in the exit blocks. See the comment on the analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
  if (!Cost->requiresScalarEpilogue(VF))
    for (PHINode &LCSSAPhi : LoopExitBlock->phis())
      if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
        LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
        State.Plan->removeLiveOut(&LCSSAPhi);
      }

  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
  int IncomingEdgeBlockIdx =
      OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
  // Pick the other block.
  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ?
                                                 0 : 1);
  OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
  OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
}

void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
                                                  VPTransformState &State) {
  const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RK != RecurKind::Add && RK != RecurKind::Mul)
    return;

  SmallVector<VPValue *, 8> Worklist;
  SmallPtrSet<VPValue *, 8> Visited;
  Worklist.push_back(PhiR);
  Visited.insert(PhiR);

  while (!Worklist.empty()) {
    VPValue *Cur = Worklist.pop_back_val();
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *V = State.get(Cur, Part);
      if (!isa<OverflowingBinaryOperator>(V))
        break;
      cast<Instruction>(V)->dropPoisonGeneratingFlags();
    }

    for (VPUser *U : Cur->users()) {
      auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
      if (!UserRecipe)
        continue;
      for (VPValue *V : UserRecipe->definedValues())
        if (Visited.insert(V).second)
          Worklist.push_back(V);
    }
  }
}

void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when a full
  // pass through the worklist does not sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is not in the loop,
      // or may have side effects.
      if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
          I->mayHaveSideEffects())
        continue;

      // If the instruction is already in PredBB, check if we can sink its
      // operands. In that case, VPlan's sinkScalarOperands() succeeded in
      // sinking the scalar instruction I, hence it appears in PredBB; but it
      // may have failed to sink I's operands (recursively), which we try
      // (again) here.
4130 if (I->getParent() == PredBB) { 4131 Worklist.insert(I->op_begin(), I->op_end()); 4132 continue; 4133 } 4134 4135 // It's legal to sink the instruction if all its uses occur in the 4136 // predicated block. Otherwise, there's nothing to do yet, and we may 4137 // need to reanalyze the instruction. 4138 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4139 InstsToReanalyze.push_back(I); 4140 continue; 4141 } 4142 4143 // Move the instruction to the beginning of the predicated block, and add 4144 // it's operands to the worklist. 4145 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4146 Worklist.insert(I->op_begin(), I->op_end()); 4147 4148 // The sinking may have enabled other instructions to be sunk, so we will 4149 // need to iterate. 4150 Changed = true; 4151 } 4152 } while (Changed); 4153 } 4154 4155 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, 4156 VPTransformState &State) { 4157 auto Iter = depth_first( 4158 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry())); 4159 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 4160 for (VPRecipeBase &P : VPBB->phis()) { 4161 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P); 4162 if (!VPPhi) 4163 continue; 4164 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4165 // Make sure the builder has a valid insert point. 4166 Builder.SetInsertPoint(NewPhi); 4167 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4168 VPValue *Inc = VPPhi->getIncomingValue(i); 4169 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4170 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4171 } 4172 } 4173 } 4174 } 4175 4176 bool InnerLoopVectorizer::useOrderedReductions( 4177 const RecurrenceDescriptor &RdxDesc) { 4178 return Cost->useOrderedReductions(RdxDesc); 4179 } 4180 4181 /// A helper function for checking whether an integer division-related 4182 /// instruction may divide by zero (in which case it must be predicated if 4183 /// executed conditionally in the scalar code). 4184 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4185 /// Non-zero divisors that are non compile-time constants will not be 4186 /// converted into multiplication, so we will still end up scalarizing 4187 /// the division, but can do so w/o predication. 4188 static bool mayDivideByZero(Instruction &I) { 4189 assert((I.getOpcode() == Instruction::UDiv || 4190 I.getOpcode() == Instruction::SDiv || 4191 I.getOpcode() == Instruction::URem || 4192 I.getOpcode() == Instruction::SRem) && 4193 "Unexpected instruction"); 4194 Value *Divisor = I.getOperand(1); 4195 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4196 return !CInt || CInt->isZero(); 4197 } 4198 4199 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4200 VPUser &ArgOperands, 4201 VPTransformState &State) { 4202 assert(!isa<DbgInfoIntrinsic>(I) && 4203 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4204 setDebugLocFromInst(&I); 4205 4206 Module *M = I.getParent()->getParent()->getParent(); 4207 auto *CI = cast<CallInst>(&I); 4208 4209 SmallVector<Type *, 4> Tys; 4210 for (Value *ArgOperand : CI->args()) 4211 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4212 4213 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4214 4215 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4216 // version of the instruction. 4217 // Is it beneficial to perform intrinsic call compared to lib call? 
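  // For example (illustrative): a call that maps to llvm.sqrt may either be
  // widened to the vector form of the intrinsic or be replaced by a vector
  // library function found through the TLI/VFDatabase mappings; the cheaper
  // alternative according to the cost model is emitted below.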
4218 bool NeedToScalarize = false; 4219 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4220 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4221 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4222 assert((UseVectorIntrinsic || !NeedToScalarize) && 4223 "Instruction should be scalarized elsewhere."); 4224 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4225 "Either the intrinsic cost or vector call cost must be valid"); 4226 4227 for (unsigned Part = 0; Part < UF; ++Part) { 4228 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4229 SmallVector<Value *, 4> Args; 4230 for (auto &I : enumerate(ArgOperands.operands())) { 4231 // Some intrinsics have a scalar argument - don't replace it with a 4232 // vector. 4233 Value *Arg; 4234 if (!UseVectorIntrinsic || 4235 !isVectorIntrinsicWithScalarOpAtArg(ID, I.index())) 4236 Arg = State.get(I.value(), Part); 4237 else 4238 Arg = State.get(I.value(), VPIteration(0, 0)); 4239 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index())) 4240 TysForDecl.push_back(Arg->getType()); 4241 Args.push_back(Arg); 4242 } 4243 4244 Function *VectorF; 4245 if (UseVectorIntrinsic) { 4246 // Use vector version of the intrinsic. 4247 if (VF.isVector()) 4248 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4249 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4250 assert(VectorF && "Can't retrieve vector intrinsic."); 4251 } else { 4252 // Use vector version of the function call. 4253 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4254 #ifndef NDEBUG 4255 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4256 "Can't create vector function."); 4257 #endif 4258 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4259 } 4260 SmallVector<OperandBundleDef, 1> OpBundles; 4261 CI->getOperandBundlesAsDefs(OpBundles); 4262 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4263 4264 if (isa<FPMathOperator>(V)) 4265 V->copyFastMathFlags(CI); 4266 4267 State.set(Def, V, Part); 4268 addMetadata(V, &I); 4269 } 4270 } 4271 4272 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4273 // We should not collect Scalars more than once per VF. Right now, this 4274 // function is called from collectUniformsAndScalars(), which already does 4275 // this check. Collecting Scalars for VF=1 does not make any sense. 4276 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4277 "This function should not be visited twice for the same VF"); 4278 4279 // This avoids any chances of creating a REPLICATE recipe during planning 4280 // since that would result in generation of scalarized code during execution, 4281 // which is not supported for scalable vectors. 4282 if (VF.isScalable()) { 4283 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4284 return; 4285 } 4286 4287 SmallSetVector<Instruction *, 8> Worklist; 4288 4289 // These sets are used to seed the analysis with pointers used by memory 4290 // accesses that will remain scalar. 4291 SmallSetVector<Instruction *, 8> ScalarPtrs; 4292 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4293 auto *Latch = TheLoop->getLoopLatch(); 4294 4295 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4296 // The pointer operands of loads and stores will be scalar as long as the 4297 // memory access is not a gather or scatter operation. The value operand of a 4298 // store will remain scalar if the store is scalarized. 
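  // For example (illustrative): the address of a consecutive (widened) or
  // interleaved access is computed with scalar GEPs, so its pointer operand
  // counts as a scalar use, whereas a gather/scatter consumes a whole vector
  // of pointers and its pointer operand must not be treated as scalar here.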
4299 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4300 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4301 assert(WideningDecision != CM_Unknown && 4302 "Widening decision should be ready at this moment"); 4303 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4304 if (Ptr == Store->getValueOperand()) 4305 return WideningDecision == CM_Scalarize; 4306 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4307 "Ptr is neither a value or pointer operand"); 4308 return WideningDecision != CM_GatherScatter; 4309 }; 4310 4311 // A helper that returns true if the given value is a bitcast or 4312 // getelementptr instruction contained in the loop. 4313 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4314 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4315 isa<GetElementPtrInst>(V)) && 4316 !TheLoop->isLoopInvariant(V); 4317 }; 4318 4319 // A helper that evaluates a memory access's use of a pointer. If the use will 4320 // be a scalar use and the pointer is only used by memory accesses, we place 4321 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4322 // PossibleNonScalarPtrs. 4323 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4324 // We only care about bitcast and getelementptr instructions contained in 4325 // the loop. 4326 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4327 return; 4328 4329 // If the pointer has already been identified as scalar (e.g., if it was 4330 // also identified as uniform), there's nothing to do. 4331 auto *I = cast<Instruction>(Ptr); 4332 if (Worklist.count(I)) 4333 return; 4334 4335 // If the use of the pointer will be a scalar use, and all users of the 4336 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4337 // place the pointer in PossibleNonScalarPtrs. 4338 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4339 return isa<LoadInst>(U) || isa<StoreInst>(U); 4340 })) 4341 ScalarPtrs.insert(I); 4342 else 4343 PossibleNonScalarPtrs.insert(I); 4344 }; 4345 4346 // We seed the scalars analysis with three classes of instructions: (1) 4347 // instructions marked uniform-after-vectorization and (2) bitcast, 4348 // getelementptr and (pointer) phi instructions used by memory accesses 4349 // requiring a scalar use. 4350 // 4351 // (1) Add to the worklist all instructions that have been identified as 4352 // uniform-after-vectorization. 4353 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4354 4355 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4356 // memory accesses requiring a scalar use. The pointer operands of loads and 4357 // stores will be scalar as long as the memory accesses is not a gather or 4358 // scatter operation. The value operand of a store will remain scalar if the 4359 // store is scalarized. 4360 for (auto *BB : TheLoop->blocks()) 4361 for (auto &I : *BB) { 4362 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4363 evaluatePtrUse(Load, Load->getPointerOperand()); 4364 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4365 evaluatePtrUse(Store, Store->getPointerOperand()); 4366 evaluatePtrUse(Store, Store->getValueOperand()); 4367 } 4368 } 4369 for (auto *I : ScalarPtrs) 4370 if (!PossibleNonScalarPtrs.count(I)) { 4371 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4372 Worklist.insert(I); 4373 } 4374 4375 // Insert the forced scalars. 
4376 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector 4377 // induction variable when the PHI user is scalarized. 4378 auto ForcedScalar = ForcedScalars.find(VF); 4379 if (ForcedScalar != ForcedScalars.end()) 4380 for (auto *I : ForcedScalar->second) 4381 Worklist.insert(I); 4382 4383 // Expand the worklist by looking through any bitcasts and getelementptr 4384 // instructions we've already identified as scalar. This is similar to the 4385 // expansion step in collectLoopUniforms(); however, here we're only 4386 // expanding to include additional bitcasts and getelementptr instructions. 4387 unsigned Idx = 0; 4388 while (Idx != Worklist.size()) { 4389 Instruction *Dst = Worklist[Idx++]; 4390 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4391 continue; 4392 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4393 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4394 auto *J = cast<Instruction>(U); 4395 return !TheLoop->contains(J) || Worklist.count(J) || 4396 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4397 isScalarUse(J, Src)); 4398 })) { 4399 Worklist.insert(Src); 4400 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4401 } 4402 } 4403 4404 // An induction variable will remain scalar if all users of the induction 4405 // variable and induction variable update remain scalar. 4406 for (auto &Induction : Legal->getInductionVars()) { 4407 auto *Ind = Induction.first; 4408 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4409 4410 // If tail-folding is applied, the primary induction variable will be used 4411 // to feed a vector compare. 4412 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4413 continue; 4414 4415 // Returns true if \p Indvar is a pointer induction that is used directly by 4416 // load/store instruction \p I. 4417 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4418 Instruction *I) { 4419 return Induction.second.getKind() == 4420 InductionDescriptor::IK_PtrInduction && 4421 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4422 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4423 }; 4424 4425 // Determine if all users of the induction variable are scalar after 4426 // vectorization. 4427 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4428 auto *I = cast<Instruction>(U); 4429 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4430 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4431 }); 4432 if (!ScalarInd) 4433 continue; 4434 4435 // Determine if all users of the induction variable update instruction are 4436 // scalar after vectorization. 4437 auto ScalarIndUpdate = 4438 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4439 auto *I = cast<Instruction>(U); 4440 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4441 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4442 }); 4443 if (!ScalarIndUpdate) 4444 continue; 4445 4446 // The induction variable and its update instruction will remain scalar. 
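    // For example (illustrative): a pointer induction whose only users are
    // its own update and loads/stores that use it directly as a (scalar)
    // address remains scalar after vectorization, together with the update
    // instruction.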
4447 Worklist.insert(Ind); 4448 Worklist.insert(IndUpdate); 4449 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4450 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4451 << "\n"); 4452 } 4453 4454 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4455 } 4456 4457 bool LoopVectorizationCostModel::isScalarWithPredication( 4458 Instruction *I, ElementCount VF) const { 4459 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4460 return false; 4461 switch(I->getOpcode()) { 4462 default: 4463 break; 4464 case Instruction::Load: 4465 case Instruction::Store: { 4466 if (!Legal->isMaskRequired(I)) 4467 return false; 4468 auto *Ptr = getLoadStorePointerOperand(I); 4469 auto *Ty = getLoadStoreType(I); 4470 Type *VTy = Ty; 4471 if (VF.isVector()) 4472 VTy = VectorType::get(Ty, VF); 4473 const Align Alignment = getLoadStoreAlignment(I); 4474 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4475 TTI.isLegalMaskedGather(VTy, Alignment)) 4476 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4477 TTI.isLegalMaskedScatter(VTy, Alignment)); 4478 } 4479 case Instruction::UDiv: 4480 case Instruction::SDiv: 4481 case Instruction::SRem: 4482 case Instruction::URem: 4483 return mayDivideByZero(*I); 4484 } 4485 return false; 4486 } 4487 4488 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4489 Instruction *I, ElementCount VF) { 4490 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4491 assert(getWideningDecision(I, VF) == CM_Unknown && 4492 "Decision should not be set yet."); 4493 auto *Group = getInterleavedAccessGroup(I); 4494 assert(Group && "Must have a group."); 4495 4496 // If the instruction's allocated size doesn't equal it's type size, it 4497 // requires padding and will be scalarized. 4498 auto &DL = I->getModule()->getDataLayout(); 4499 auto *ScalarTy = getLoadStoreType(I); 4500 if (hasIrregularType(ScalarTy, DL)) 4501 return false; 4502 4503 // If the group involves a non-integral pointer, we may not be able to 4504 // losslessly cast all values to a common type. 4505 unsigned InterleaveFactor = Group->getFactor(); 4506 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); 4507 for (unsigned i = 0; i < InterleaveFactor; i++) { 4508 Instruction *Member = Group->getMember(i); 4509 if (!Member) 4510 continue; 4511 auto *MemberTy = getLoadStoreType(Member); 4512 bool MemberNI = DL.isNonIntegralPointerType(MemberTy); 4513 // Don't coerce non-integral pointers to integers or vice versa. 4514 if (MemberNI != ScalarNI) { 4515 // TODO: Consider adding special nullptr value case here 4516 return false; 4517 } else if (MemberNI && ScalarNI && 4518 ScalarTy->getPointerAddressSpace() != 4519 MemberTy->getPointerAddressSpace()) { 4520 return false; 4521 } 4522 } 4523 4524 // Check if masking is required. 4525 // A Group may need masking for one of two reasons: it resides in a block that 4526 // needs predication, or it was decided to use masking to deal with gaps 4527 // (either a gap at the end of a load-access that may result in a speculative 4528 // load, or any gaps in a store-access). 
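  // For example (illustrative): a load group with a gap at the end would read
  // past the last member in the final iteration, so it needs either a scalar
  // epilogue or an epilog mask, and a store group with missing members must
  // be masked so that the gaps are not written.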
4529 bool PredicatedAccessRequiresMasking = 4530 blockNeedsPredicationForAnyReason(I->getParent()) && 4531 Legal->isMaskRequired(I); 4532 bool LoadAccessWithGapsRequiresEpilogMasking = 4533 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4534 !isScalarEpilogueAllowed(); 4535 bool StoreAccessWithGapsRequiresMasking = 4536 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4537 if (!PredicatedAccessRequiresMasking && 4538 !LoadAccessWithGapsRequiresEpilogMasking && 4539 !StoreAccessWithGapsRequiresMasking) 4540 return true; 4541 4542 // If masked interleaving is required, we expect that the user/target had 4543 // enabled it, because otherwise it either wouldn't have been created or 4544 // it should have been invalidated by the CostModel. 4545 assert(useMaskedInterleavedAccesses(TTI) && 4546 "Masked interleave-groups for predicated accesses are not enabled."); 4547 4548 if (Group->isReverse()) 4549 return false; 4550 4551 auto *Ty = getLoadStoreType(I); 4552 const Align Alignment = getLoadStoreAlignment(I); 4553 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4554 : TTI.isLegalMaskedStore(Ty, Alignment); 4555 } 4556 4557 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4558 Instruction *I, ElementCount VF) { 4559 // Get and ensure we have a valid memory instruction. 4560 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4561 4562 auto *Ptr = getLoadStorePointerOperand(I); 4563 auto *ScalarTy = getLoadStoreType(I); 4564 4565 // In order to be widened, the pointer should be consecutive, first of all. 4566 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4567 return false; 4568 4569 // If the instruction is a store located in a predicated block, it will be 4570 // scalarized. 4571 if (isScalarWithPredication(I, VF)) 4572 return false; 4573 4574 // If the instruction's allocated size doesn't equal it's type size, it 4575 // requires padding and will be scalarized. 4576 auto &DL = I->getModule()->getDataLayout(); 4577 if (hasIrregularType(ScalarTy, DL)) 4578 return false; 4579 4580 return true; 4581 } 4582 4583 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4584 // We should not collect Uniforms more than once per VF. Right now, 4585 // this function is called from collectUniformsAndScalars(), which 4586 // already does this check. Collecting Uniforms for VF=1 does not make any 4587 // sense. 4588 4589 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4590 "This function should not be visited twice for the same VF"); 4591 4592 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4593 // not analyze again. Uniforms.count(VF) will return 1. 4594 Uniforms[VF].clear(); 4595 4596 // We now know that the loop is vectorizable! 4597 // Collect instructions inside the loop that will remain uniform after 4598 // vectorization. 4599 4600 // Global values, params and instructions outside of current loop are out of 4601 // scope. 4602 auto isOutOfScope = [&](Value *V) -> bool { 4603 Instruction *I = dyn_cast<Instruction>(V); 4604 return (!I || !TheLoop->contains(I)); 4605 }; 4606 4607 // Worklist containing uniform instructions demanding lane 0. 4608 SetVector<Instruction *> Worklist; 4609 BasicBlock *Latch = TheLoop->getLoopLatch(); 4610 4611 // Add uniform instructions demanding lane 0 to the worklist. 
Instructions 4612 // that are scalar with predication must not be considered uniform after 4613 // vectorization, because that would create an erroneous replicating region 4614 // where only a single instance out of VF should be formed. 4615 // TODO: optimize such seldom cases if found important, see PR40816. 4616 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4617 if (isOutOfScope(I)) { 4618 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 4619 << *I << "\n"); 4620 return; 4621 } 4622 if (isScalarWithPredication(I, VF)) { 4623 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4624 << *I << "\n"); 4625 return; 4626 } 4627 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4628 Worklist.insert(I); 4629 }; 4630 4631 // Start with the conditional branch. If the branch condition is an 4632 // instruction contained in the loop that is only used by the branch, it is 4633 // uniform. 4634 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4635 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4636 addToWorklistIfAllowed(Cmp); 4637 4638 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 4639 InstWidening WideningDecision = getWideningDecision(I, VF); 4640 assert(WideningDecision != CM_Unknown && 4641 "Widening decision should be ready at this moment"); 4642 4643 // A uniform memory op is itself uniform. We exclude uniform stores 4644 // here as they demand the last lane, not the first one. 4645 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 4646 assert(WideningDecision == CM_Scalarize); 4647 return true; 4648 } 4649 4650 return (WideningDecision == CM_Widen || 4651 WideningDecision == CM_Widen_Reverse || 4652 WideningDecision == CM_Interleave); 4653 }; 4654 4655 4656 // Returns true if Ptr is the pointer operand of a memory access instruction 4657 // I, and I is known to not require scalarization. 4658 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4659 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4660 }; 4661 4662 // Holds a list of values which are known to have at least one uniform use. 4663 // Note that there may be other uses which aren't uniform. A "uniform use" 4664 // here is something which only demands lane 0 of the unrolled iterations; 4665 // it does not imply that all lanes produce the same value (e.g. this is not 4666 // the usual meaning of uniform) 4667 SetVector<Value *> HasUniformUse; 4668 4669 // Scan the loop for instructions which are either a) known to have only 4670 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 4671 for (auto *BB : TheLoop->blocks()) 4672 for (auto &I : *BB) { 4673 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 4674 switch (II->getIntrinsicID()) { 4675 case Intrinsic::sideeffect: 4676 case Intrinsic::experimental_noalias_scope_decl: 4677 case Intrinsic::assume: 4678 case Intrinsic::lifetime_start: 4679 case Intrinsic::lifetime_end: 4680 if (TheLoop->hasLoopInvariantOperands(&I)) 4681 addToWorklistIfAllowed(&I); 4682 break; 4683 default: 4684 break; 4685 } 4686 } 4687 4688 // ExtractValue instructions must be uniform, because the operands are 4689 // known to be loop-invariant. 
4690 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 4691 assert(isOutOfScope(EVI->getAggregateOperand()) && 4692 "Expected aggregate value to be loop invariant"); 4693 addToWorklistIfAllowed(EVI); 4694 continue; 4695 } 4696 4697 // If there's no pointer operand, there's nothing to do. 4698 auto *Ptr = getLoadStorePointerOperand(&I); 4699 if (!Ptr) 4700 continue; 4701 4702 // A uniform memory op is itself uniform. We exclude uniform stores 4703 // here as they demand the last lane, not the first one. 4704 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 4705 addToWorklistIfAllowed(&I); 4706 4707 if (isUniformDecision(&I, VF)) { 4708 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 4709 HasUniformUse.insert(Ptr); 4710 } 4711 } 4712 4713 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 4714 // demanding) users. Since loops are assumed to be in LCSSA form, this 4715 // disallows uses outside the loop as well. 4716 for (auto *V : HasUniformUse) { 4717 if (isOutOfScope(V)) 4718 continue; 4719 auto *I = cast<Instruction>(V); 4720 auto UsersAreMemAccesses = 4721 llvm::all_of(I->users(), [&](User *U) -> bool { 4722 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 4723 }); 4724 if (UsersAreMemAccesses) 4725 addToWorklistIfAllowed(I); 4726 } 4727 4728 // Expand Worklist in topological order: whenever a new instruction 4729 // is added , its users should be already inside Worklist. It ensures 4730 // a uniform instruction will only be used by uniform instructions. 4731 unsigned idx = 0; 4732 while (idx != Worklist.size()) { 4733 Instruction *I = Worklist[idx++]; 4734 4735 for (auto OV : I->operand_values()) { 4736 // isOutOfScope operands cannot be uniform instructions. 4737 if (isOutOfScope(OV)) 4738 continue; 4739 // First order recurrence Phi's should typically be considered 4740 // non-uniform. 4741 auto *OP = dyn_cast<PHINode>(OV); 4742 if (OP && Legal->isFirstOrderRecurrence(OP)) 4743 continue; 4744 // If all the users of the operand are uniform, then add the 4745 // operand into the uniform worklist. 4746 auto *OI = cast<Instruction>(OV); 4747 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4748 auto *J = cast<Instruction>(U); 4749 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 4750 })) 4751 addToWorklistIfAllowed(OI); 4752 } 4753 } 4754 4755 // For an instruction to be added into Worklist above, all its users inside 4756 // the loop should also be in Worklist. However, this condition cannot be 4757 // true for phi nodes that form a cyclic dependence. We must process phi 4758 // nodes separately. An induction variable will remain uniform if all users 4759 // of the induction variable and induction variable update remain uniform. 4760 // The code below handles both pointer and non-pointer induction variables. 4761 for (auto &Induction : Legal->getInductionVars()) { 4762 auto *Ind = Induction.first; 4763 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4764 4765 // Determine if all users of the induction variable are uniform after 4766 // vectorization. 4767 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4768 auto *I = cast<Instruction>(U); 4769 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4770 isVectorizedMemAccessUse(I, Ind); 4771 }); 4772 if (!UniformInd) 4773 continue; 4774 4775 // Determine if all users of the induction variable update instruction are 4776 // uniform after vectorization. 
4777 auto UniformIndUpdate = 4778 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4779 auto *I = cast<Instruction>(U); 4780 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4781 isVectorizedMemAccessUse(I, IndUpdate); 4782 }); 4783 if (!UniformIndUpdate) 4784 continue; 4785 4786 // The induction variable and its update instruction will remain uniform. 4787 addToWorklistIfAllowed(Ind); 4788 addToWorklistIfAllowed(IndUpdate); 4789 } 4790 4791 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4792 } 4793 4794 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4795 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4796 4797 if (Legal->getRuntimePointerChecking()->Need) { 4798 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4799 "runtime pointer checks needed. Enable vectorization of this " 4800 "loop with '#pragma clang loop vectorize(enable)' when " 4801 "compiling with -Os/-Oz", 4802 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4803 return true; 4804 } 4805 4806 if (!PSE.getPredicate().isAlwaysTrue()) { 4807 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4808 "runtime SCEV checks needed. Enable vectorization of this " 4809 "loop with '#pragma clang loop vectorize(enable)' when " 4810 "compiling with -Os/-Oz", 4811 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4812 return true; 4813 } 4814 4815 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4816 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4817 reportVectorizationFailure("Runtime stride check for small trip count", 4818 "runtime stride == 1 checks needed. Enable vectorization of " 4819 "this loop without such check by compiling with -Os/-Oz", 4820 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4821 return true; 4822 } 4823 4824 return false; 4825 } 4826 4827 ElementCount 4828 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 4829 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 4830 return ElementCount::getScalable(0); 4831 4832 if (Hints->isScalableVectorizationDisabled()) { 4833 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 4834 "ScalableVectorizationDisabled", ORE, TheLoop); 4835 return ElementCount::getScalable(0); 4836 } 4837 4838 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 4839 4840 auto MaxScalableVF = ElementCount::getScalable( 4841 std::numeric_limits<ElementCount::ScalarTy>::max()); 4842 4843 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 4844 // FIXME: While for scalable vectors this is currently sufficient, this should 4845 // be replaced by a more detailed mechanism that filters out specific VFs, 4846 // instead of invalidating vectorization for a whole set of VFs based on the 4847 // MaxVF. 4848 4849 // Disable scalable vectorization if the loop contains unsupported reductions. 4850 if (!canVectorizeReductions(MaxScalableVF)) { 4851 reportVectorizationInfo( 4852 "Scalable vectorization not supported for the reduction " 4853 "operations found in this loop.", 4854 "ScalableVFUnfeasible", ORE, TheLoop); 4855 return ElementCount::getScalable(0); 4856 } 4857 4858 // Disable scalable vectorization if the loop contains any instructions 4859 // with element types not supported for scalable vectors. 
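  // For example (illustrative): a loop computing on an element type that the
  // target cannot place in a scalable vector register (e.g. an oversized
  // floating-point type on a typical SVE target) is rejected here, instead of
  // producing an unusable scalable VF later on.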
4860 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 4861 return !Ty->isVoidTy() && 4862 !this->TTI.isElementTypeLegalForScalableVector(Ty); 4863 })) { 4864 reportVectorizationInfo("Scalable vectorization is not supported " 4865 "for all element types found in this loop.", 4866 "ScalableVFUnfeasible", ORE, TheLoop); 4867 return ElementCount::getScalable(0); 4868 } 4869 4870 if (Legal->isSafeForAnyVectorWidth()) 4871 return MaxScalableVF; 4872 4873 // Limit MaxScalableVF by the maximum safe dependence distance. 4874 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 4875 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 4876 MaxVScale = 4877 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 4878 MaxScalableVF = ElementCount::getScalable( 4879 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 4880 if (!MaxScalableVF) 4881 reportVectorizationInfo( 4882 "Max legal vector width too small, scalable vectorization " 4883 "unfeasible.", 4884 "ScalableVFUnfeasible", ORE, TheLoop); 4885 4886 return MaxScalableVF; 4887 } 4888 4889 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 4890 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 4891 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4892 unsigned SmallestType, WidestType; 4893 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4894 4895 // Get the maximum safe dependence distance in bits computed by LAA. 4896 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4897 // the memory accesses that is most restrictive (involved in the smallest 4898 // dependence distance). 4899 unsigned MaxSafeElements = 4900 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 4901 4902 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 4903 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 4904 4905 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 4906 << ".\n"); 4907 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 4908 << ".\n"); 4909 4910 // First analyze the UserVF, fall back if the UserVF should be ignored. 4911 if (UserVF) { 4912 auto MaxSafeUserVF = 4913 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 4914 4915 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 4916 // If `VF=vscale x N` is safe, then so is `VF=N` 4917 if (UserVF.isScalable()) 4918 return FixedScalableVFPair( 4919 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 4920 else 4921 return UserVF; 4922 } 4923 4924 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 4925 4926 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 4927 // is better to ignore the hint and let the compiler choose a suitable VF. 
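    // For example (illustrative): a requested fixed VF of 16 against a
    // maximum safe fixed VF of 8 is clamped to 8 below, whereas an unsafe
    // scalable request such as vscale x 8 is ignored entirely and the
    // compiler picks a suitable factor itself.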
4928 if (!UserVF.isScalable()) { 4929 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4930 << " is unsafe, clamping to max safe VF=" 4931 << MaxSafeFixedVF << ".\n"); 4932 ORE->emit([&]() { 4933 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4934 TheLoop->getStartLoc(), 4935 TheLoop->getHeader()) 4936 << "User-specified vectorization factor " 4937 << ore::NV("UserVectorizationFactor", UserVF) 4938 << " is unsafe, clamping to maximum safe vectorization factor " 4939 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 4940 }); 4941 return MaxSafeFixedVF; 4942 } 4943 4944 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 4945 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4946 << " is ignored because scalable vectors are not " 4947 "available.\n"); 4948 ORE->emit([&]() { 4949 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4950 TheLoop->getStartLoc(), 4951 TheLoop->getHeader()) 4952 << "User-specified vectorization factor " 4953 << ore::NV("UserVectorizationFactor", UserVF) 4954 << " is ignored because the target does not support scalable " 4955 "vectors. The compiler will pick a more suitable value."; 4956 }); 4957 } else { 4958 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4959 << " is unsafe. Ignoring scalable UserVF.\n"); 4960 ORE->emit([&]() { 4961 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4962 TheLoop->getStartLoc(), 4963 TheLoop->getHeader()) 4964 << "User-specified vectorization factor " 4965 << ore::NV("UserVectorizationFactor", UserVF) 4966 << " is unsafe. Ignoring the hint to let the compiler pick a " 4967 "more suitable value."; 4968 }); 4969 } 4970 } 4971 4972 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 4973 << " / " << WidestType << " bits.\n"); 4974 4975 FixedScalableVFPair Result(ElementCount::getFixed(1), 4976 ElementCount::getScalable(0)); 4977 if (auto MaxVF = 4978 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 4979 MaxSafeFixedVF, FoldTailByMasking)) 4980 Result.FixedVF = MaxVF; 4981 4982 if (auto MaxVF = 4983 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 4984 MaxSafeScalableVF, FoldTailByMasking)) 4985 if (MaxVF.isScalable()) { 4986 Result.ScalableVF = MaxVF; 4987 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 4988 << "\n"); 4989 } 4990 4991 return Result; 4992 } 4993 4994 FixedScalableVFPair 4995 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 4996 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4997 // TODO: It may by useful to do since it's still likely to be dynamically 4998 // uniform if the target can skip. 4999 reportVectorizationFailure( 5000 "Not inserting runtime ptr check for divergent target", 5001 "runtime pointer checks needed. 
Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(TC, UserVF, false);
  case CM_ScalarEpilogueNotAllowedUsePredicate:
    LLVM_FALLTHROUGH;
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return FixedScalableVFPair::getNone();

    break;
  }

  // The only loops we can vectorize without a scalar epilogue are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration. This will
  // require a lane mask which varies through the vector loop body. (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fall back to a vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF, false);
    }
    return FixedScalableVFPair::getNone();
  }

  // Now try the tail folding.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we chose.
  // FIXME: The condition below pessimises the case for fixed-width vectors,
  // when scalable VFs are also candidates for vectorization.
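  // For example (illustrative): with a trip count known to be 64, a maximum
  // fixed VF of 8 and a user interleave count of 2, 64 urem (8 * 2) == 0, so
  // no scalar tail remains and tail folding is unnecessary.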
5077 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5078 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5079 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5080 "MaxFixedVF must be a power of 2"); 5081 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5082 : MaxFixedVF.getFixedValue(); 5083 ScalarEvolution *SE = PSE.getSE(); 5084 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5085 const SCEV *ExitCount = SE->getAddExpr( 5086 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5087 const SCEV *Rem = SE->getURemExpr( 5088 SE->applyLoopGuards(ExitCount, TheLoop), 5089 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5090 if (Rem->isZero()) { 5091 // Accept MaxFixedVF if we do not have a tail. 5092 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5093 return MaxFactors; 5094 } 5095 } 5096 5097 // If we don't know the precise trip count, or if the trip count that we 5098 // found modulo the vectorization factor is not zero, try to fold the tail 5099 // by masking. 5100 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5101 if (Legal->prepareToFoldTailByMasking()) { 5102 FoldTailByMasking = true; 5103 return MaxFactors; 5104 } 5105 5106 // If there was a tail-folding hint/switch, but we can't fold the tail by 5107 // masking, fallback to a vectorization with a scalar epilogue. 5108 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5109 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5110 "scalar epilogue instead.\n"); 5111 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5112 return MaxFactors; 5113 } 5114 5115 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5116 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5117 return FixedScalableVFPair::getNone(); 5118 } 5119 5120 if (TC == 0) { 5121 reportVectorizationFailure( 5122 "Unable to calculate the loop count due to complex control flow", 5123 "unable to calculate the loop count due to complex control flow", 5124 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5125 return FixedScalableVFPair::getNone(); 5126 } 5127 5128 reportVectorizationFailure( 5129 "Cannot optimize for size and vectorize at the same time.", 5130 "cannot optimize for size and vectorize at the same time. " 5131 "Enable vectorization of this loop with '#pragma clang loop " 5132 "vectorize(enable)' when compiling with -Os/-Oz", 5133 "NoTailLoopWithOptForSize", ORE, TheLoop); 5134 return FixedScalableVFPair::getNone(); 5135 } 5136 5137 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5138 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5139 ElementCount MaxSafeVF, bool FoldTailByMasking) { 5140 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5141 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5142 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5143 : TargetTransformInfo::RGK_FixedWidthVector); 5144 5145 // Convenience function to return the minimum of two ElementCounts. 5146 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5147 assert((LHS.isScalable() == RHS.isScalable()) && 5148 "Scalable flags must match"); 5149 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5150 }; 5151 5152 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5153 // Note that both WidestRegister and WidestType may not be a powers of 2. 
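  // For example (illustrative): a 128-bit widest register and a widest loop
  // type of 32 bits give PowerOf2Floor(128 / 32) = 4 lanes, which is then
  // clamped against MaxSafeVF below.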
5154 auto MaxVectorElementCount = ElementCount::get( 5155 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5156 ComputeScalableMaxVF); 5157 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5158 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5159 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5160 5161 if (!MaxVectorElementCount) { 5162 LLVM_DEBUG(dbgs() << "LV: The target has no " 5163 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5164 << " vector registers.\n"); 5165 return ElementCount::getFixed(1); 5166 } 5167 5168 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5169 if (ConstTripCount && 5170 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5171 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5172 // If loop trip count (TC) is known at compile time there is no point in 5173 // choosing VF greater than TC (as done in the loop below). Select maximum 5174 // power of two which doesn't exceed TC. 5175 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5176 // when the TC is less than or equal to the known number of lanes. 5177 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5178 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5179 "exceeding the constant trip count: " 5180 << ClampedConstTripCount << "\n"); 5181 return ElementCount::getFixed(ClampedConstTripCount); 5182 } 5183 5184 TargetTransformInfo::RegisterKind RegKind = 5185 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5186 : TargetTransformInfo::RGK_FixedWidthVector; 5187 ElementCount MaxVF = MaxVectorElementCount; 5188 if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && 5189 TTI.shouldMaximizeVectorBandwidth(RegKind))) { 5190 auto MaxVectorElementCountMaxBW = ElementCount::get( 5191 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5192 ComputeScalableMaxVF); 5193 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5194 5195 // Collect all viable vectorization factors larger than the default MaxVF 5196 // (i.e. MaxVectorElementCount). 5197 SmallVector<ElementCount, 8> VFs; 5198 for (ElementCount VS = MaxVectorElementCount * 2; 5199 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5200 VFs.push_back(VS); 5201 5202 // For each VF calculate its register usage. 5203 auto RUs = calculateRegisterUsage(VFs); 5204 5205 // Select the largest VF which doesn't require more registers than existing 5206 // ones. 5207 for (int i = RUs.size() - 1; i >= 0; --i) { 5208 bool Selected = true; 5209 for (auto &pair : RUs[i].MaxLocalUsers) { 5210 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5211 if (pair.second > TargetNumRegisters) 5212 Selected = false; 5213 } 5214 if (Selected) { 5215 MaxVF = VFs[i]; 5216 break; 5217 } 5218 } 5219 if (ElementCount MinVF = 5220 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5221 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5222 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5223 << ") with target's minimum: " << MinVF << '\n'); 5224 MaxVF = MinVF; 5225 } 5226 } 5227 5228 // Invalidate any widening decisions we might have made, in case the loop 5229 // requires prediction (decided later), but we have already made some 5230 // load/store widening decisions. 
5231 invalidateCostModelingDecisions(); 5232 } 5233 return MaxVF; 5234 } 5235 5236 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { 5237 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5238 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5239 auto Min = Attr.getVScaleRangeMin(); 5240 auto Max = Attr.getVScaleRangeMax(); 5241 if (Max && Min == Max) 5242 return Max; 5243 } 5244 5245 return TTI.getVScaleForTuning(); 5246 } 5247 5248 bool LoopVectorizationCostModel::isMoreProfitable( 5249 const VectorizationFactor &A, const VectorizationFactor &B) const { 5250 InstructionCost CostA = A.Cost; 5251 InstructionCost CostB = B.Cost; 5252 5253 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5254 5255 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5256 MaxTripCount) { 5257 // If we are folding the tail and the trip count is a known (possibly small) 5258 // constant, the trip count will be rounded up to an integer number of 5259 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5260 // which we compare directly. When not folding the tail, the total cost will 5261 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5262 // approximated with the per-lane cost below instead of using the tripcount 5263 // as here. 5264 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5265 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5266 return RTCostA < RTCostB; 5267 } 5268 5269 // Improve estimate for the vector width if it is scalable. 5270 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5271 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5272 if (Optional<unsigned> VScale = getVScaleForTuning()) { 5273 if (A.Width.isScalable()) 5274 EstimatedWidthA *= VScale.getValue(); 5275 if (B.Width.isScalable()) 5276 EstimatedWidthB *= VScale.getValue(); 5277 } 5278 5279 // Assume vscale may be larger than 1 (or the value being tuned for), 5280 // so that scalable vectorization is slightly favorable over fixed-width 5281 // vectorization. 5282 if (A.Width.isScalable() && !B.Width.isScalable()) 5283 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5284 5285 // To avoid the need for FP division: 5286 // (CostA / A.Width) < (CostB / B.Width) 5287 // <=> (CostA * B.Width) < (CostB * A.Width) 5288 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5289 } 5290 5291 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5292 const ElementCountSet &VFCandidates) { 5293 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5294 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5295 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5296 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5297 "Expected Scalar VF to be a candidate"); 5298 5299 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 5300 VectorizationFactor ChosenFactor = ScalarCost; 5301 5302 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5303 if (ForceVectorization && VFCandidates.size() > 1) { 5304 // Ignore scalar width, because the user explicitly wants vectorization. 5305 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5306 // evaluation. 
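    // In effect (illustrative): priming the cost with the maximum value means
    // any vector candidate with a finite cost will compare as more
    // profitable, so the scalar factor is only kept when no vector VF is
    // viable at all.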
5307 ChosenFactor.Cost = InstructionCost::getMax(); 5308 } 5309 5310 SmallVector<InstructionVFPair> InvalidCosts; 5311 for (const auto &i : VFCandidates) { 5312 // The cost for scalar VF=1 is already calculated, so ignore it. 5313 if (i.isScalar()) 5314 continue; 5315 5316 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5317 VectorizationFactor Candidate(i, C.first); 5318 5319 #ifndef NDEBUG 5320 unsigned AssumedMinimumVscale = 1; 5321 if (Optional<unsigned> VScale = getVScaleForTuning()) 5322 AssumedMinimumVscale = VScale.getValue(); 5323 unsigned Width = 5324 Candidate.Width.isScalable() 5325 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5326 : Candidate.Width.getFixedValue(); 5327 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5328 << " costs: " << (Candidate.Cost / Width)); 5329 if (i.isScalable()) 5330 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5331 << AssumedMinimumVscale << ")"); 5332 LLVM_DEBUG(dbgs() << ".\n"); 5333 #endif 5334 5335 if (!C.second && !ForceVectorization) { 5336 LLVM_DEBUG( 5337 dbgs() << "LV: Not considering vector loop of width " << i 5338 << " because it will not generate any vector instructions.\n"); 5339 continue; 5340 } 5341 5342 // If profitable add it to ProfitableVF list. 5343 if (isMoreProfitable(Candidate, ScalarCost)) 5344 ProfitableVFs.push_back(Candidate); 5345 5346 if (isMoreProfitable(Candidate, ChosenFactor)) 5347 ChosenFactor = Candidate; 5348 } 5349 5350 // Emit a report of VFs with invalid costs in the loop. 5351 if (!InvalidCosts.empty()) { 5352 // Group the remarks per instruction, keeping the instruction order from 5353 // InvalidCosts. 5354 std::map<Instruction *, unsigned> Numbering; 5355 unsigned I = 0; 5356 for (auto &Pair : InvalidCosts) 5357 if (!Numbering.count(Pair.first)) 5358 Numbering[Pair.first] = I++; 5359 5360 // Sort the list, first on instruction(number) then on VF. 5361 llvm::sort(InvalidCosts, 5362 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5363 if (Numbering[A.first] != Numbering[B.first]) 5364 return Numbering[A.first] < Numbering[B.first]; 5365 ElementCountComparator ECC; 5366 return ECC(A.second, B.second); 5367 }); 5368 5369 // For a list of ordered instruction-vf pairs: 5370 // [(load, vf1), (load, vf2), (store, vf1)] 5371 // Group the instructions together to emit separate remarks for: 5372 // load (vf1, vf2) 5373 // store (vf1) 5374 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5375 auto Subset = ArrayRef<InstructionVFPair>(); 5376 do { 5377 if (Subset.empty()) 5378 Subset = Tail.take_front(1); 5379 5380 Instruction *I = Subset.front().first; 5381 5382 // If the next instruction is different, or if there are no other pairs, 5383 // emit a remark for the collated subset. e.g. 5384 // [(load, vf1), (load, vf2))] 5385 // to emit: 5386 // remark: invalid costs for 'load' at VF=(vf, vf2) 5387 if (Subset == Tail || Tail[Subset.size()].first != I) { 5388 std::string OutString; 5389 raw_string_ostream OS(OutString); 5390 assert(!Subset.empty() && "Unexpected empty range"); 5391 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5392 for (auto &Pair : Subset) 5393 OS << (Pair.second == Subset.front().second ? 
"" : ", ") 5394 << Pair.second; 5395 OS << "):"; 5396 if (auto *CI = dyn_cast<CallInst>(I)) 5397 OS << " call to " << CI->getCalledFunction()->getName(); 5398 else 5399 OS << " " << I->getOpcodeName(); 5400 OS.flush(); 5401 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5402 Tail = Tail.drop_front(Subset.size()); 5403 Subset = {}; 5404 } else 5405 // Grow the subset by one element 5406 Subset = Tail.take_front(Subset.size() + 1); 5407 } while (!Tail.empty()); 5408 } 5409 5410 if (!EnableCondStoresVectorization && NumPredStores) { 5411 reportVectorizationFailure("There are conditional stores.", 5412 "store that is conditionally executed prevents vectorization", 5413 "ConditionalStore", ORE, TheLoop); 5414 ChosenFactor = ScalarCost; 5415 } 5416 5417 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5418 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 5419 << "LV: Vectorization seems to be not beneficial, " 5420 << "but was forced by a user.\n"); 5421 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5422 return ChosenFactor; 5423 } 5424 5425 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5426 const Loop &L, ElementCount VF) const { 5427 // Cross iteration phis such as reductions need special handling and are 5428 // currently unsupported. 5429 if (any_of(L.getHeader()->phis(), 5430 [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); })) 5431 return false; 5432 5433 // Phis with uses outside of the loop require special handling and are 5434 // currently unsupported. 5435 for (auto &Entry : Legal->getInductionVars()) { 5436 // Look for uses of the value of the induction at the last iteration. 5437 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5438 for (User *U : PostInc->users()) 5439 if (!L.contains(cast<Instruction>(U))) 5440 return false; 5441 // Look for uses of penultimate value of the induction. 5442 for (User *U : Entry.first->users()) 5443 if (!L.contains(cast<Instruction>(U))) 5444 return false; 5445 } 5446 5447 // Induction variables that are widened require special handling that is 5448 // currently not supported. 5449 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5450 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5451 this->isProfitableToScalarize(Entry.first, VF)); 5452 })) 5453 return false; 5454 5455 // Epilogue vectorization code has not been auditted to ensure it handles 5456 // non-latch exits properly. It may be fine, but it needs auditted and 5457 // tested. 5458 if (L.getExitingBlock() != L.getLoopLatch()) 5459 return false; 5460 5461 return true; 5462 } 5463 5464 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5465 const ElementCount VF) const { 5466 // FIXME: We need a much better cost-model to take different parameters such 5467 // as register pressure, code size increase and cost of extra branches into 5468 // account. For now we apply a very crude heuristic and only consider loops 5469 // with vectorization factors larger than a certain value. 5470 // We also consider epilogue vectorization unprofitable for targets that don't 5471 // consider interleaving beneficial (eg. MVE). 5472 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5473 return false; 5474 // FIXME: We should consider changing the threshold for scalable 5475 // vectors to take VScaleForTuning into account. 
5476 if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF) 5477 return true; 5478 return false; 5479 } 5480 5481 VectorizationFactor 5482 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5483 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5484 VectorizationFactor Result = VectorizationFactor::Disabled(); 5485 if (!EnableEpilogueVectorization) { 5486 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5487 return Result; 5488 } 5489 5490 if (!isScalarEpilogueAllowed()) { 5491 LLVM_DEBUG( 5492 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5493 "allowed.\n";); 5494 return Result; 5495 } 5496 5497 // Not really a cost consideration, but check for unsupported cases here to 5498 // simplify the logic. 5499 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5500 LLVM_DEBUG( 5501 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5502 "not a supported candidate.\n";); 5503 return Result; 5504 } 5505 5506 if (EpilogueVectorizationForceVF > 1) { 5507 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5508 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5509 if (LVP.hasPlanWithVF(ForcedEC)) 5510 return {ForcedEC, 0}; 5511 else { 5512 LLVM_DEBUG( 5513 dbgs() 5514 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5515 return Result; 5516 } 5517 } 5518 5519 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5520 TheLoop->getHeader()->getParent()->hasMinSize()) { 5521 LLVM_DEBUG( 5522 dbgs() 5523 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5524 return Result; 5525 } 5526 5527 if (!isEpilogueVectorizationProfitable(MainLoopVF)) { 5528 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5529 "this loop\n"); 5530 return Result; 5531 } 5532 5533 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 5534 // the main loop handles 8 lanes per iteration. We could still benefit from 5535 // vectorizing the epilogue loop with VF=4. 5536 ElementCount EstimatedRuntimeVF = MainLoopVF; 5537 if (MainLoopVF.isScalable()) { 5538 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5539 if (Optional<unsigned> VScale = getVScaleForTuning()) 5540 EstimatedRuntimeVF *= VScale.getValue(); 5541 } 5542 5543 for (auto &NextVF : ProfitableVFs) 5544 if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5545 ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || 5546 ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && 5547 (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && 5548 LVP.hasPlanWithVF(NextVF.Width)) 5549 Result = NextVF; 5550 5551 if (Result != VectorizationFactor::Disabled()) 5552 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5553 << Result.Width << "\n";); 5554 return Result; 5555 } 5556 5557 std::pair<unsigned, unsigned> 5558 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5559 unsigned MinWidth = -1U; 5560 unsigned MaxWidth = 8; 5561 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5562 // For in-loop reductions, no element types are added to ElementTypesInLoop 5563 // if there are no loads/stores in the loop. In this case, check through the 5564 // reduction variables to determine the maximum width. 
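// Illustrative example: a loop that only accumulates i16 values derived from
// the induction variable has no loads or stores, so ElementTypesInLoop stays
// empty; scanning the recurrence descriptors below recovers the 16-bit width
// that is used to bound the vectorization factor.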
5565 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5566 // Reset MaxWidth so that we can find the smallest type used by recurrences 5567 // in the loop. 5568 MaxWidth = -1U; 5569 for (auto &PhiDescriptorPair : Legal->getReductionVars()) { 5570 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5571 // When finding the min width used by the recurrence we need to account 5572 // for casts on the input operands of the recurrence. 5573 MaxWidth = std::min<unsigned>( 5574 MaxWidth, std::min<unsigned>( 5575 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5576 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5577 } 5578 } else { 5579 for (Type *T : ElementTypesInLoop) { 5580 MinWidth = std::min<unsigned>( 5581 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5582 MaxWidth = std::max<unsigned>( 5583 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5584 } 5585 } 5586 return {MinWidth, MaxWidth}; 5587 } 5588 5589 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5590 ElementTypesInLoop.clear(); 5591 // For each block. 5592 for (BasicBlock *BB : TheLoop->blocks()) { 5593 // For each instruction in the loop. 5594 for (Instruction &I : BB->instructionsWithoutDebug()) { 5595 Type *T = I.getType(); 5596 5597 // Skip ignored values. 5598 if (ValuesToIgnore.count(&I)) 5599 continue; 5600 5601 // Only examine Loads, Stores and PHINodes. 5602 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5603 continue; 5604 5605 // Examine PHI nodes that are reduction variables. Update the type to 5606 // account for the recurrence type. 5607 if (auto *PN = dyn_cast<PHINode>(&I)) { 5608 if (!Legal->isReductionVariable(PN)) 5609 continue; 5610 const RecurrenceDescriptor &RdxDesc = 5611 Legal->getReductionVars().find(PN)->second; 5612 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5613 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5614 RdxDesc.getRecurrenceType(), 5615 TargetTransformInfo::ReductionFlags())) 5616 continue; 5617 T = RdxDesc.getRecurrenceType(); 5618 } 5619 5620 // Examine the stored values. 5621 if (auto *ST = dyn_cast<StoreInst>(&I)) 5622 T = ST->getValueOperand()->getType(); 5623 5624 assert(T->isSized() && 5625 "Expected the load/store/recurrence type to be sized"); 5626 5627 ElementTypesInLoop.insert(T); 5628 } 5629 } 5630 } 5631 5632 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5633 unsigned LoopCost) { 5634 // -- The interleave heuristics -- 5635 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5636 // There are many micro-architectural considerations that we can't predict 5637 // at this level. For example, frontend pressure (on decode or fetch) due to 5638 // code size, or the number and capabilities of the execution ports. 5639 // 5640 // We use the following heuristics to select the interleave count: 5641 // 1. If the code has reductions, then we interleave to break the cross 5642 // iteration dependency. 5643 // 2. If the loop is really small, then we interleave to reduce the loop 5644 // overhead. 5645 // 3. We don't interleave if we think that we will spill registers to memory 5646 // due to the increased register pressure. 5647 5648 if (!isScalarEpilogueAllowed()) 5649 return 1; 5650 5651 // We used the distance for the interleave count. 
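// Illustrative example: if the maximum safe dependence distance is 16 bytes
// and VF was chosen as 4 x i32 (16 bytes), interleaving by 2 would touch 32
// bytes per vector iteration and could violate the dependence, so we
// conservatively do not interleave in this case.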
5652 if (Legal->getMaxSafeDepDistBytes() != -1U) 5653 return 1; 5654 5655 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5656 const bool HasReductions = !Legal->getReductionVars().empty(); 5657 // Do not interleave loops with a relatively small known or estimated trip 5658 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5659 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5660 // because with the above conditions interleaving can expose ILP and break 5661 // cross iteration dependences for reductions. 5662 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5663 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5664 return 1; 5665 5666 // If we did not calculate the cost for VF (because the user selected the VF) 5667 // then we calculate the cost of VF here. 5668 if (LoopCost == 0) { 5669 InstructionCost C = expectedCost(VF).first; 5670 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 5671 LoopCost = *C.getValue(); 5672 5673 // Loop body is free and there is no need for interleaving. 5674 if (LoopCost == 0) 5675 return 1; 5676 } 5677 5678 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5679 // We divide by these constants so assume that we have at least one 5680 // instruction that uses at least one register. 5681 for (auto& pair : R.MaxLocalUsers) { 5682 pair.second = std::max(pair.second, 1U); 5683 } 5684 5685 // We calculate the interleave count using the following formula. 5686 // Subtract the number of loop invariants from the number of available 5687 // registers. These registers are used by all of the interleaved instances. 5688 // Next, divide the remaining registers by the number of registers that is 5689 // required by the loop, in order to estimate how many parallel instances 5690 // fit without causing spills. All of this is rounded down if necessary to be 5691 // a power of two. We want power of two interleave count to simplify any 5692 // addressing operations or alignment considerations. 5693 // We also want power of two interleave counts to ensure that the induction 5694 // variable of the vector loop wraps to zero, when tail is folded by masking; 5695 // this currently happens when OptForSize, in which case IC is set to 1 above. 5696 unsigned IC = UINT_MAX; 5697 5698 for (auto& pair : R.MaxLocalUsers) { 5699 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5700 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5701 << " registers of " 5702 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5703 if (VF.isScalar()) { 5704 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5705 TargetNumRegisters = ForceTargetNumScalarRegs; 5706 } else { 5707 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5708 TargetNumRegisters = ForceTargetNumVectorRegs; 5709 } 5710 unsigned MaxLocalUsers = pair.second; 5711 unsigned LoopInvariantRegs = 0; 5712 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5713 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5714 5715 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5716 // Don't count the induction variable as interleaved. 5717 if (EnableIndVarRegisterHeur) { 5718 TmpIC = 5719 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5720 std::max(1U, (MaxLocalUsers - 1))); 5721 } 5722 5723 IC = std::min(IC, TmpIC); 5724 } 5725 5726 // Clamp the interleave ranges to reasonable counts. 
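// Illustrative example: if the target reports a maximum interleave factor of
// 8 but the estimated trip count is 16 with VF = 4, MaxInterleaveCount below
// becomes min(16 / 4, 8) = 4, so the interleaved body still executes at
// least once.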
5727 unsigned MaxInterleaveCount = 5728 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 5729 5730 // Check if the user has overridden the max. 5731 if (VF.isScalar()) { 5732 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5733 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5734 } else { 5735 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5736 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5737 } 5738 5739 // If trip count is known or estimated compile time constant, limit the 5740 // interleave count to be less than the trip count divided by VF, provided it 5741 // is at least 1. 5742 // 5743 // For scalable vectors we can't know if interleaving is beneficial. It may 5744 // not be beneficial for small loops if none of the lanes in the second vector 5745 // iterations is enabled. However, for larger loops, there is likely to be a 5746 // similar benefit as for fixed-width vectors. For now, we choose to leave 5747 // the InterleaveCount as if vscale is '1', although if some information about 5748 // the vector is known (e.g. min vector size), we can make a better decision. 5749 if (BestKnownTC) { 5750 MaxInterleaveCount = 5751 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5752 // Make sure MaxInterleaveCount is greater than 0. 5753 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 5754 } 5755 5756 assert(MaxInterleaveCount > 0 && 5757 "Maximum interleave count must be greater than 0"); 5758 5759 // Clamp the calculated IC to be between the 1 and the max interleave count 5760 // that the target and trip count allows. 5761 if (IC > MaxInterleaveCount) 5762 IC = MaxInterleaveCount; 5763 else 5764 // Make sure IC is greater than 0. 5765 IC = std::max(1u, IC); 5766 5767 assert(IC > 0 && "Interleave count must be greater than 0."); 5768 5769 // Interleave if we vectorized this loop and there is a reduction that could 5770 // benefit from interleaving. 5771 if (VF.isVector() && HasReductions) { 5772 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5773 return IC; 5774 } 5775 5776 // For any scalar loop that either requires runtime checks or predication we 5777 // are better off leaving this to the unroller. Note that if we've already 5778 // vectorized the loop we will have done the runtime check and so interleaving 5779 // won't require further checks. 5780 bool ScalarInterleavingRequiresPredication = 5781 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5782 return Legal->blockNeedsPredication(BB); 5783 })); 5784 bool ScalarInterleavingRequiresRuntimePointerCheck = 5785 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5786 5787 // We want to interleave small loops in order to reduce the loop overhead and 5788 // potentially expose ILP opportunities. 5789 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5790 << "LV: IC is " << IC << '\n' 5791 << "LV: VF is " << VF << '\n'); 5792 const bool AggressivelyInterleaveReductions = 5793 TTI.enableAggressiveInterleaving(HasReductions); 5794 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5795 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5796 // We assume that the cost overhead is 1 and we use the cost model 5797 // to estimate the cost of the loop and interleave until the cost of the 5798 // loop overhead is about 5% of the cost of the loop. 
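// Illustrative example (assuming the default SmallLoopCost of 20): for a
// loop body cost of 6, SmallIC = PowerOf2Floor(20 / 6) = 2, i.e. we only
// interleave far enough to keep the loop overhead around 5% of the total
// cost.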
5799 unsigned SmallIC = 5800 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5801 5802 // Interleave until store/load ports (estimated by max interleave count) are 5803 // saturated. 5804 unsigned NumStores = Legal->getNumStores(); 5805 unsigned NumLoads = Legal->getNumLoads(); 5806 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5807 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5808 5809 // There is little point in interleaving for reductions containing selects 5810 // and compares when VF=1 since it may just create more overhead than it's 5811 // worth for loops with small trip counts. This is because we still have to 5812 // do the final reduction after the loop. 5813 bool HasSelectCmpReductions = 5814 HasReductions && 5815 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5816 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5817 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 5818 RdxDesc.getRecurrenceKind()); 5819 }); 5820 if (HasSelectCmpReductions) { 5821 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 5822 return 1; 5823 } 5824 5825 // If we have a scalar reduction (vector reductions are already dealt with 5826 // by this point), we can increase the critical path length if the loop 5827 // we're interleaving is inside another loop. For tree-wise reductions 5828 // set the limit to 2, and for ordered reductions it's best to disable 5829 // interleaving entirely. 5830 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5831 bool HasOrderedReductions = 5832 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5833 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5834 return RdxDesc.isOrdered(); 5835 }); 5836 if (HasOrderedReductions) { 5837 LLVM_DEBUG( 5838 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 5839 return 1; 5840 } 5841 5842 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5843 SmallIC = std::min(SmallIC, F); 5844 StoresIC = std::min(StoresIC, F); 5845 LoadsIC = std::min(LoadsIC, F); 5846 } 5847 5848 if (EnableLoadStoreRuntimeInterleave && 5849 std::max(StoresIC, LoadsIC) > SmallIC) { 5850 LLVM_DEBUG( 5851 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5852 return std::max(StoresIC, LoadsIC); 5853 } 5854 5855 // If there are scalar reductions and TTI has enabled aggressive 5856 // interleaving for reductions, we will interleave to expose ILP. 5857 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 5858 AggressivelyInterleaveReductions) { 5859 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5860 // Interleave no less than SmallIC but not as aggressive as the normal IC 5861 // to satisfy the rare situation when resources are too limited. 5862 return std::max(IC / 2, SmallIC); 5863 } else { 5864 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5865 return SmallIC; 5866 } 5867 } 5868 5869 // Interleave if this is a large loop (small loops are already dealt with by 5870 // this point) that could benefit from interleaving. 
5871 if (AggressivelyInterleaveReductions) { 5872 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5873 return IC; 5874 } 5875 5876 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5877 return 1; 5878 } 5879 5880 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5881 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 5882 // This function calculates the register usage by measuring the highest number 5883 // of values that are alive at a single location. Obviously, this is a very 5884 // rough estimation. We scan the loop in topological order and 5885 // assign a number to each instruction. We use RPO to ensure that defs are 5886 // met before their users. We assume that each instruction that has in-loop 5887 // users starts an interval. We record every time that an in-loop value is 5888 // used, so we have a list of the first and last occurrences of each 5889 // instruction. Next, we transpose this data structure into a multi map that 5890 // holds the list of intervals that *end* at a specific location. This multi 5891 // map allows us to perform a linear search. We scan the instructions linearly 5892 // and record each time that a new interval starts, by placing it in a set. 5893 // If we find this value in the multi-map then we remove it from the set. 5894 // The max register usage is the maximum size of the set. 5895 // We also search for instructions that are defined outside the loop, but are 5896 // used inside the loop. We need this number separately from the max-interval 5897 // usage number because when we unroll, loop-invariant values do not take 5898 // more registers. 5899 LoopBlocksDFS DFS(TheLoop); 5900 DFS.perform(LI); 5901 5902 RegisterUsage RU; 5903 5904 // Each 'key' in the map opens a new interval. The values 5905 // of the map are the index of the 'last seen' usage of the 5906 // instruction that is the key. 5907 using IntervalMap = DenseMap<Instruction *, unsigned>; 5908 5909 // Maps an index to the instruction at that index. 5910 SmallVector<Instruction *, 64> IdxToInstr; 5911 // Marks the end of each interval. 5912 IntervalMap EndPoint; 5913 // Saves the set of instructions that are used in the loop. 5914 SmallPtrSet<Instruction *, 8> Ends; 5915 // Saves the list of values that are used in the loop but are 5916 // defined outside the loop, such as arguments and constants. 5917 SmallPtrSet<Value *, 8> LoopInvariants; 5918 5919 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5920 for (Instruction &I : BB->instructionsWithoutDebug()) { 5921 IdxToInstr.push_back(&I); 5922 5923 // Save the end location of each USE. 5924 for (Value *U : I.operands()) { 5925 auto *Instr = dyn_cast<Instruction>(U); 5926 5927 // Ignore non-instruction values such as arguments, constants, etc. 5928 if (!Instr) 5929 continue; 5930 5931 // If this instruction is outside the loop then record it and continue. 5932 if (!TheLoop->contains(Instr)) { 5933 LoopInvariants.insert(Instr); 5934 continue; 5935 } 5936 5937 // Overwrite previous end points. 5938 EndPoint[Instr] = IdxToInstr.size(); 5939 Ends.insert(Instr); 5940 } 5941 } 5942 } 5943 5944 // Saves the list of intervals that end with the index in 'key'. 5945 using InstrList = SmallVector<Instruction *, 2>; 5946 DenseMap<unsigned, InstrList> TransposeEnds; 5947 5948 // Transpose the EndPoints to a list of values that end at each index.
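// Illustrative example: if EndPoint is {%a -> 3, %b -> 3, %c -> 5}, the
// transposed map below is {3 -> [%a, %b], 5 -> [%c]}, so %a and %b are both
// removed from the set of open intervals once instruction number 3 is
// visited.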
5949 for (auto &Interval : EndPoint) 5950 TransposeEnds[Interval.second].push_back(Interval.first); 5951 5952 SmallPtrSet<Instruction *, 8> OpenIntervals; 5953 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5954 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5955 5956 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5957 5958 auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned { 5959 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 5960 return 0; 5961 return TTI.getRegUsageForType(VectorType::get(Ty, VF)); 5962 }; 5963 5964 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5965 Instruction *I = IdxToInstr[i]; 5966 5967 // Remove all of the instructions that end at this location. 5968 InstrList &List = TransposeEnds[i]; 5969 for (Instruction *ToRemove : List) 5970 OpenIntervals.erase(ToRemove); 5971 5972 // Ignore instructions that are never used within the loop. 5973 if (!Ends.count(I)) 5974 continue; 5975 5976 // Skip ignored values. 5977 if (ValuesToIgnore.count(I)) 5978 continue; 5979 5980 // For each VF find the maximum usage of registers. 5981 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5982 // Count the number of live intervals. 5983 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5984 5985 if (VFs[j].isScalar()) { 5986 for (auto Inst : OpenIntervals) { 5987 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5988 if (RegUsage.find(ClassID) == RegUsage.end()) 5989 RegUsage[ClassID] = 1; 5990 else 5991 RegUsage[ClassID] += 1; 5992 } 5993 } else { 5994 collectUniformsAndScalars(VFs[j]); 5995 for (auto Inst : OpenIntervals) { 5996 // Skip ignored values for VF > 1. 5997 if (VecValuesToIgnore.count(Inst)) 5998 continue; 5999 if (isScalarAfterVectorization(Inst, VFs[j])) { 6000 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6001 if (RegUsage.find(ClassID) == RegUsage.end()) 6002 RegUsage[ClassID] = 1; 6003 else 6004 RegUsage[ClassID] += 1; 6005 } else { 6006 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6007 if (RegUsage.find(ClassID) == RegUsage.end()) 6008 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6009 else 6010 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6011 } 6012 } 6013 } 6014 6015 for (auto& pair : RegUsage) { 6016 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6017 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6018 else 6019 MaxUsages[j][pair.first] = pair.second; 6020 } 6021 } 6022 6023 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6024 << OpenIntervals.size() << '\n'); 6025 6026 // Add the current instruction to the list of open intervals. 6027 OpenIntervals.insert(I); 6028 } 6029 6030 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6031 SmallMapVector<unsigned, unsigned, 4> Invariant; 6032 6033 for (auto Inst : LoopInvariants) { 6034 unsigned Usage = 6035 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6036 unsigned ClassID = 6037 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6038 if (Invariant.find(ClassID) == Invariant.end()) 6039 Invariant[ClassID] = Usage; 6040 else 6041 Invariant[ClassID] += Usage; 6042 } 6043 6044 LLVM_DEBUG({ 6045 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6046 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6047 << " item\n"; 6048 for (const auto &pair : MaxUsages[i]) { 6049 dbgs() << "LV(REG): RegisterClass: " 6050 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6051 << " registers\n"; 6052 } 6053 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6054 << " item\n"; 6055 for (const auto &pair : Invariant) { 6056 dbgs() << "LV(REG): RegisterClass: " 6057 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6058 << " registers\n"; 6059 } 6060 }); 6061 6062 RU.LoopInvariantRegs = Invariant; 6063 RU.MaxLocalUsers = MaxUsages[i]; 6064 RUs[i] = RU; 6065 } 6066 6067 return RUs; 6068 } 6069 6070 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 6071 ElementCount VF) { 6072 // TODO: Cost model for emulated masked load/store is completely 6073 // broken. This hack guides the cost model to use an artificially 6074 // high enough value to practically disable vectorization with such 6075 // operations, except where the previously deployed legality hack allowed 6076 // using very low cost values. This is to avoid regressions coming simply 6077 // from moving the "masked load/store" check from legality to the cost model. 6078 // Masked Load/Gather emulation was previously never allowed. 6079 // Only a limited number of Masked Store/Scatter emulations were allowed. 6080 assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction"); 6081 return isa<LoadInst>(I) || 6082 (isa<StoreInst>(I) && 6083 NumPredStores > NumberOfStoresToPredicate); 6084 } 6085 6086 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6087 // If we aren't vectorizing the loop, or if we've already collected the 6088 // instructions to scalarize, there's nothing to do. Collection may already 6089 // have occurred if we have a user-selected VF and are now computing the 6090 // expected cost for interleaving. 6091 if (VF.isScalar() || VF.isZero() || 6092 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6093 return; 6094 6095 // Initialize a mapping for VF in InstsToScalarize. If we find that it's 6096 // not profitable to scalarize any instructions, the presence of VF in the 6097 // map will indicate that we've analyzed it already. 6098 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6099 6100 // Find all the instructions that are scalar with predication in the loop and 6101 // determine if it would be better to not if-convert the blocks they are in. 6102 // If so, we also record the instructions to scalarize. 6103 for (BasicBlock *BB : TheLoop->blocks()) { 6104 if (!blockNeedsPredicationForAnyReason(BB)) 6105 continue; 6106 for (Instruction &I : *BB) 6107 if (isScalarWithPredication(&I, VF)) { 6108 ScalarCostsTy ScalarCosts; 6109 // Do not apply discount if scalable, because that would lead to 6110 // invalid scalarization costs. 6111 // Do not apply discount logic if hacked cost is needed 6112 // for emulated masked memrefs.
6113 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && 6114 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6115 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6116 // Remember that BB will remain after vectorization. 6117 PredicatedBBsAfterVectorization.insert(BB); 6118 } 6119 } 6120 } 6121 6122 int LoopVectorizationCostModel::computePredInstDiscount( 6123 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6124 assert(!isUniformAfterVectorization(PredInst, VF) && 6125 "Instruction marked uniform-after-vectorization will be predicated"); 6126 6127 // Initialize the discount to zero, meaning that the scalar version and the 6128 // vector version cost the same. 6129 InstructionCost Discount = 0; 6130 6131 // Holds instructions to analyze. The instructions we visit are mapped in 6132 // ScalarCosts. Those instructions are the ones that would be scalarized if 6133 // we find that the scalar version costs less. 6134 SmallVector<Instruction *, 8> Worklist; 6135 6136 // Returns true if the given instruction can be scalarized. 6137 auto canBeScalarized = [&](Instruction *I) -> bool { 6138 // We only attempt to scalarize instructions forming a single-use chain 6139 // from the original predicated block that would otherwise be vectorized. 6140 // Although not strictly necessary, we give up on instructions we know will 6141 // already be scalar to avoid traversing chains that are unlikely to be 6142 // beneficial. 6143 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6144 isScalarAfterVectorization(I, VF)) 6145 return false; 6146 6147 // If the instruction is scalar with predication, it will be analyzed 6148 // separately. We ignore it within the context of PredInst. 6149 if (isScalarWithPredication(I, VF)) 6150 return false; 6151 6152 // If any of the instruction's operands are uniform after vectorization, 6153 // the instruction cannot be scalarized. This prevents, for example, a 6154 // masked load from being scalarized. 6155 // 6156 // We assume we will only emit a value for lane zero of an instruction 6157 // marked uniform after vectorization, rather than VF identical values. 6158 // Thus, if we scalarize an instruction that uses a uniform, we would 6159 // create uses of values corresponding to the lanes we aren't emitting code 6160 // for. This behavior can be changed by allowing getScalarValue to clone 6161 // the lane zero values for uniforms rather than asserting. 6162 for (Use &U : I->operands()) 6163 if (auto *J = dyn_cast<Instruction>(U.get())) 6164 if (isUniformAfterVectorization(J, VF)) 6165 return false; 6166 6167 // Otherwise, we can scalarize the instruction. 6168 return true; 6169 }; 6170 6171 // Compute the expected cost discount from scalarizing the entire expression 6172 // feeding the predicated instruction. We currently only consider expressions 6173 // that are single-use instruction chains. 6174 Worklist.push_back(PredInst); 6175 while (!Worklist.empty()) { 6176 Instruction *I = Worklist.pop_back_val(); 6177 6178 // If we've already analyzed the instruction, there's nothing to do. 6179 if (ScalarCosts.find(I) != ScalarCosts.end()) 6180 continue; 6181 6182 // Compute the cost of the vector instruction. Note that this cost already 6183 // includes the scalarization overhead of the predicated instruction. 6184 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6185 6186 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6187 // the instruction as if it wasn't if-converted and instead remained in the 6188 // predicated block. We will scale this cost by block probability after 6189 // computing the scalarization overhead. 6190 InstructionCost ScalarCost = 6191 VF.getFixedValue() * 6192 getInstructionCost(I, ElementCount::getFixed(1)).first; 6193 6194 // Compute the scalarization overhead of needed insertelement instructions 6195 // and phi nodes. 6196 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6197 ScalarCost += TTI.getScalarizationOverhead( 6198 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6199 APInt::getAllOnes(VF.getFixedValue()), true, false); 6200 ScalarCost += 6201 VF.getFixedValue() * 6202 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6203 } 6204 6205 // Compute the scalarization overhead of needed extractelement 6206 // instructions. For each of the instruction's operands, if the operand can 6207 // be scalarized, add it to the worklist; otherwise, account for the 6208 // overhead. 6209 for (Use &U : I->operands()) 6210 if (auto *J = dyn_cast<Instruction>(U.get())) { 6211 assert(VectorType::isValidElementType(J->getType()) && 6212 "Instruction has non-scalar type"); 6213 if (canBeScalarized(J)) 6214 Worklist.push_back(J); 6215 else if (needsExtract(J, VF)) { 6216 ScalarCost += TTI.getScalarizationOverhead( 6217 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6218 APInt::getAllOnes(VF.getFixedValue()), false, true); 6219 } 6220 } 6221 6222 // Scale the total scalar cost by block probability. 6223 ScalarCost /= getReciprocalPredBlockProb(); 6224 6225 // Compute the discount. A non-negative discount means the vector version 6226 // of the instruction costs more, and scalarizing would be beneficial. 6227 Discount += VectorCost - ScalarCost; 6228 ScalarCosts[I] = ScalarCost; 6229 } 6230 6231 return *Discount.getValue(); 6232 } 6233 6234 LoopVectorizationCostModel::VectorizationCostTy 6235 LoopVectorizationCostModel::expectedCost( 6236 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6237 VectorizationCostTy Cost; 6238 6239 // For each block. 6240 for (BasicBlock *BB : TheLoop->blocks()) { 6241 VectorizationCostTy BlockCost; 6242 6243 // For each instruction in the old loop. 6244 for (Instruction &I : BB->instructionsWithoutDebug()) { 6245 // Skip ignored values. 6246 if (ValuesToIgnore.count(&I) || 6247 (VF.isVector() && VecValuesToIgnore.count(&I))) 6248 continue; 6249 6250 VectorizationCostTy C = getInstructionCost(&I, VF); 6251 6252 // Check if we should override the cost. 6253 if (C.first.isValid() && 6254 ForceTargetInstructionCost.getNumOccurrences() > 0) 6255 C.first = InstructionCost(ForceTargetInstructionCost); 6256 6257 // Keep a list of instructions with invalid costs. 6258 if (Invalid && !C.first.isValid()) 6259 Invalid->emplace_back(&I, VF); 6260 6261 BlockCost.first += C.first; 6262 BlockCost.second |= C.second; 6263 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6264 << " for VF " << VF << " For instruction: " << I 6265 << '\n'); 6266 } 6267 6268 // If we are vectorizing a predicated block, it will have been 6269 // if-converted. This means that the block's instructions (aside from 6270 // stores and instructions that may divide by zero) will now be 6271 // unconditionally executed. For the scalar case, we may not always execute 6272 // the predicated block, if it is an if-else block. Thus, scale the block's 6273 // cost by the probability of executing it. 
blockNeedsPredication from 6274 // Legal is used so as to not include all blocks in tail folded loops. 6275 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6276 BlockCost.first /= getReciprocalPredBlockProb(); 6277 6278 Cost.first += BlockCost.first; 6279 Cost.second |= BlockCost.second; 6280 } 6281 6282 return Cost; 6283 } 6284 6285 /// Gets Address Access SCEV after verifying that the access pattern 6286 /// is loop invariant except the induction variable dependence. 6287 /// 6288 /// This SCEV can be sent to the Target in order to estimate the address 6289 /// calculation cost. 6290 static const SCEV *getAddressAccessSCEV( 6291 Value *Ptr, 6292 LoopVectorizationLegality *Legal, 6293 PredicatedScalarEvolution &PSE, 6294 const Loop *TheLoop) { 6295 6296 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6297 if (!Gep) 6298 return nullptr; 6299 6300 // We are looking for a gep with all loop invariant indices except for one 6301 // which should be an induction variable. 6302 auto SE = PSE.getSE(); 6303 unsigned NumOperands = Gep->getNumOperands(); 6304 for (unsigned i = 1; i < NumOperands; ++i) { 6305 Value *Opd = Gep->getOperand(i); 6306 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6307 !Legal->isInductionVariable(Opd)) 6308 return nullptr; 6309 } 6310 6311 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6312 return PSE.getSCEV(Ptr); 6313 } 6314 6315 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6316 return Legal->hasStride(I->getOperand(0)) || 6317 Legal->hasStride(I->getOperand(1)); 6318 } 6319 6320 InstructionCost 6321 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6322 ElementCount VF) { 6323 assert(VF.isVector() && 6324 "Scalarization cost of instruction implies vectorization."); 6325 if (VF.isScalable()) 6326 return InstructionCost::getInvalid(); 6327 6328 Type *ValTy = getLoadStoreType(I); 6329 auto SE = PSE.getSE(); 6330 6331 unsigned AS = getLoadStoreAddressSpace(I); 6332 Value *Ptr = getLoadStorePointerOperand(I); 6333 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6334 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6335 // that it is being called from this specific place. 6336 6337 // Figure out whether the access is strided and get the stride value 6338 // if it's known in compile time 6339 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6340 6341 // Get the cost of the scalar memory instruction and address computation. 6342 InstructionCost Cost = 6343 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6344 6345 // Don't pass *I here, since it is scalar but will actually be part of a 6346 // vectorized loop where the user of it is a vectorized instruction. 6347 const Align Alignment = getLoadStoreAlignment(I); 6348 Cost += VF.getKnownMinValue() * 6349 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6350 AS, TTI::TCK_RecipThroughput); 6351 6352 // Get the overhead of the extractelement and insertelement instructions 6353 // we might create due to scalarization. 6354 Cost += getScalarizationOverhead(I, VF); 6355 6356 // If we have a predicated load/store, it will need extra i1 extracts and 6357 // conditional branches, but may not be executed for each vector lane. Scale 6358 // the cost by the probability of executing the predicated block. 
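// Illustrative example: getReciprocalPredBlockProb() currently returns 2
// (an assumed 50% block execution probability), so a scalarized cost of 8 is
// halved to 4 before the i1 extract and branch costs below are added.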
6359 if (isPredicatedInst(I, VF)) { 6360 Cost /= getReciprocalPredBlockProb(); 6361 6362 // Add the cost of an i1 extract and a branch 6363 auto *Vec_i1Ty = 6364 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6365 Cost += TTI.getScalarizationOverhead( 6366 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6367 /*Insert=*/false, /*Extract=*/true); 6368 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6369 6370 if (useEmulatedMaskMemRefHack(I, VF)) 6371 // Artificially setting to a high enough value to practically disable 6372 // vectorization with such operations. 6373 Cost = 3000000; 6374 } 6375 6376 return Cost; 6377 } 6378 6379 InstructionCost 6380 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6381 ElementCount VF) { 6382 Type *ValTy = getLoadStoreType(I); 6383 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6384 Value *Ptr = getLoadStorePointerOperand(I); 6385 unsigned AS = getLoadStoreAddressSpace(I); 6386 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6387 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6388 6389 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6390 "Stride should be 1 or -1 for consecutive memory access"); 6391 const Align Alignment = getLoadStoreAlignment(I); 6392 InstructionCost Cost = 0; 6393 if (Legal->isMaskRequired(I)) 6394 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6395 CostKind); 6396 else 6397 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6398 CostKind, I); 6399 6400 bool Reverse = ConsecutiveStride < 0; 6401 if (Reverse) 6402 Cost += 6403 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6404 return Cost; 6405 } 6406 6407 InstructionCost 6408 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6409 ElementCount VF) { 6410 assert(Legal->isUniformMemOp(*I)); 6411 6412 Type *ValTy = getLoadStoreType(I); 6413 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6414 const Align Alignment = getLoadStoreAlignment(I); 6415 unsigned AS = getLoadStoreAddressSpace(I); 6416 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6417 if (isa<LoadInst>(I)) { 6418 return TTI.getAddressComputationCost(ValTy) + 6419 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6420 CostKind) + 6421 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6422 } 6423 StoreInst *SI = cast<StoreInst>(I); 6424 6425 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6426 return TTI.getAddressComputationCost(ValTy) + 6427 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6428 CostKind) + 6429 (isLoopInvariantStoreValue 6430 ? 
0 6431 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6432 VF.getKnownMinValue() - 1)); 6433 } 6434 6435 InstructionCost 6436 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6437 ElementCount VF) { 6438 Type *ValTy = getLoadStoreType(I); 6439 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6440 const Align Alignment = getLoadStoreAlignment(I); 6441 const Value *Ptr = getLoadStorePointerOperand(I); 6442 6443 return TTI.getAddressComputationCost(VectorTy) + 6444 TTI.getGatherScatterOpCost( 6445 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6446 TargetTransformInfo::TCK_RecipThroughput, I); 6447 } 6448 6449 InstructionCost 6450 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6451 ElementCount VF) { 6452 // TODO: Once we have support for interleaving with scalable vectors 6453 // we can calculate the cost properly here. 6454 if (VF.isScalable()) 6455 return InstructionCost::getInvalid(); 6456 6457 Type *ValTy = getLoadStoreType(I); 6458 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6459 unsigned AS = getLoadStoreAddressSpace(I); 6460 6461 auto Group = getInterleavedAccessGroup(I); 6462 assert(Group && "Fail to get an interleaved access group."); 6463 6464 unsigned InterleaveFactor = Group->getFactor(); 6465 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6466 6467 // Holds the indices of existing members in the interleaved group. 6468 SmallVector<unsigned, 4> Indices; 6469 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6470 if (Group->getMember(IF)) 6471 Indices.push_back(IF); 6472 6473 // Calculate the cost of the whole interleaved group. 6474 bool UseMaskForGaps = 6475 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6476 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6477 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6478 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6479 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6480 6481 if (Group->isReverse()) { 6482 // TODO: Add support for reversed masked interleaved access. 6483 assert(!Legal->isMaskRequired(I) && 6484 "Reverse masked interleaved access not supported."); 6485 Cost += 6486 Group->getNumMembers() * 6487 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6488 } 6489 return Cost; 6490 } 6491 6492 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( 6493 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6494 using namespace llvm::PatternMatch; 6495 // Early exit for loops with no in-loop reductions. 6496 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6497 return None; 6498 auto *VectorTy = cast<VectorType>(Ty); 6499 6500 // We are looking for one of the following patterns, and for the minimal acceptable cost: 6501 // reduce(mul(ext(A), ext(B))) or 6502 // reduce(mul(A, B)) or 6503 // reduce(ext(A)) or 6504 // reduce(A). 6505 // The basic idea is that we walk down the tree, finding the root 6506 // reduction instruction in InLoopReductionImmediateChains. From there we find 6507 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6508 // of the components. If the reduction cost is lower, then we return it for the 6509 // reduction instruction and 0 for the other instructions in the pattern. If 6510 // it is not, we return an invalid cost specifying that the original cost method 6511 // should be used.
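// Illustrative example: for an in-loop reduction accumulating
//   sum += sext(i8 %a) * sext(i8 %b)   // accumulated as i32
// the walk below steps from an ext through the mul to the add; if TTI
// reports a cheaper extended-add reduction, the mul and both extends are
// costed as part of the reduction (0 for them, RedCost for the add).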
6512 Instruction *RetI = I; 6513 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 6514 if (!RetI->hasOneUser()) 6515 return None; 6516 RetI = RetI->user_back(); 6517 } 6518 if (match(RetI, m_Mul(m_Value(), m_Value())) && 6519 RetI->user_back()->getOpcode() == Instruction::Add) { 6520 if (!RetI->hasOneUser()) 6521 return None; 6522 RetI = RetI->user_back(); 6523 } 6524 6525 // Test if the found instruction is a reduction, and if not return an invalid 6526 // cost specifying the parent to use the original cost modelling. 6527 if (!InLoopReductionImmediateChains.count(RetI)) 6528 return None; 6529 6530 // Find the reduction this chain is a part of and calculate the basic cost of 6531 // the reduction on its own. 6532 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6533 Instruction *ReductionPhi = LastChain; 6534 while (!isa<PHINode>(ReductionPhi)) 6535 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6536 6537 const RecurrenceDescriptor &RdxDesc = 6538 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6539 6540 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6541 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6542 6543 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6544 // normal fmul instruction to the cost of the fadd reduction. 6545 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6546 BaseCost += 6547 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6548 6549 // If we're using ordered reductions then we can just return the base cost 6550 // here, since getArithmeticReductionCost calculates the full ordered 6551 // reduction cost when FP reassociation is not allowed. 6552 if (useOrderedReductions(RdxDesc)) 6553 return BaseCost; 6554 6555 // Get the operand that was not the reduction chain and match it to one of the 6556 // patterns, returning the better cost if it is found. 6557 Instruction *RedOp = RetI->getOperand(1) == LastChain 6558 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6559 : dyn_cast<Instruction>(RetI->getOperand(1)); 6560 6561 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6562 6563 Instruction *Op0, *Op1; 6564 if (RedOp && 6565 match(RedOp, 6566 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6567 match(Op0, m_ZExtOrSExt(m_Value())) && 6568 Op0->getOpcode() == Op1->getOpcode() && 6569 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6570 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6571 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6572 6573 // Matched reduce(ext(mul(ext(A), ext(B))) 6574 // Note that the extend opcodes need to all match, or if A==B they will have 6575 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6576 // which is equally fine. 
6577 bool IsUnsigned = isa<ZExtInst>(Op0); 6578 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6579 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6580 6581 InstructionCost ExtCost = 6582 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6583 TTI::CastContextHint::None, CostKind, Op0); 6584 InstructionCost MulCost = 6585 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6586 InstructionCost Ext2Cost = 6587 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6588 TTI::CastContextHint::None, CostKind, RedOp); 6589 6590 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6591 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6592 CostKind); 6593 6594 if (RedCost.isValid() && 6595 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6596 return I == RetI ? RedCost : 0; 6597 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6598 !TheLoop->isLoopInvariant(RedOp)) { 6599 // Matched reduce(ext(A)) 6600 bool IsUnsigned = isa<ZExtInst>(RedOp); 6601 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6602 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6603 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6604 CostKind); 6605 6606 InstructionCost ExtCost = 6607 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6608 TTI::CastContextHint::None, CostKind, RedOp); 6609 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6610 return I == RetI ? RedCost : 0; 6611 } else if (RedOp && 6612 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6613 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6614 Op0->getOpcode() == Op1->getOpcode() && 6615 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6616 bool IsUnsigned = isa<ZExtInst>(Op0); 6617 Type *Op0Ty = Op0->getOperand(0)->getType(); 6618 Type *Op1Ty = Op1->getOperand(0)->getType(); 6619 Type *LargestOpTy = 6620 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6621 : Op0Ty; 6622 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6623 6624 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 6625 // different sizes. We take the largest type as the ext to reduce, and add 6626 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6627 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6628 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6629 TTI::CastContextHint::None, CostKind, Op0); 6630 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6631 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6632 TTI::CastContextHint::None, CostKind, Op1); 6633 InstructionCost MulCost = 6634 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6635 6636 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6637 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6638 CostKind); 6639 InstructionCost ExtraExtCost = 0; 6640 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6641 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6642 ExtraExtCost = TTI.getCastInstrCost( 6643 ExtraExtOp->getOpcode(), ExtType, 6644 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6645 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6646 } 6647 6648 if (RedCost.isValid() && 6649 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6650 return I == RetI ? 
RedCost : 0; 6651 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6652 // Matched reduce(mul()) 6653 InstructionCost MulCost = 6654 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6655 6656 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6657 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 6658 CostKind); 6659 6660 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6661 return I == RetI ? RedCost : 0; 6662 } 6663 } 6664 6665 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 6666 } 6667 6668 InstructionCost 6669 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6670 ElementCount VF) { 6671 // Calculate scalar cost only. Vectorization cost should be ready at this 6672 // moment. 6673 if (VF.isScalar()) { 6674 Type *ValTy = getLoadStoreType(I); 6675 const Align Alignment = getLoadStoreAlignment(I); 6676 unsigned AS = getLoadStoreAddressSpace(I); 6677 6678 return TTI.getAddressComputationCost(ValTy) + 6679 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6680 TTI::TCK_RecipThroughput, I); 6681 } 6682 return getWideningCost(I, VF); 6683 } 6684 6685 LoopVectorizationCostModel::VectorizationCostTy 6686 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6687 ElementCount VF) { 6688 // If we know that this instruction will remain uniform, check the cost of 6689 // the scalar version. 6690 if (isUniformAfterVectorization(I, VF)) 6691 VF = ElementCount::getFixed(1); 6692 6693 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6694 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6695 6696 // Forced scalars do not have any scalarization overhead. 6697 auto ForcedScalar = ForcedScalars.find(VF); 6698 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6699 auto InstSet = ForcedScalar->second; 6700 if (InstSet.count(I)) 6701 return VectorizationCostTy( 6702 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6703 VF.getKnownMinValue()), 6704 false); 6705 } 6706 6707 Type *VectorTy; 6708 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6709 6710 bool TypeNotScalarized = false; 6711 if (VF.isVector() && VectorTy->isVectorTy()) { 6712 unsigned NumParts = TTI.getNumberOfParts(VectorTy); 6713 if (NumParts) 6714 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 6715 else 6716 C = InstructionCost::getInvalid(); 6717 } 6718 return VectorizationCostTy(C, TypeNotScalarized); 6719 } 6720 6721 InstructionCost 6722 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6723 ElementCount VF) const { 6724 6725 // There is no mechanism yet to create a scalable scalarization loop, 6726 // so this is currently Invalid. 6727 if (VF.isScalable()) 6728 return InstructionCost::getInvalid(); 6729 6730 if (VF.isScalar()) 6731 return 0; 6732 6733 InstructionCost Cost = 0; 6734 Type *RetTy = ToVectorTy(I->getType(), VF); 6735 if (!RetTy->isVoidTy() && 6736 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6737 Cost += TTI.getScalarizationOverhead( 6738 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 6739 false); 6740 6741 // Some targets keep addresses scalar. 6742 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6743 return Cost; 6744 6745 // Some targets support efficient element stores. 6746 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6747 return Cost; 6748 6749 // Collect operands to consider. 6750 CallInst *CI = dyn_cast<CallInst>(I); 6751 Instruction::op_range Ops = CI ? 
CI->args() : I->operands(); 6752 6753 // Skip operands that do not require extraction/scalarization and do not incur 6754 // any overhead. 6755 SmallVector<Type *> Tys; 6756 for (auto *V : filterExtractingOperands(Ops, VF)) 6757 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 6758 return Cost + TTI.getOperandsScalarizationOverhead( 6759 filterExtractingOperands(Ops, VF), Tys); 6760 } 6761 6762 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6763 if (VF.isScalar()) 6764 return; 6765 NumPredStores = 0; 6766 for (BasicBlock *BB : TheLoop->blocks()) { 6767 // For each instruction in the old loop. 6768 for (Instruction &I : *BB) { 6769 Value *Ptr = getLoadStorePointerOperand(&I); 6770 if (!Ptr) 6771 continue; 6772 6773 // TODO: We should generate better code and update the cost model for 6774 // predicated uniform stores. Today they are treated as any other 6775 // predicated store (see added test cases in 6776 // invariant-store-vectorization.ll). 6777 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 6778 NumPredStores++; 6779 6780 if (Legal->isUniformMemOp(I)) { 6781 // TODO: Avoid replicating loads and stores instead of 6782 // relying on instcombine to remove them. 6783 // Load: Scalar load + broadcast 6784 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6785 InstructionCost Cost; 6786 if (isa<StoreInst>(&I) && VF.isScalable() && 6787 isLegalGatherOrScatter(&I, VF)) { 6788 Cost = getGatherScatterCost(&I, VF); 6789 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 6790 } else { 6791 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 6792 "Cannot yet scalarize uniform stores"); 6793 Cost = getUniformMemOpCost(&I, VF); 6794 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6795 } 6796 continue; 6797 } 6798 6799 // We assume that widening is the best solution when possible. 6800 if (memoryInstructionCanBeWidened(&I, VF)) { 6801 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6802 int ConsecutiveStride = Legal->isConsecutivePtr( 6803 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 6804 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6805 "Expected consecutive stride."); 6806 InstWidening Decision = 6807 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6808 setWideningDecision(&I, VF, Decision, Cost); 6809 continue; 6810 } 6811 6812 // Choose between Interleaving, Gather/Scatter or Scalarization. 6813 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 6814 unsigned NumAccesses = 1; 6815 if (isAccessInterleaved(&I)) { 6816 auto Group = getInterleavedAccessGroup(&I); 6817 assert(Group && "Fail to get an interleaved access group."); 6818 6819 // Make one decision for the whole group. 6820 if (getWideningDecision(&I, VF) != CM_Unknown) 6821 continue; 6822 6823 NumAccesses = Group->getNumMembers(); 6824 if (interleavedAccessCanBeWidened(&I, VF)) 6825 InterleaveCost = getInterleaveGroupCost(&I, VF); 6826 } 6827 6828 InstructionCost GatherScatterCost = 6829 isLegalGatherOrScatter(&I, VF) 6830 ? getGatherScatterCost(&I, VF) * NumAccesses 6831 : InstructionCost::getInvalid(); 6832 6833 InstructionCost ScalarizationCost = 6834 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6835 6836 // Choose better solution for the current VF, 6837 // write down this decision and use it during vectorization. 
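// Illustrative example: with InterleaveCost = 6, GatherScatterCost = 10 and
// ScalarizationCost = 12 the group is interleaved. An invalid
// InstructionCost compares as more expensive than any valid one, so
// interleaving or gather/scatter is only chosen when its cost is valid;
// scalarization is the fallback.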
6838 InstructionCost Cost; 6839 InstWidening Decision; 6840 if (InterleaveCost <= GatherScatterCost && 6841 InterleaveCost < ScalarizationCost) { 6842 Decision = CM_Interleave; 6843 Cost = InterleaveCost; 6844 } else if (GatherScatterCost < ScalarizationCost) { 6845 Decision = CM_GatherScatter; 6846 Cost = GatherScatterCost; 6847 } else { 6848 Decision = CM_Scalarize; 6849 Cost = ScalarizationCost; 6850 } 6851 // If the instructions belongs to an interleave group, the whole group 6852 // receives the same decision. The whole group receives the cost, but 6853 // the cost will actually be assigned to one instruction. 6854 if (auto Group = getInterleavedAccessGroup(&I)) 6855 setWideningDecision(Group, VF, Decision, Cost); 6856 else 6857 setWideningDecision(&I, VF, Decision, Cost); 6858 } 6859 } 6860 6861 // Make sure that any load of address and any other address computation 6862 // remains scalar unless there is gather/scatter support. This avoids 6863 // inevitable extracts into address registers, and also has the benefit of 6864 // activating LSR more, since that pass can't optimize vectorized 6865 // addresses. 6866 if (TTI.prefersVectorizedAddressing()) 6867 return; 6868 6869 // Start with all scalar pointer uses. 6870 SmallPtrSet<Instruction *, 8> AddrDefs; 6871 for (BasicBlock *BB : TheLoop->blocks()) 6872 for (Instruction &I : *BB) { 6873 Instruction *PtrDef = 6874 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6875 if (PtrDef && TheLoop->contains(PtrDef) && 6876 getWideningDecision(&I, VF) != CM_GatherScatter) 6877 AddrDefs.insert(PtrDef); 6878 } 6879 6880 // Add all instructions used to generate the addresses. 6881 SmallVector<Instruction *, 4> Worklist; 6882 append_range(Worklist, AddrDefs); 6883 while (!Worklist.empty()) { 6884 Instruction *I = Worklist.pop_back_val(); 6885 for (auto &Op : I->operands()) 6886 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6887 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6888 AddrDefs.insert(InstOp).second) 6889 Worklist.push_back(InstOp); 6890 } 6891 6892 for (auto *I : AddrDefs) { 6893 if (isa<LoadInst>(I)) { 6894 // Setting the desired widening decision should ideally be handled in 6895 // by cost functions, but since this involves the task of finding out 6896 // if the loaded register is involved in an address computation, it is 6897 // instead changed here when we know this is the case. 6898 InstWidening Decision = getWideningDecision(I, VF); 6899 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6900 // Scalarize a widened load of address. 6901 setWideningDecision( 6902 I, VF, CM_Scalarize, 6903 (VF.getKnownMinValue() * 6904 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6905 else if (auto Group = getInterleavedAccessGroup(I)) { 6906 // Scalarize an interleave group of address loads. 6907 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6908 if (Instruction *Member = Group->getMember(I)) 6909 setWideningDecision( 6910 Member, VF, CM_Scalarize, 6911 (VF.getKnownMinValue() * 6912 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6913 } 6914 } 6915 } else 6916 // Make sure I gets scalarized and a cost estimate without 6917 // scalarization overhead. 
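// For example (hypothetical): an 'add' that only feeds the addresses of
// scalar-addressed loads/stores lands in ForcedScalars here, and is later
// costed as VF copies of its scalar cost with no insert/extract overhead
// added on top.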
6918 ForcedScalars[VF].insert(I); 6919 } 6920 } 6921 6922 InstructionCost 6923 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 6924 Type *&VectorTy) { 6925 Type *RetTy = I->getType(); 6926 if (canTruncateToMinimalBitwidth(I, VF)) 6927 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6928 auto SE = PSE.getSE(); 6929 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6930 6931 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 6932 ElementCount VF) -> bool { 6933 if (VF.isScalar()) 6934 return true; 6935 6936 auto Scalarized = InstsToScalarize.find(VF); 6937 assert(Scalarized != InstsToScalarize.end() && 6938 "VF not yet analyzed for scalarization profitability"); 6939 return !Scalarized->second.count(I) && 6940 llvm::all_of(I->users(), [&](User *U) { 6941 auto *UI = cast<Instruction>(U); 6942 return !Scalarized->second.count(UI); 6943 }); 6944 }; 6945 (void) hasSingleCopyAfterVectorization; 6946 6947 if (isScalarAfterVectorization(I, VF)) { 6948 // With the exception of GEPs and PHIs, after scalarization there should 6949 // only be one copy of the instruction generated in the loop. This is 6950 // because the VF is either 1, or any instructions that need scalarizing 6951 // have already been dealt with by the the time we get here. As a result, 6952 // it means we don't have to multiply the instruction cost by VF. 6953 assert(I->getOpcode() == Instruction::GetElementPtr || 6954 I->getOpcode() == Instruction::PHI || 6955 (I->getOpcode() == Instruction::BitCast && 6956 I->getType()->isPointerTy()) || 6957 hasSingleCopyAfterVectorization(I, VF)); 6958 VectorTy = RetTy; 6959 } else 6960 VectorTy = ToVectorTy(RetTy, VF); 6961 6962 // TODO: We need to estimate the cost of intrinsic calls. 6963 switch (I->getOpcode()) { 6964 case Instruction::GetElementPtr: 6965 // We mark this instruction as zero-cost because the cost of GEPs in 6966 // vectorized code depends on whether the corresponding memory instruction 6967 // is scalarized or not. Therefore, we handle GEPs with the memory 6968 // instruction cost. 6969 return 0; 6970 case Instruction::Br: { 6971 // In cases of scalarized and predicated instructions, there will be VF 6972 // predicated blocks in the vectorized loop. Each branch around these 6973 // blocks requires also an extract of its vector compare i1 element. 6974 bool ScalarPredicatedBB = false; 6975 BranchInst *BI = cast<BranchInst>(I); 6976 if (VF.isVector() && BI->isConditional() && 6977 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 6978 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 6979 ScalarPredicatedBB = true; 6980 6981 if (ScalarPredicatedBB) { 6982 // Not possible to scalarize scalable vector with predicated instructions. 6983 if (VF.isScalable()) 6984 return InstructionCost::getInvalid(); 6985 // Return cost for branches around scalarized and predicated blocks. 6986 auto *Vec_i1Ty = 6987 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6988 return ( 6989 TTI.getScalarizationOverhead( 6990 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + 6991 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 6992 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 6993 // The back-edge branch will remain, as will all scalar branches. 6994 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6995 else 6996 // This branch will be eliminated by if-conversion. 
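// (For instance, the branch guarding a conditional body such as
// 'if (c[i]) a[i] = x;' disappears once that body is predicated, so it is
// given no cost of its own here.)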
6997 return 0; 6998 // Note: We currently assume zero cost for an unconditional branch inside 6999 // a predicated block since it will become a fall-through, although we 7000 // may decide in the future to call TTI for all branches. 7001 } 7002 case Instruction::PHI: { 7003 auto *Phi = cast<PHINode>(I); 7004 7005 // First-order recurrences are replaced by vector shuffles inside the loop. 7006 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7007 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7008 return TTI.getShuffleCost( 7009 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7010 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7011 7012 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7013 // converted into select instructions. We require N - 1 selects per phi 7014 // node, where N is the number of incoming values. 7015 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7016 return (Phi->getNumIncomingValues() - 1) * 7017 TTI.getCmpSelInstrCost( 7018 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7019 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7020 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7021 7022 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7023 } 7024 case Instruction::UDiv: 7025 case Instruction::SDiv: 7026 case Instruction::URem: 7027 case Instruction::SRem: 7028 // If we have a predicated instruction, it may not be executed for each 7029 // vector lane. Get the scalarization cost and scale this amount by the 7030 // probability of executing the predicated block. If the instruction is not 7031 // predicated, we fall through to the next case. 7032 if (VF.isVector() && isScalarWithPredication(I, VF)) { 7033 InstructionCost Cost = 0; 7034 7035 // These instructions have a non-void type, so account for the phi nodes 7036 // that we will create. This cost is likely to be zero. The phi node 7037 // cost, if any, should be scaled by the block probability because it 7038 // models a copy at the end of each predicated block. 7039 Cost += VF.getKnownMinValue() * 7040 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7041 7042 // The cost of the non-predicated instruction. 7043 Cost += VF.getKnownMinValue() * 7044 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7045 7046 // The cost of insertelement and extractelement instructions needed for 7047 // scalarization. 7048 Cost += getScalarizationOverhead(I, VF); 7049 7050 // Scale the cost by the probability of executing the predicated blocks. 7051 // This assumes the predicated block for each vector lane is equally 7052 // likely. 7053 return Cost / getReciprocalPredBlockProb(); 7054 } 7055 LLVM_FALLTHROUGH; 7056 case Instruction::Add: 7057 case Instruction::FAdd: 7058 case Instruction::Sub: 7059 case Instruction::FSub: 7060 case Instruction::Mul: 7061 case Instruction::FMul: 7062 case Instruction::FDiv: 7063 case Instruction::FRem: 7064 case Instruction::Shl: 7065 case Instruction::LShr: 7066 case Instruction::AShr: 7067 case Instruction::And: 7068 case Instruction::Or: 7069 case Instruction::Xor: { 7070 // Since we will replace the stride by 1 the multiplication should go away. 
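// e.g. (hypothetical) for an access 'A[i * Stride]' vectorized under a
// runtime 'Stride == 1' check, the multiply feeding the address folds to
// 'i' once the stride is versioned to 1, so it is treated as free.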
7071 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7072 return 0; 7073 7074 // Detect reduction patterns 7075 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7076 return *RedCost; 7077 7078 // Certain instructions can be cheaper to vectorize if they have a constant 7079 // second vector operand. One example of this are shifts on x86. 7080 Value *Op2 = I->getOperand(1); 7081 TargetTransformInfo::OperandValueProperties Op2VP; 7082 TargetTransformInfo::OperandValueKind Op2VK = 7083 TTI.getOperandInfo(Op2, Op2VP); 7084 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7085 Op2VK = TargetTransformInfo::OK_UniformValue; 7086 7087 SmallVector<const Value *, 4> Operands(I->operand_values()); 7088 return TTI.getArithmeticInstrCost( 7089 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7090 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7091 } 7092 case Instruction::FNeg: { 7093 return TTI.getArithmeticInstrCost( 7094 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7095 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7096 TargetTransformInfo::OP_None, I->getOperand(0), I); 7097 } 7098 case Instruction::Select: { 7099 SelectInst *SI = cast<SelectInst>(I); 7100 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7101 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7102 7103 const Value *Op0, *Op1; 7104 using namespace llvm::PatternMatch; 7105 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7106 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7107 // select x, y, false --> x & y 7108 // select x, true, y --> x | y 7109 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7110 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7111 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7112 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7113 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7114 Op1->getType()->getScalarSizeInBits() == 1); 7115 7116 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7117 return TTI.getArithmeticInstrCost( 7118 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7119 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7120 } 7121 7122 Type *CondTy = SI->getCondition()->getType(); 7123 if (!ScalarCond) 7124 CondTy = VectorType::get(CondTy, VF); 7125 7126 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7127 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7128 Pred = Cmp->getPredicate(); 7129 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7130 CostKind, I); 7131 } 7132 case Instruction::ICmp: 7133 case Instruction::FCmp: { 7134 Type *ValTy = I->getOperand(0)->getType(); 7135 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7136 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7137 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7138 VectorTy = ToVectorTy(ValTy, VF); 7139 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7140 cast<CmpInst>(I)->getPredicate(), CostKind, 7141 I); 7142 } 7143 case Instruction::Store: 7144 case Instruction::Load: { 7145 ElementCount Width = VF; 7146 if (Width.isVector()) { 7147 InstWidening Decision = getWideningDecision(I, Width); 7148 assert(Decision != CM_Unknown && 7149 "CM decision should be taken at this point"); 7150 if (Decision == CM_Scalarize) 7151 Width = ElementCount::getFixed(1); 7152 } 7153 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7154 return getMemoryInstructionCost(I, VF); 7155 } 7156 case Instruction::BitCast: 7157 if (I->getType()->isPointerTy()) 7158 return 0; 7159 LLVM_FALLTHROUGH; 7160 case Instruction::ZExt: 7161 case Instruction::SExt: 7162 case Instruction::FPToUI: 7163 case Instruction::FPToSI: 7164 case Instruction::FPExt: 7165 case Instruction::PtrToInt: 7166 case Instruction::IntToPtr: 7167 case Instruction::SIToFP: 7168 case Instruction::UIToFP: 7169 case Instruction::Trunc: 7170 case Instruction::FPTrunc: { 7171 // Computes the CastContextHint from a Load/Store instruction. 7172 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7173 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7174 "Expected a load or a store!"); 7175 7176 if (VF.isScalar() || !TheLoop->contains(I)) 7177 return TTI::CastContextHint::Normal; 7178 7179 switch (getWideningDecision(I, VF)) { 7180 case LoopVectorizationCostModel::CM_GatherScatter: 7181 return TTI::CastContextHint::GatherScatter; 7182 case LoopVectorizationCostModel::CM_Interleave: 7183 return TTI::CastContextHint::Interleave; 7184 case LoopVectorizationCostModel::CM_Scalarize: 7185 case LoopVectorizationCostModel::CM_Widen: 7186 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7187 : TTI::CastContextHint::Normal; 7188 case LoopVectorizationCostModel::CM_Widen_Reverse: 7189 return TTI::CastContextHint::Reversed; 7190 case LoopVectorizationCostModel::CM_Unknown: 7191 llvm_unreachable("Instr did not go through cost modelling?"); 7192 } 7193 7194 llvm_unreachable("Unhandled case!"); 7195 }; 7196 7197 unsigned Opcode = I->getOpcode(); 7198 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7199 // For Trunc, the context is the only user, which must be a StoreInst. 7200 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7201 if (I->hasOneUse()) 7202 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7203 CCH = ComputeCCH(Store); 7204 } 7205 // For Z/Sext, the context is the operand, which must be a LoadInst. 
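// e.g. (hypothetical) a 'sext' whose load operand was decided
// CM_Widen_Reverse above is costed with CastContextHint::Reversed, while a
// load that requires a mask would yield CastContextHint::Masked.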
7206 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7207 Opcode == Instruction::FPExt) { 7208 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7209 CCH = ComputeCCH(Load); 7210 } 7211 7212 // We optimize the truncation of induction variables having constant 7213 // integer steps. The cost of these truncations is the same as the scalar 7214 // operation. 7215 if (isOptimizableIVTruncate(I, VF)) { 7216 auto *Trunc = cast<TruncInst>(I); 7217 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7218 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7219 } 7220 7221 // Detect reduction patterns 7222 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7223 return *RedCost; 7224 7225 Type *SrcScalarTy = I->getOperand(0)->getType(); 7226 Type *SrcVecTy = 7227 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7228 if (canTruncateToMinimalBitwidth(I, VF)) { 7229 // This cast is going to be shrunk. This may remove the cast or it might 7230 // turn it into slightly different cast. For example, if MinBW == 16, 7231 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7232 // 7233 // Calculate the modified src and dest types. 7234 Type *MinVecTy = VectorTy; 7235 if (Opcode == Instruction::Trunc) { 7236 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7237 VectorTy = 7238 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7239 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7240 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7241 VectorTy = 7242 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7243 } 7244 } 7245 7246 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7247 } 7248 case Instruction::Call: { 7249 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7250 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7251 return *RedCost; 7252 bool NeedToScalarize; 7253 CallInst *CI = cast<CallInst>(I); 7254 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7255 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7256 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7257 return std::min(CallCost, IntrinsicCost); 7258 } 7259 return CallCost; 7260 } 7261 case Instruction::ExtractValue: 7262 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7263 case Instruction::Alloca: 7264 // We cannot easily widen alloca to a scalable alloca, as 7265 // the result would need to be a vector of pointers. 7266 if (VF.isScalable()) 7267 return InstructionCost::getInvalid(); 7268 LLVM_FALLTHROUGH; 7269 default: 7270 // This opcode is unknown. Assume that it is the same as 'mul'. 7271 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7272 } // end of switch. 
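// Illustration of the MinBWs-based shrinking handled above (hypothetical
// types): with MinBW == 16 and VF == 4, a 'zext i8 %x to i32' is costed as
// 'zext <4 x i8> to <4 x i16>', since the upper bits were proven dead.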
7273 } 7274 7275 char LoopVectorize::ID = 0; 7276 7277 static const char lv_name[] = "Loop Vectorization"; 7278 7279 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7280 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7281 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7282 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7283 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7284 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7285 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7286 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7287 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7288 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7289 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7290 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7291 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7292 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7293 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7294 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7295 7296 namespace llvm { 7297 7298 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7299 7300 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7301 bool VectorizeOnlyWhenForced) { 7302 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7303 } 7304 7305 } // end namespace llvm 7306 7307 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7308 // Check if the pointer operand of a load or store instruction is 7309 // consecutive. 7310 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7311 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7312 return false; 7313 } 7314 7315 void LoopVectorizationCostModel::collectValuesToIgnore() { 7316 // Ignore ephemeral values. 7317 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7318 7319 // Find all stores to invariant variables. Since they are going to sink 7320 // outside the loop we do not need calculate cost for them. 7321 for (BasicBlock *BB : TheLoop->blocks()) 7322 for (Instruction &I : *BB) { 7323 StoreInst *SI; 7324 if ((SI = dyn_cast<StoreInst>(&I)) && 7325 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 7326 ValuesToIgnore.insert(&I); 7327 } 7328 7329 // Ignore type-promoting instructions we identified during reduction 7330 // detection. 7331 for (auto &Reduction : Legal->getReductionVars()) { 7332 const RecurrenceDescriptor &RedDes = Reduction.second; 7333 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7334 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7335 } 7336 // Ignore type-casting instructions we identified during induction 7337 // detection. 7338 for (auto &Induction : Legal->getInductionVars()) { 7339 const InductionDescriptor &IndDes = Induction.second; 7340 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7341 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7342 } 7343 } 7344 7345 void LoopVectorizationCostModel::collectInLoopReductions() { 7346 for (auto &Reduction : Legal->getReductionVars()) { 7347 PHINode *Phi = Reduction.first; 7348 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7349 7350 // We don't collect reductions that are type promoted (yet). 7351 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7352 continue; 7353 7354 // If the target would prefer this reduction to happen "in-loop", then we 7355 // want to record it as such. 
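// For instance (hypothetical loop): for 'sum += a[i]', an in-loop reduction
// keeps a scalar 'sum' and reduces each vector of loads inside the loop,
// instead of carrying a wide vector phi that is reduced once after the loop.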
7356 unsigned Opcode = RdxDesc.getOpcode(); 7357 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7358 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7359 TargetTransformInfo::ReductionFlags())) 7360 continue; 7361 7362 // Check that we can correctly put the reductions into the loop, by 7363 // finding the chain of operations that leads from the phi to the loop 7364 // exit value. 7365 SmallVector<Instruction *, 4> ReductionOperations = 7366 RdxDesc.getReductionOpChain(Phi, TheLoop); 7367 bool InLoop = !ReductionOperations.empty(); 7368 if (InLoop) { 7369 InLoopReductionChains[Phi] = ReductionOperations; 7370 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7371 Instruction *LastChain = Phi; 7372 for (auto *I : ReductionOperations) { 7373 InLoopReductionImmediateChains[I] = LastChain; 7374 LastChain = I; 7375 } 7376 } 7377 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7378 << " reduction for phi: " << *Phi << "\n"); 7379 } 7380 } 7381 7382 // TODO: we could return a pair of values that specify the max VF and 7383 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7384 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7385 // doesn't have a cost model that can choose which plan to execute if 7386 // more than one is generated. 7387 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7388 LoopVectorizationCostModel &CM) { 7389 unsigned WidestType; 7390 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7391 return WidestVectorRegBits / WidestType; 7392 } 7393 7394 VectorizationFactor 7395 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7396 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7397 ElementCount VF = UserVF; 7398 // Outer loop handling: They may require CFG and instruction level 7399 // transformations before even evaluating whether vectorization is profitable. 7400 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7401 // the vectorization pipeline. 7402 if (!OrigLoop->isInnermost()) { 7403 // If the user doesn't provide a vectorization factor, determine a 7404 // reasonable one. 7405 if (UserVF.isZero()) { 7406 VF = ElementCount::getFixed(determineVPlanVF( 7407 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7408 .getFixedSize(), 7409 CM)); 7410 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7411 7412 // Make sure we have a VF > 1 for stress testing. 7413 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7414 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7415 << "overriding computed VF.\n"); 7416 VF = ElementCount::getFixed(4); 7417 } 7418 } 7419 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7420 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7421 "VF needs to be a power of two"); 7422 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7423 << "VF " << VF << " to build VPlans.\n"); 7424 buildVPlans(VF, VF); 7425 7426 // For VPlan build stress testing, we bail out after VPlan construction. 7427 if (VPlanBuildStressTest) 7428 return VectorizationFactor::Disabled(); 7429 7430 return {VF, 0 /*Cost*/}; 7431 } 7432 7433 LLVM_DEBUG( 7434 dbgs() << "LV: Not vectorizing. 
Inner loops aren't supported in the " 7435 "VPlan-native path.\n"); 7436 return VectorizationFactor::Disabled(); 7437 } 7438 7439 bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const { 7440 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7441 return (NumRuntimePointerChecks > 7442 VectorizerParams::RuntimeMemoryCheckThreshold && 7443 !Hints.allowReordering()) || 7444 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7445 } 7446 7447 Optional<VectorizationFactor> 7448 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7449 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7450 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7451 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7452 return None; 7453 7454 // Invalidate interleave groups if all blocks of loop will be predicated. 7455 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7456 !useMaskedInterleavedAccesses(*TTI)) { 7457 LLVM_DEBUG( 7458 dbgs() 7459 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7460 "which requires masked-interleaved support.\n"); 7461 if (CM.InterleaveInfo.invalidateGroups()) 7462 // Invalidating interleave groups also requires invalidating all decisions 7463 // based on them, which includes widening decisions and uniform and scalar 7464 // values. 7465 CM.invalidateCostModelingDecisions(); 7466 } 7467 7468 ElementCount MaxUserVF = 7469 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7470 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7471 if (!UserVF.isZero() && UserVFIsLegal) { 7472 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7473 "VF needs to be a power of two"); 7474 // Collect the instructions (and their associated costs) that will be more 7475 // profitable to scalarize. 7476 if (CM.selectUserVectorizationFactor(UserVF)) { 7477 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7478 CM.collectInLoopReductions(); 7479 buildVPlansWithVPRecipes(UserVF, UserVF); 7480 LLVM_DEBUG(printPlans(dbgs())); 7481 return {{UserVF, 0}}; 7482 } else 7483 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7484 "InvalidCost", ORE, OrigLoop); 7485 } 7486 7487 // Populate the set of Vectorization Factor Candidates. 7488 ElementCountSet VFCandidates; 7489 for (auto VF = ElementCount::getFixed(1); 7490 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7491 VFCandidates.insert(VF); 7492 for (auto VF = ElementCount::getScalable(1); 7493 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7494 VFCandidates.insert(VF); 7495 7496 for (const auto &VF : VFCandidates) { 7497 // Collect Uniform and Scalar instructions after vectorization with VF. 7498 CM.collectUniformsAndScalars(VF); 7499 7500 // Collect the instructions (and their associated costs) that will be more 7501 // profitable to scalarize. 7502 if (VF.isVector()) 7503 CM.collectInstsToScalarize(VF); 7504 } 7505 7506 CM.collectInLoopReductions(); 7507 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7508 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7509 7510 LLVM_DEBUG(printPlans(dbgs())); 7511 if (!MaxFactors.hasVector()) 7512 return VectorizationFactor::Disabled(); 7513 7514 // Select the optimal vectorization factor. 
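// At this point VFCandidates holds every power-of-two VF up to the maxima
// computed above, e.g. (hypothetically, with MaxFactors.FixedVF == 8 and
// MaxFactors.ScalableVF == vscale x 4):
//   {1, 2, 4, 8, vscale x 1, vscale x 2, vscale x 4}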
7515 return CM.selectVectorizationFactor(VFCandidates); 7516 } 7517 7518 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7519 assert(count_if(VPlans, 7520 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7521 1 && 7522 "Best VF has not a single VPlan."); 7523 7524 for (const VPlanPtr &Plan : VPlans) { 7525 if (Plan->hasVF(VF)) 7526 return *Plan.get(); 7527 } 7528 llvm_unreachable("No plan found!"); 7529 } 7530 7531 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7532 SmallVector<Metadata *, 4> MDs; 7533 // Reserve first location for self reference to the LoopID metadata node. 7534 MDs.push_back(nullptr); 7535 bool IsUnrollMetadata = false; 7536 MDNode *LoopID = L->getLoopID(); 7537 if (LoopID) { 7538 // First find existing loop unrolling disable metadata. 7539 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7540 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7541 if (MD) { 7542 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7543 IsUnrollMetadata = 7544 S && S->getString().startswith("llvm.loop.unroll.disable"); 7545 } 7546 MDs.push_back(LoopID->getOperand(i)); 7547 } 7548 } 7549 7550 if (!IsUnrollMetadata) { 7551 // Add runtime unroll disable metadata. 7552 LLVMContext &Context = L->getHeader()->getContext(); 7553 SmallVector<Metadata *, 1> DisableOperands; 7554 DisableOperands.push_back( 7555 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7556 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7557 MDs.push_back(DisableNode); 7558 MDNode *NewLoopID = MDNode::get(Context, MDs); 7559 // Set operand 0 to refer to the loop id itself. 7560 NewLoopID->replaceOperandWith(0, NewLoopID); 7561 L->setLoopID(NewLoopID); 7562 } 7563 } 7564 7565 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7566 VPlan &BestVPlan, 7567 InnerLoopVectorizer &ILV, 7568 DominatorTree *DT) { 7569 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7570 << '\n'); 7571 7572 // Perform the actual loop transformation. 7573 7574 // 1. Set up the skeleton for vectorization, including vector pre-header and 7575 // middle block. The vector loop is created during VPlan execution. 7576 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7577 Value *CanonicalIVStartValue; 7578 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7579 ILV.createVectorizedLoopSkeleton(); 7580 ILV.collectPoisonGeneratingRecipes(State); 7581 7582 ILV.printDebugTracesAtStart(); 7583 7584 //===------------------------------------------------===// 7585 // 7586 // Notice: any optimization or new instruction that go 7587 // into the code below should also be implemented in 7588 // the cost-model. 7589 // 7590 //===------------------------------------------------===// 7591 7592 // 2. Copy and widen instructions from the old loop into the new loop. 7593 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), 7594 ILV.getOrCreateVectorTripCount(nullptr), 7595 CanonicalIVStartValue, State); 7596 BestVPlan.execute(&State); 7597 7598 // Keep all loop hints from the original loop on the vector loop (we'll 7599 // replace the vectorizer-specific hints below). 
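// In textual IR, the loop metadata manipulated here and in
// AddRuntimeUnrollDisableMetaData looks roughly like (illustrative only):
//   br i1 %exit.cond, label %loop, label %exit, !llvm.loop !0
//   !0 = distinct !{!0, !1, !2}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}
//   !2 = !{!"llvm.loop.isvectorized", i32 1}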
7600 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7601 7602 Optional<MDNode *> VectorizedLoopID = 7603 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7604 LLVMLoopVectorizeFollowupVectorized}); 7605 7606 VPBasicBlock *HeaderVPBB = 7607 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); 7608 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); 7609 if (VectorizedLoopID.hasValue()) 7610 L->setLoopID(VectorizedLoopID.getValue()); 7611 else { 7612 // Keep all loop hints from the original loop on the vector loop (we'll 7613 // replace the vectorizer-specific hints below). 7614 if (MDNode *LID = OrigLoop->getLoopID()) 7615 L->setLoopID(LID); 7616 7617 LoopVectorizeHints Hints(L, true, *ORE); 7618 Hints.setAlreadyVectorized(); 7619 } 7620 // Disable runtime unrolling when vectorizing the epilogue loop. 7621 if (CanonicalIVStartValue) 7622 AddRuntimeUnrollDisableMetaData(L); 7623 7624 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7625 // predication, updating analyses. 7626 ILV.fixVectorizedLoop(State, BestVPlan); 7627 7628 ILV.printDebugTracesAtEnd(); 7629 } 7630 7631 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7632 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7633 for (const auto &Plan : VPlans) 7634 if (PrintVPlansInDotFormat) 7635 Plan->printDOT(O); 7636 else 7637 Plan->print(O); 7638 } 7639 #endif 7640 7641 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7642 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7643 7644 // We create new control-flow for the vectorized loop, so the original exit 7645 // conditions will be dead after vectorization if it's only used by the 7646 // terminator 7647 SmallVector<BasicBlock*> ExitingBlocks; 7648 OrigLoop->getExitingBlocks(ExitingBlocks); 7649 for (auto *BB : ExitingBlocks) { 7650 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 7651 if (!Cmp || !Cmp->hasOneUse()) 7652 continue; 7653 7654 // TODO: we should introduce a getUniqueExitingBlocks on Loop 7655 if (!DeadInstructions.insert(Cmp).second) 7656 continue; 7657 7658 // The operands of the icmp is often a dead trunc, used by IndUpdate. 7659 // TODO: can recurse through operands in general 7660 for (Value *Op : Cmp->operands()) { 7661 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7662 DeadInstructions.insert(cast<Instruction>(Op)); 7663 } 7664 } 7665 7666 // We create new "steps" for induction variable updates to which the original 7667 // induction variables map. An original update instruction will be dead if 7668 // all its users except the induction variable are dead. 7669 auto *Latch = OrigLoop->getLoopLatch(); 7670 for (auto &Induction : Legal->getInductionVars()) { 7671 PHINode *Ind = Induction.first; 7672 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7673 7674 // If the tail is to be folded by masking, the primary induction variable, 7675 // if exists, isn't dead: it will be used for masking. Don't kill it. 
7676 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7677 continue; 7678 7679 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7680 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7681 })) 7682 DeadInstructions.insert(IndUpdate); 7683 } 7684 } 7685 7686 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7687 7688 //===--------------------------------------------------------------------===// 7689 // EpilogueVectorizerMainLoop 7690 //===--------------------------------------------------------------------===// 7691 7692 /// This function is partially responsible for generating the control flow 7693 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7694 std::pair<BasicBlock *, Value *> 7695 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7696 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7697 7698 // Workaround! Compute the trip count of the original loop and cache it 7699 // before we start modifying the CFG. This code has a systemic problem 7700 // wherein it tries to run analysis over partially constructed IR; this is 7701 // wrong, and not simply for SCEV. The trip count of the original loop 7702 // simply happens to be prone to hitting this in practice. In theory, we 7703 // can hit the same issue for any SCEV, or ValueTracking query done during 7704 // mutation. See PR49900. 7705 getOrCreateTripCount(OrigLoop->getLoopPreheader()); 7706 createVectorLoopSkeleton(""); 7707 7708 // Generate the code to check the minimum iteration count of the vector 7709 // epilogue (see below). 7710 EPI.EpilogueIterationCountCheck = 7711 emitIterationCountCheck(LoopScalarPreHeader, true); 7712 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7713 7714 // Generate the code to check any assumptions that we've made for SCEV 7715 // expressions. 7716 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7717 7718 // Generate the code that checks at runtime if arrays overlap. We put the 7719 // checks into a separate block to make the more common case of few elements 7720 // faster. 7721 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7722 7723 // Generate the iteration count check for the main loop, *after* the check 7724 // for the epilogue loop, so that the path-length is shorter for the case 7725 // that goes directly through the vector epilogue. The longer-path length for 7726 // the main loop is compensated for, by the gain from vectorizing the larger 7727 // trip count. Note: the branch will get updated later on when we vectorize 7728 // the epilogue. 7729 EPI.MainLoopIterationCountCheck = 7730 emitIterationCountCheck(LoopScalarPreHeader, false); 7731 7732 // Generate the induction variable. 7733 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7734 7735 // Skip induction resume value creation here because they will be created in 7736 // the second pass. If we created them here, they wouldn't be used anyway, 7737 // because the vplan in the second pass still contains the inductions from the 7738 // original loop. 
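// To summarize the first pass (roughly; see the diagram referenced above):
// a trip count too small even for the epilogue VF * UF leaves iter.check for
// the scalar loop, one too small only for the main VF * UF is sent to the
// vector epilogue once the second pass rewires that branch, and everything
// else falls through vector.main.loop.iter.check and vector.ph into the main
// vector loop.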
7739 7740 return {completeLoopSkeleton(OrigLoopID), nullptr}; 7741 } 7742 7743 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7744 LLVM_DEBUG({ 7745 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7746 << "Main Loop VF:" << EPI.MainLoopVF 7747 << ", Main Loop UF:" << EPI.MainLoopUF 7748 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7749 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7750 }); 7751 } 7752 7753 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7754 DEBUG_WITH_TYPE(VerboseDebug, { 7755 dbgs() << "intermediate fn:\n" 7756 << *OrigLoop->getHeader()->getParent() << "\n"; 7757 }); 7758 } 7759 7760 BasicBlock * 7761 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7762 bool ForEpilogue) { 7763 assert(Bypass && "Expected valid bypass basic block."); 7764 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7765 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7766 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 7767 // Reuse existing vector loop preheader for TC checks. 7768 // Note that new preheader block is generated for vector loop. 7769 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7770 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7771 7772 // Generate code to check if the loop's trip count is less than VF * UF of the 7773 // main vector loop. 7774 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 7775 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7776 7777 Value *CheckMinIters = Builder.CreateICmp( 7778 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7779 "min.iters.check"); 7780 7781 if (!ForEpilogue) 7782 TCCheckBlock->setName("vector.main.loop.iter.check"); 7783 7784 // Create new preheader for vector loop. 7785 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7786 DT, LI, nullptr, "vector.ph"); 7787 7788 if (ForEpilogue) { 7789 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7790 DT->getNode(Bypass)->getIDom()) && 7791 "TC check is expected to dominate Bypass"); 7792 7793 // Update dominator for Bypass & LoopExit. 7794 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7795 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7796 // For loops with multiple exits, there's no edge from the middle block 7797 // to exit blocks (as the epilogue must run) and thus no need to update 7798 // the immediate dominator of the exit blocks. 7799 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7800 7801 LoopBypassBlocks.push_back(TCCheckBlock); 7802 7803 // Save the trip count so we don't have to regenerate it in the 7804 // vec.epilog.iter.check. This is safe to do because the trip count 7805 // generated here dominates the vector epilog iter check. 7806 EPI.TripCount = Count; 7807 } 7808 7809 ReplaceInstWithInst( 7810 TCCheckBlock->getTerminator(), 7811 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7812 7813 return TCCheckBlock; 7814 } 7815 7816 //===--------------------------------------------------------------------===// 7817 // EpilogueVectorizerEpilogueLoop 7818 //===--------------------------------------------------------------------===// 7819 7820 /// This function is partially responsible for generating the control flow 7821 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
7822 std::pair<BasicBlock *, Value *> 7823 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7824 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7825 createVectorLoopSkeleton("vec.epilog."); 7826 7827 // Now, compare the remaining count and if there aren't enough iterations to 7828 // execute the vectorized epilogue skip to the scalar part. 7829 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7830 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7831 LoopVectorPreHeader = 7832 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7833 LI, nullptr, "vec.epilog.ph"); 7834 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, 7835 VecEpilogueIterationCountCheck); 7836 7837 // Adjust the control flow taking the state info from the main loop 7838 // vectorization into account. 7839 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7840 "expected this to be saved from the previous pass."); 7841 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7842 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7843 7844 DT->changeImmediateDominator(LoopVectorPreHeader, 7845 EPI.MainLoopIterationCountCheck); 7846 7847 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7848 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7849 7850 if (EPI.SCEVSafetyCheck) 7851 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7852 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7853 if (EPI.MemSafetyCheck) 7854 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7855 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7856 7857 DT->changeImmediateDominator( 7858 VecEpilogueIterationCountCheck, 7859 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7860 7861 DT->changeImmediateDominator(LoopScalarPreHeader, 7862 EPI.EpilogueIterationCountCheck); 7863 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7864 // If there is an epilogue which must run, there's no edge from the 7865 // middle block to exit blocks and thus no need to update the immediate 7866 // dominator of the exit blocks. 7867 DT->changeImmediateDominator(LoopExitBlock, 7868 EPI.EpilogueIterationCountCheck); 7869 7870 // Keep track of bypass blocks, as they feed start values to the induction 7871 // phis in the scalar loop preheader. 7872 if (EPI.SCEVSafetyCheck) 7873 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7874 if (EPI.MemSafetyCheck) 7875 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7876 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7877 7878 // The vec.epilog.iter.check block may contain Phi nodes from reductions which 7879 // merge control-flow from the latch block and the middle block. Update the 7880 // incoming values here and move the Phi into the preheader. 
7881 SmallVector<PHINode *, 4> PhisInBlock; 7882 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) 7883 PhisInBlock.push_back(&Phi); 7884 7885 for (PHINode *Phi : PhisInBlock) { 7886 Phi->replaceIncomingBlockWith( 7887 VecEpilogueIterationCountCheck->getSinglePredecessor(), 7888 VecEpilogueIterationCountCheck); 7889 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); 7890 if (EPI.SCEVSafetyCheck) 7891 Phi->removeIncomingValue(EPI.SCEVSafetyCheck); 7892 if (EPI.MemSafetyCheck) 7893 Phi->removeIncomingValue(EPI.MemSafetyCheck); 7894 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); 7895 } 7896 7897 // Generate a resume induction for the vector epilogue and put it in the 7898 // vector epilogue preheader 7899 Type *IdxTy = Legal->getWidestInductionType(); 7900 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 7901 LoopVectorPreHeader->getFirstNonPHI()); 7902 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 7903 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 7904 EPI.MainLoopIterationCountCheck); 7905 7906 // Generate induction resume values. These variables save the new starting 7907 // indexes for the scalar loop. They are used to test if there are any tail 7908 // iterations left once the vector loop has completed. 7909 // Note that when the vectorized epilogue is skipped due to iteration count 7910 // check, then the resume value for the induction variable comes from 7911 // the trip count of the main vector loop, hence passing the AdditionalBypass 7912 // argument. 7913 createInductionResumeValues({VecEpilogueIterationCountCheck, 7914 EPI.VectorTripCount} /* AdditionalBypass */); 7915 7916 return {completeLoopSkeleton(OrigLoopID), EPResumeVal}; 7917 } 7918 7919 BasicBlock * 7920 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 7921 BasicBlock *Bypass, BasicBlock *Insert) { 7922 7923 assert(EPI.TripCount && 7924 "Expected trip count to have been safed in the first pass."); 7925 assert( 7926 (!isa<Instruction>(EPI.TripCount) || 7927 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 7928 "saved trip count does not dominate insertion point."); 7929 Value *TC = EPI.TripCount; 7930 IRBuilder<> Builder(Insert->getTerminator()); 7931 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 7932 7933 // Generate code to check if the loop's trip count is less than VF * UF of the 7934 // vector epilogue loop. 7935 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ? 
7936 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7937 7938 Value *CheckMinIters = 7939 Builder.CreateICmp(P, Count, 7940 createStepForVF(Builder, Count->getType(), 7941 EPI.EpilogueVF, EPI.EpilogueUF), 7942 "min.epilog.iters.check"); 7943 7944 ReplaceInstWithInst( 7945 Insert->getTerminator(), 7946 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7947 7948 LoopBypassBlocks.push_back(Insert); 7949 return Insert; 7950 } 7951 7952 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7953 LLVM_DEBUG({ 7954 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 7955 << "Epilogue Loop VF:" << EPI.EpilogueVF 7956 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7957 }); 7958 } 7959 7960 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 7961 DEBUG_WITH_TYPE(VerboseDebug, { 7962 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 7963 }); 7964 } 7965 7966 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7967 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7968 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 7969 bool PredicateAtRangeStart = Predicate(Range.Start); 7970 7971 for (ElementCount TmpVF = Range.Start * 2; 7972 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 7973 if (Predicate(TmpVF) != PredicateAtRangeStart) { 7974 Range.End = TmpVF; 7975 break; 7976 } 7977 7978 return PredicateAtRangeStart; 7979 } 7980 7981 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7982 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7983 /// of VF's starting at a given VF and extending it as much as possible. Each 7984 /// vectorization decision can potentially shorten this sub-range during 7985 /// buildVPlan(). 7986 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 7987 ElementCount MaxVF) { 7988 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 7989 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 7990 VFRange SubRange = {VF, MaxVFPlusOne}; 7991 VPlans.push_back(buildVPlan(SubRange)); 7992 VF = SubRange.End; 7993 } 7994 } 7995 7996 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 7997 VPlanPtr &Plan) { 7998 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7999 8000 // Look for cached value. 8001 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8002 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8003 if (ECEntryIt != EdgeMaskCache.end()) 8004 return ECEntryIt->second; 8005 8006 VPValue *SrcMask = createBlockInMask(Src, Plan); 8007 8008 // The terminator has to be a branch inst! 8009 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8010 assert(BI && "Unexpected terminator found"); 8011 8012 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8013 return EdgeMaskCache[Edge] = SrcMask; 8014 8015 // If source is an exiting block, we know the exit edge is dynamically dead 8016 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8017 // adding uses of an otherwise potentially dead instruction. 
8018 if (OrigLoop->isLoopExiting(Src)) 8019 return EdgeMaskCache[Edge] = SrcMask; 8020 8021 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8022 assert(EdgeMask && "No Edge Mask found for condition"); 8023 8024 if (BI->getSuccessor(0) != Dst) 8025 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8026 8027 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8028 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8029 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8030 // The select version does not introduce new UB if SrcMask is false and 8031 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8032 VPValue *False = Plan->getOrAddVPValue( 8033 ConstantInt::getFalse(BI->getCondition()->getType())); 8034 EdgeMask = 8035 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8036 } 8037 8038 return EdgeMaskCache[Edge] = EdgeMask; 8039 } 8040 8041 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8042 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8043 8044 // Look for cached value. 8045 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8046 if (BCEntryIt != BlockMaskCache.end()) 8047 return BCEntryIt->second; 8048 8049 // All-one mask is modelled as no-mask following the convention for masked 8050 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8051 VPValue *BlockMask = nullptr; 8052 8053 if (OrigLoop->getHeader() == BB) { 8054 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8055 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8056 8057 // Introduce the early-exit compare IV <= BTC to form header block mask. 8058 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8059 // constructing the desired canonical IV in the header block as its first 8060 // non-phi instructions. 8061 assert(CM.foldTailByMasking() && "must fold the tail"); 8062 VPBasicBlock *HeaderVPBB = 8063 Plan->getVectorLoopRegion()->getEntryBasicBlock(); 8064 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8065 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); 8066 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); 8067 8068 VPBuilder::InsertPointGuard Guard(Builder); 8069 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8070 if (CM.TTI.emitGetActiveLaneMask()) { 8071 VPValue *TC = Plan->getOrCreateTripCount(); 8072 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); 8073 } else { 8074 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8075 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8076 } 8077 return BlockMaskCache[BB] = BlockMask; 8078 } 8079 8080 // This is the block mask. We OR all incoming edges. 8081 for (auto *Predecessor : predecessors(BB)) { 8082 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8083 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8084 return BlockMaskCache[BB] = EdgeMask; 8085 8086 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8087 BlockMask = EdgeMask; 8088 continue; 8089 } 8090 8091 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8092 } 8093 8094 return BlockMaskCache[BB] = BlockMask; 8095 } 8096 8097 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8098 ArrayRef<VPValue *> Operands, 8099 VFRange &Range, 8100 VPlanPtr &Plan) { 8101 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8102 "Must be called with either a load or store"); 8103 8104 auto willWiden = [&](ElementCount VF) -> bool { 8105 LoopVectorizationCostModel::InstWidening Decision = 8106 CM.getWideningDecision(I, VF); 8107 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8108 "CM decision should be taken at this point."); 8109 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8110 return true; 8111 if (CM.isScalarAfterVectorization(I, VF) || 8112 CM.isProfitableToScalarize(I, VF)) 8113 return false; 8114 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8115 }; 8116 8117 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8118 return nullptr; 8119 8120 VPValue *Mask = nullptr; 8121 if (Legal->isMaskRequired(I)) 8122 Mask = createBlockInMask(I->getParent(), Plan); 8123 8124 // Determine if the pointer operand of the access is either consecutive or 8125 // reverse consecutive. 8126 LoopVectorizationCostModel::InstWidening Decision = 8127 CM.getWideningDecision(I, Range.Start); 8128 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8129 bool Consecutive = 8130 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8131 8132 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8133 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8134 Consecutive, Reverse); 8135 8136 StoreInst *Store = cast<StoreInst>(I); 8137 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8138 Mask, Consecutive, Reverse); 8139 } 8140 8141 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also 8142 /// insert a recipe to expand the step for the induction recipe. 8143 static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes( 8144 PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, 8145 const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM, 8146 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) { 8147 // Returns true if an instruction \p I should be scalarized instead of 8148 // vectorized for the chosen vectorization factor. 
8149 auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { 8150 return CM.isScalarAfterVectorization(I, VF) || 8151 CM.isProfitableToScalarize(I, VF); 8152 }; 8153 8154 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( 8155 [&](ElementCount VF) { 8156 return ShouldScalarizeInstruction(PhiOrTrunc, VF); 8157 }, 8158 Range); 8159 assert(IndDesc.getStartValue() == 8160 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8161 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && 8162 "step must be loop invariant"); 8163 8164 VPValue *Step = 8165 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); 8166 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8167 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI, 8168 !NeedsScalarIVOnly); 8169 } 8170 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8171 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, 8172 !NeedsScalarIVOnly); 8173 } 8174 8175 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( 8176 PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) { 8177 8178 // Check if this is an integer or fp induction. If so, build the recipe that 8179 // produces its scalar and vector values. 8180 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8181 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan, 8182 *PSE.getSE(), *OrigLoop, Range); 8183 8184 // Check if this is pointer induction. If so, build the recipe for it. 8185 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) 8186 return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II, 8187 *PSE.getSE()); 8188 return nullptr; 8189 } 8190 8191 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8192 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) { 8193 // Optimize the special case where the source is a constant integer 8194 // induction variable. Notice that we can only optimize the 'trunc' case 8195 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8196 // (c) other casts depend on pointer size. 8197 8198 // Determine whether \p K is a truncation based on an induction variable that 8199 // can be optimized. 8200 auto isOptimizableIVTruncate = 8201 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8202 return [=](ElementCount VF) -> bool { 8203 return CM.isOptimizableIVTruncate(K, VF); 8204 }; 8205 }; 8206 8207 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8208 isOptimizableIVTruncate(I), Range)) { 8209 8210 auto *Phi = cast<PHINode>(I->getOperand(0)); 8211 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8212 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8213 return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan, 8214 *PSE.getSE(), *OrigLoop, Range); 8215 } 8216 return nullptr; 8217 } 8218 8219 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8220 ArrayRef<VPValue *> Operands, 8221 VPlanPtr &Plan) { 8222 // If all incoming values are equal, the incoming VPValue can be used directly 8223 // instead of creating a new VPBlendRecipe. 8224 VPValue *FirstIncoming = Operands[0]; 8225 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8226 return FirstIncoming == Inc; 8227 })) { 8228 return Operands[0]; 8229 } 8230 8231 unsigned NumIncoming = Phi->getNumIncomingValues(); 8232 // For in-loop reductions, we do not need to create an additional select. 
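// Conceptually (hypothetical IR), each select emitted for a blend below has
// the form
//   %blend = select <VF x i1> %edge.mask, <VF x T> %incoming.1, <VF x T> %blend.prev
// whereas a phi feeding an in-loop reduction keeps using the reduction value
// directly, which is what the scan below detects.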
  VPValue *InLoopVal = nullptr;
  for (unsigned In = 0; In < NumIncoming; In++) {
    PHINode *PhiOp =
        dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
    if (PhiOp && CM.isInLoopReduction(PhiOp)) {
      assert(!InLoopVal && "Found more than one in-loop reduction!");
      InLoopVal = Operands[In];
    }
  }

  assert((!InLoopVal || NumIncoming == 2) &&
         "Found an in-loop reduction for PHI with unexpected number of "
         "incoming values");
  if (InLoopVal)
    return Operands[Operands[0] == InLoopVal ? 1 : 0];

  // We know that all PHIs in non-header blocks are converted into selects, so
  // we don't have to worry about the insertion order and we can just use the
  // builder. At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.
  SmallVector<VPValue *, 2> OperandsWithMask;

  for (unsigned In = 0; In < NumIncoming; In++) {
    VPValue *EdgeMask =
        createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
    assert((EdgeMask || NumIncoming == 1) &&
           "Multiple predecessors with one having a full mask");
    OperandsWithMask.push_back(Operands[In]);
    if (EdgeMask)
      OperandsWithMask.push_back(EdgeMask);
  }
  return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
}

VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
                                                   ArrayRef<VPValue *> Operands,
                                                   VFRange &Range) const {

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [this, CI](ElementCount VF) {
        return CM.isScalarWithPredication(CI, VF);
      },
      Range);

  if (IsPredicated)
    return nullptr;

  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
             ID == Intrinsic::pseudoprobe ||
             ID == Intrinsic::experimental_noalias_scope_decl))
    return nullptr;

  auto willWiden = [&](ElementCount VF) -> bool {
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // Decide whether the vectorized version of the instruction should use a
    // vector intrinsic or a vector library call: prefer the intrinsic when it
    // is no more expensive than the call. Widening is possible only if at
    // least one of the two forms avoids scalarization.
    bool NeedToScalarize = false;
    InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
    InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
    bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
    return UseVectorIntrinsic || !NeedToScalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
  return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
}

bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
  assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
         !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // Instruction should be widened, unless it is scalar after vectorization,
  // scalarization is profitable or it is predicated.
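  // For example, if the cost model scalarizes I for VF=2 and VF=4 but widens
  // it for VF=8, a range [2, 16) is clamped to [2, 8) and this returns false;
  // the remaining VFs are reconsidered when the next sub-range (and its VPlan)
  // is built.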
8313 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8314 return CM.isScalarAfterVectorization(I, VF) || 8315 CM.isProfitableToScalarize(I, VF) || 8316 CM.isScalarWithPredication(I, VF); 8317 }; 8318 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8319 Range); 8320 } 8321 8322 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8323 ArrayRef<VPValue *> Operands) const { 8324 auto IsVectorizableOpcode = [](unsigned Opcode) { 8325 switch (Opcode) { 8326 case Instruction::Add: 8327 case Instruction::And: 8328 case Instruction::AShr: 8329 case Instruction::BitCast: 8330 case Instruction::FAdd: 8331 case Instruction::FCmp: 8332 case Instruction::FDiv: 8333 case Instruction::FMul: 8334 case Instruction::FNeg: 8335 case Instruction::FPExt: 8336 case Instruction::FPToSI: 8337 case Instruction::FPToUI: 8338 case Instruction::FPTrunc: 8339 case Instruction::FRem: 8340 case Instruction::FSub: 8341 case Instruction::ICmp: 8342 case Instruction::IntToPtr: 8343 case Instruction::LShr: 8344 case Instruction::Mul: 8345 case Instruction::Or: 8346 case Instruction::PtrToInt: 8347 case Instruction::SDiv: 8348 case Instruction::Select: 8349 case Instruction::SExt: 8350 case Instruction::Shl: 8351 case Instruction::SIToFP: 8352 case Instruction::SRem: 8353 case Instruction::Sub: 8354 case Instruction::Trunc: 8355 case Instruction::UDiv: 8356 case Instruction::UIToFP: 8357 case Instruction::URem: 8358 case Instruction::Xor: 8359 case Instruction::ZExt: 8360 case Instruction::Freeze: 8361 return true; 8362 } 8363 return false; 8364 }; 8365 8366 if (!IsVectorizableOpcode(I->getOpcode())) 8367 return nullptr; 8368 8369 // Success: widen this instruction. 8370 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8371 } 8372 8373 void VPRecipeBuilder::fixHeaderPhis() { 8374 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8375 for (VPHeaderPHIRecipe *R : PhisToFix) { 8376 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8377 VPRecipeBase *IncR = 8378 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8379 R->addOperand(IncR->getVPSingleValue()); 8380 } 8381 } 8382 8383 VPBasicBlock *VPRecipeBuilder::handleReplication( 8384 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8385 VPlanPtr &Plan) { 8386 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8387 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8388 Range); 8389 8390 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8391 [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, 8392 Range); 8393 8394 // Even if the instruction is not marked as uniform, there are certain 8395 // intrinsic calls that can be effectively treated as such, so we check for 8396 // them here. Conservatively, we only do this for scalable vectors, since 8397 // for fixed-width VFs we can always fall back on full scalarization. 8398 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8399 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8400 case Intrinsic::assume: 8401 case Intrinsic::lifetime_start: 8402 case Intrinsic::lifetime_end: 8403 // For scalable vectors if one of the operands is variant then we still 8404 // want to mark as uniform, which will generate one instruction for just 8405 // the first lane of the vector. We can't scalarize the call in the same 8406 // way as for fixed-width vectors because we don't know how many lanes 8407 // there are. 
      //
      // The reasons for doing it this way for scalable vectors are:
      //  1. For the assume intrinsic generating the instruction for the first
      //     lane is still better than not generating any at all. For
      //     example, the input may be a splat across all lanes.
      //  2. For the lifetime start/end intrinsics the pointer operand only
      //     does anything useful when the input comes from a stack object,
      //     which suggests it should always be uniform. For non-stack objects
      //     the effect is to poison the object, which still allows us to
      //     remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }

  auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                       IsUniform, IsPredicated);
  setRecipe(I, Recipe);
  Plan->addVPValue(I, Recipe);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (VPValue *Op : Recipe->operands()) {
    auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
    if (!PredR)
      continue;
    auto *RepR =
        cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
    assert(RepR->isPredicated() &&
           "expected Replicate recipe to be predicated");
    RepR->setAlsoPack(false);
  }

  // Finalize the recipe for Instr, first if it is not predicated.
  if (!IsPredicated) {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
    VPBB->appendRecipe(Recipe);
    return VPBB;
  }
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");

  VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
  assert(SingleSucc && "VPBB must have a single successor when handling "
                       "predicated replication.");
  VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
  // Record predicated instructions for above packing optimizations.
  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
  VPBlockUtils::insertBlockAfter(Region, VPBB);
  auto *RegSucc = new VPBasicBlock();
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
  VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
  return RegSucc;
}

VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects.

  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  auto *PHIRecipe = Instr->getType()->isVoidTy()
                        ?
nullptr 8481 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8482 if (PHIRecipe) { 8483 Plan->removeVPValueFor(Instr); 8484 Plan->addVPValue(Instr, PHIRecipe); 8485 } 8486 auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8487 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8488 VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true); 8489 8490 // Note: first set Entry as region entry and then connect successors starting 8491 // from it in order, to propagate the "parent" of each VPBasicBlock. 8492 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry); 8493 VPBlockUtils::connectBlocks(Pred, Exiting); 8494 8495 return Region; 8496 } 8497 8498 VPRecipeOrVPValueTy 8499 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8500 ArrayRef<VPValue *> Operands, 8501 VFRange &Range, VPlanPtr &Plan) { 8502 // First, check for specific widening recipes that deal with inductions, Phi 8503 // nodes, calls and memory operations. 8504 VPRecipeBase *Recipe; 8505 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8506 if (Phi->getParent() != OrigLoop->getHeader()) 8507 return tryToBlend(Phi, Operands, Plan); 8508 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) 8509 return toVPRecipeResult(Recipe); 8510 8511 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8512 assert((Legal->isReductionVariable(Phi) || 8513 Legal->isFirstOrderRecurrence(Phi)) && 8514 "can only widen reductions and first-order recurrences here"); 8515 VPValue *StartV = Operands[0]; 8516 if (Legal->isReductionVariable(Phi)) { 8517 const RecurrenceDescriptor &RdxDesc = 8518 Legal->getReductionVars().find(Phi)->second; 8519 assert(RdxDesc.getRecurrenceStartValue() == 8520 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8521 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8522 CM.isInLoopReduction(Phi), 8523 CM.useOrderedReductions(RdxDesc)); 8524 } else { 8525 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8526 } 8527 8528 // Record the incoming value from the backedge, so we can add the incoming 8529 // value from the backedge after all recipes have been created. 8530 recordRecipeOf(cast<Instruction>( 8531 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8532 PhisToFix.push_back(PhiRecipe); 8533 return toVPRecipeResult(PhiRecipe); 8534 } 8535 8536 if (isa<TruncInst>(Instr) && 8537 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8538 Range, *Plan))) 8539 return toVPRecipeResult(Recipe); 8540 8541 // All widen recipes below deal only with VF > 1. 
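  // For example, a range starting at VF=1 is clamped here to [1, 2), so the
  // scalar VF gets a VPlan whose instructions are handled by replication
  // below, while VF >= 2 is covered by the VPlan built for the following
  // sub-range.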
  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          [&](ElementCount VF) { return VF.isScalar(); }, Range))
    return nullptr;

  if (auto *CI = dyn_cast<CallInst>(Instr))
    return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return toVPRecipeResult(new VPWidenGEPRecipe(
        GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return toVPRecipeResult(new VPWidenSelectRecipe(
        *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
  }

  return toVPRecipeResult(tryToWiden(Instr, Operands));
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  // Cannot sink instructions after dead instructions (there won't be any
  // recipes for them). Instead, find the first non-dead previous instruction.
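  // For example, if a recorded sink target is a dead induction update that
  // will not get a recipe, step back to the closest preceding live instruction
  // in the same block and sink after that one instead.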
8596 for (auto &P : Legal->getSinkAfter()) { 8597 Instruction *SinkTarget = P.second; 8598 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 8599 (void)FirstInst; 8600 while (DeadInstructions.contains(SinkTarget)) { 8601 assert( 8602 SinkTarget != FirstInst && 8603 "Must find a live instruction (at least the one feeding the " 8604 "first-order recurrence PHI) before reaching beginning of the block"); 8605 SinkTarget = SinkTarget->getPrevNode(); 8606 assert(SinkTarget != P.first && 8607 "sink source equals target, no sinking required"); 8608 } 8609 P.second = SinkTarget; 8610 } 8611 8612 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8613 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8614 VFRange SubRange = {VF, MaxVFPlusOne}; 8615 VPlans.push_back( 8616 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8617 VF = SubRange.End; 8618 } 8619 } 8620 8621 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a 8622 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a 8623 // BranchOnCount VPInstruction to the latch. 8624 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, 8625 bool HasNUW) { 8626 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8627 auto *StartV = Plan.getOrAddVPValue(StartIdx); 8628 8629 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8630 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8631 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8632 Header->insert(CanonicalIVPHI, Header->begin()); 8633 8634 auto *CanonicalIVIncrement = 8635 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW 8636 : VPInstruction::CanonicalIVIncrement, 8637 {CanonicalIVPHI}, DL); 8638 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8639 8640 VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); 8641 EB->appendRecipe(CanonicalIVIncrement); 8642 8643 auto *BranchOnCount = 8644 new VPInstruction(VPInstruction::BranchOnCount, 8645 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8646 EB->appendRecipe(BranchOnCount); 8647 } 8648 8649 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the 8650 // original exit block. 8651 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, 8652 VPBasicBlock *MiddleVPBB, Loop *OrigLoop, 8653 VPlan &Plan) { 8654 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); 8655 BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); 8656 // Only handle single-exit loops with unique exit blocks for now. 8657 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) 8658 return; 8659 8660 // Introduce VPUsers modeling the exit values. 8661 for (PHINode &ExitPhi : ExitBB->phis()) { 8662 Value *IncomingValue = 8663 ExitPhi.getIncomingValueForBlock(ExitingBB); 8664 VPValue *V = Plan.getOrAddVPValue(IncomingValue, true); 8665 Plan.addLiveOut(&ExitPhi, V); 8666 } 8667 } 8668 8669 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8670 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8671 const MapVector<Instruction *, Instruction *> &SinkAfter) { 8672 8673 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8674 8675 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8676 8677 // --------------------------------------------------------------------------- 8678 // Pre-construction: record ingredients whose recipes we'll need to further 8679 // process after constructing the initial VPlan. 
8680 // --------------------------------------------------------------------------- 8681 8682 // Mark instructions we'll need to sink later and their targets as 8683 // ingredients whose recipe we'll need to record. 8684 for (auto &Entry : SinkAfter) { 8685 RecipeBuilder.recordRecipeOf(Entry.first); 8686 RecipeBuilder.recordRecipeOf(Entry.second); 8687 } 8688 for (auto &Reduction : CM.getInLoopReductionChains()) { 8689 PHINode *Phi = Reduction.first; 8690 RecurKind Kind = 8691 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); 8692 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8693 8694 RecipeBuilder.recordRecipeOf(Phi); 8695 for (auto &R : ReductionOperations) { 8696 RecipeBuilder.recordRecipeOf(R); 8697 // For min/max reductions, where we have a pair of icmp/select, we also 8698 // need to record the ICmp recipe, so it can be removed later. 8699 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 8700 "Only min/max recurrences allowed for inloop reductions"); 8701 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 8702 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8703 } 8704 } 8705 8706 // For each interleave group which is relevant for this (possibly trimmed) 8707 // Range, add it to the set of groups to be later applied to the VPlan and add 8708 // placeholders for its members' Recipes which we'll be replacing with a 8709 // single VPInterleaveRecipe. 8710 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8711 auto applyIG = [IG, this](ElementCount VF) -> bool { 8712 return (VF.isVector() && // Query is illegal for VF == 1 8713 CM.getWideningDecision(IG->getInsertPos(), VF) == 8714 LoopVectorizationCostModel::CM_Interleave); 8715 }; 8716 if (!getDecisionAndClampRange(applyIG, Range)) 8717 continue; 8718 InterleaveGroups.insert(IG); 8719 for (unsigned i = 0; i < IG->getFactor(); i++) 8720 if (Instruction *Member = IG->getMember(i)) 8721 RecipeBuilder.recordRecipeOf(Member); 8722 }; 8723 8724 // --------------------------------------------------------------------------- 8725 // Build initial VPlan: Scan the body of the loop in a topological order to 8726 // visit each basic block after having visited its predecessor basic blocks. 8727 // --------------------------------------------------------------------------- 8728 8729 // Create initial VPlan skeleton, starting with a block for the pre-header, 8730 // followed by a region for the vector loop, followed by the middle block. The 8731 // skeleton vector loop region contains a header and latch block. 8732 VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); 8733 auto Plan = std::make_unique<VPlan>(Preheader); 8734 8735 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); 8736 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 8737 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 8738 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); 8739 VPBlockUtils::insertBlockAfter(TopRegion, Preheader); 8740 VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); 8741 VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); 8742 8743 Instruction *DLInst = 8744 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 8745 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), 8746 DLInst ? 
DLInst->getDebugLoc() : DebugLoc(), 8747 !CM.foldTailByMasking()); 8748 8749 // Scan the body of the loop in a topological order to visit each basic block 8750 // after having visited its predecessor basic blocks. 8751 LoopBlocksDFS DFS(OrigLoop); 8752 DFS.perform(LI); 8753 8754 VPBasicBlock *VPBB = HeaderVPBB; 8755 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 8756 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8757 // Relevant instructions from basic block BB will be grouped into VPRecipe 8758 // ingredients and fill a new VPBasicBlock. 8759 unsigned VPBBsForBB = 0; 8760 if (VPBB != HeaderVPBB) 8761 VPBB->setName(BB->getName()); 8762 Builder.setInsertPoint(VPBB); 8763 8764 // Introduce each ingredient into VPlan. 8765 // TODO: Model and preserve debug intrinsics in VPlan. 8766 for (Instruction &I : BB->instructionsWithoutDebug()) { 8767 Instruction *Instr = &I; 8768 8769 // First filter out irrelevant instructions, to ensure no recipes are 8770 // built for them. 8771 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 8772 continue; 8773 8774 SmallVector<VPValue *, 4> Operands; 8775 auto *Phi = dyn_cast<PHINode>(Instr); 8776 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 8777 Operands.push_back(Plan->getOrAddVPValue( 8778 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 8779 } else { 8780 auto OpRange = Plan->mapToVPValues(Instr->operands()); 8781 Operands = {OpRange.begin(), OpRange.end()}; 8782 } 8783 8784 // Invariant stores inside loop will be deleted and a single store 8785 // with the final reduction value will be added to the exit block 8786 StoreInst *SI; 8787 if ((SI = dyn_cast<StoreInst>(&I)) && 8788 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 8789 continue; 8790 8791 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 8792 Instr, Operands, Range, Plan)) { 8793 // If Instr can be simplified to an existing VPValue, use it. 8794 if (RecipeOrValue.is<VPValue *>()) { 8795 auto *VPV = RecipeOrValue.get<VPValue *>(); 8796 Plan->addVPValue(Instr, VPV); 8797 // If the re-used value is a recipe, register the recipe for the 8798 // instruction, in case the recipe for Instr needs to be recorded. 8799 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 8800 RecipeBuilder.setRecipe(Instr, R); 8801 continue; 8802 } 8803 // Otherwise, add the new recipe. 8804 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 8805 for (auto *Def : Recipe->definedValues()) { 8806 auto *UV = Def->getUnderlyingValue(); 8807 Plan->addVPValue(UV, Def); 8808 } 8809 8810 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 8811 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 8812 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 8813 // of the header block. That can happen for truncates of induction 8814 // variables. Those recipes are moved to the phi section of the header 8815 // block after applying SinkAfter, which relies on the original 8816 // position of the trunc. 8817 assert(isa<TruncInst>(Instr)); 8818 InductionsToMove.push_back( 8819 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 8820 } 8821 RecipeBuilder.setRecipe(Instr, Recipe); 8822 VPBB->appendRecipe(Recipe); 8823 continue; 8824 } 8825 8826 // Otherwise, if all widening options failed, Instruction is to be 8827 // replicated. This may create a successor for VPBB. 
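      // If the replicated instruction ends up predicated, handleReplication
      // wraps its recipe in a triangular "pred.<opcode>" region
      // (.entry/.if/.continue blocks, see createReplicateRegion) and returns
      // the fresh block following that region, which becomes the new VPBB.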
8828 VPBasicBlock *NextVPBB = 8829 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 8830 if (NextVPBB != VPBB) { 8831 VPBB = NextVPBB; 8832 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 8833 : ""); 8834 } 8835 } 8836 8837 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 8838 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 8839 } 8840 8841 HeaderVPBB->setName("vector.body"); 8842 8843 // Fold the last, empty block into its predecessor. 8844 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 8845 assert(VPBB && "expected to fold last (empty) block"); 8846 // After here, VPBB should not be used. 8847 VPBB = nullptr; 8848 8849 addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); 8850 8851 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 8852 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 8853 "entry block must be set to a VPRegionBlock having a non-empty entry " 8854 "VPBasicBlock"); 8855 RecipeBuilder.fixHeaderPhis(); 8856 8857 // --------------------------------------------------------------------------- 8858 // Transform initial VPlan: Apply previously taken decisions, in order, to 8859 // bring the VPlan to its final state. 8860 // --------------------------------------------------------------------------- 8861 8862 // Apply Sink-After legal constraints. 8863 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 8864 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 8865 if (Region && Region->isReplicator()) { 8866 assert(Region->getNumSuccessors() == 1 && 8867 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 8868 assert(R->getParent()->size() == 1 && 8869 "A recipe in an original replicator region must be the only " 8870 "recipe in its block"); 8871 return Region; 8872 } 8873 return nullptr; 8874 }; 8875 for (auto &Entry : SinkAfter) { 8876 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 8877 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 8878 8879 auto *TargetRegion = GetReplicateRegion(Target); 8880 auto *SinkRegion = GetReplicateRegion(Sink); 8881 if (!SinkRegion) { 8882 // If the sink source is not a replicate region, sink the recipe directly. 8883 if (TargetRegion) { 8884 // The target is in a replication region, make sure to move Sink to 8885 // the block after it, not into the replication region itself. 8886 VPBasicBlock *NextBlock = 8887 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 8888 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8889 } else 8890 Sink->moveAfter(Target); 8891 continue; 8892 } 8893 8894 // The sink source is in a replicate region. Unhook the region from the CFG. 8895 auto *SinkPred = SinkRegion->getSinglePredecessor(); 8896 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 8897 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 8898 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 8899 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 8900 8901 if (TargetRegion) { 8902 // The target recipe is also in a replicate region, move the sink region 8903 // after the target region. 
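      // Sketch of the rewiring below (SinkRegion has already been unhooked
      // from its old position above):
      //   before: TargetRegion -> TargetSucc
      //   after:  TargetRegion -> SinkRegion -> TargetSucc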
8904 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 8905 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 8906 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 8907 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 8908 } else { 8909 // The sink source is in a replicate region, we need to move the whole 8910 // replicate region, which should only contain a single recipe in the 8911 // main block. 8912 auto *SplitBlock = 8913 Target->getParent()->splitAt(std::next(Target->getIterator())); 8914 8915 auto *SplitPred = SplitBlock->getSinglePredecessor(); 8916 8917 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 8918 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 8919 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 8920 } 8921 } 8922 8923 VPlanTransforms::removeRedundantCanonicalIVs(*Plan); 8924 VPlanTransforms::removeRedundantInductionCasts(*Plan); 8925 8926 // Now that sink-after is done, move induction recipes for optimized truncates 8927 // to the phi section of the header block. 8928 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 8929 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 8930 8931 // Adjust the recipes for any inloop reductions. 8932 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan, 8933 RecipeBuilder, Range.Start); 8934 8935 // Introduce a recipe to combine the incoming and previous values of a 8936 // first-order recurrence. 8937 for (VPRecipeBase &R : 8938 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 8939 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 8940 if (!RecurPhi) 8941 continue; 8942 8943 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 8944 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 8945 auto *Region = GetReplicateRegion(PrevRecipe); 8946 if (Region) 8947 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 8948 if (Region || PrevRecipe->isPhi()) 8949 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 8950 else 8951 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 8952 8953 auto *RecurSplice = cast<VPInstruction>( 8954 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 8955 {RecurPhi, RecurPhi->getBackedgeValue()})); 8956 8957 RecurPhi->replaceAllUsesWith(RecurSplice); 8958 // Set the first operand of RecurSplice to RecurPhi again, after replacing 8959 // all users. 8960 RecurSplice->setOperand(0, RecurPhi); 8961 } 8962 8963 // Interleave memory: for each Interleave Group we marked earlier as relevant 8964 // for this VPlan, replace the Recipes widening its memory instructions with a 8965 // single VPInterleaveRecipe at its insertion point. 
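  // For example, an interleave group of factor 2 covering loads of A[2*i] and
  // A[2*i+1] gets a single VPInterleaveRecipe; when executed it emits one wide
  // load per part and shuffles out the two member vectors (see
  // InnerLoopVectorizer::vectorizeInterleaveGroup).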
8966 for (auto IG : InterleaveGroups) { 8967 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8968 RecipeBuilder.getRecipe(IG->getInsertPos())); 8969 SmallVector<VPValue *, 4> StoredValues; 8970 for (unsigned i = 0; i < IG->getFactor(); ++i) 8971 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 8972 auto *StoreR = 8973 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 8974 StoredValues.push_back(StoreR->getStoredValue()); 8975 } 8976 8977 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8978 Recipe->getMask()); 8979 VPIG->insertBefore(Recipe); 8980 unsigned J = 0; 8981 for (unsigned i = 0; i < IG->getFactor(); ++i) 8982 if (Instruction *Member = IG->getMember(i)) { 8983 if (!Member->getType()->isVoidTy()) { 8984 VPValue *OriginalV = Plan->getVPValue(Member); 8985 Plan->removeVPValueFor(Member); 8986 Plan->addVPValue(Member, VPIG->getVPValue(J)); 8987 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8988 J++; 8989 } 8990 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8991 } 8992 } 8993 8994 // From this point onwards, VPlan-to-VPlan transformations may change the plan 8995 // in ways that accessing values using original IR values is incorrect. 8996 Plan->disableValue2VPValue(); 8997 8998 VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); 8999 VPlanTransforms::sinkScalarOperands(*Plan); 9000 VPlanTransforms::mergeReplicateRegions(*Plan); 9001 VPlanTransforms::removeDeadRecipes(*Plan); 9002 VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); 9003 9004 std::string PlanName; 9005 raw_string_ostream RSO(PlanName); 9006 ElementCount VF = Range.Start; 9007 Plan->addVF(VF); 9008 RSO << "Initial VPlan for VF={" << VF; 9009 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9010 Plan->addVF(VF); 9011 RSO << "," << VF; 9012 } 9013 RSO << "},UF>=1"; 9014 RSO.flush(); 9015 Plan->setName(PlanName); 9016 9017 // Fold Exit block into its predecessor if possible. 9018 // TODO: Fold block earlier once all VPlan transforms properly maintain a 9019 // VPBasicBlock as exit. 9020 VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting()); 9021 9022 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 9023 return Plan; 9024 } 9025 9026 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9027 // Outer loop handling: They may require CFG and instruction level 9028 // transformations before even evaluating whether vectorization is profitable. 9029 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9030 // the vectorization pipeline. 9031 assert(!OrigLoop->isInnermost()); 9032 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9033 9034 // Create new empty VPlan 9035 auto Plan = std::make_unique<VPlan>(); 9036 9037 // Build hierarchical CFG 9038 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9039 HCFGBuilder.buildHierarchicalCFG(); 9040 9041 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9042 VF *= 2) 9043 Plan->addVF(VF); 9044 9045 SmallPtrSet<Instruction *, 1> DeadInstructions; 9046 VPlanTransforms::VPInstructionsToVPRecipes( 9047 OrigLoop, Plan, 9048 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 9049 DeadInstructions, *PSE.getSE()); 9050 9051 // Remove the existing terminator of the exiting block of the top-most region. 9052 // A BranchOnCount will be added instead when adding the canonical IV recipes. 
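  // addCanonicalIVRecipes (called below) then gives the region a canonical IV
  // phi starting at 0 in its header, appends an increment by VF * UF to the
  // exiting block, and terminates it with a BranchOnCount against the vector
  // trip count.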
  auto *Term =
      Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
  Term->eraseFromParent();

  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
                        true);
  return Plan;
}

// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi needs to be
// converted to reductions, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and live-out recipes when folding the tail.
void LoopVectorizationPlanner::adjustRecipesForReductions(
    VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
    ElementCount MinVF) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    const RecurrenceDescriptor &RdxDesc =
        Legal->getReductionVars().find(Phi)->second;
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
      continue;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
             "Only min/max recurrences allowed for inloop reductions");
      // Recognize a call to the llvm.fmuladd intrinsic.
      bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
      assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
             "Expected instruction to be a call to the llvm.fmuladd intrinsic");
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSelectSC");
        FirstOpId = 1;
      } else {
        assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
                (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
               "Expected to replace a VPWidenSC");
        FirstOpId = 0;
      }
      unsigned VecOpId =
          R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
      VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));

      auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
                         ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
                         : nullptr;

      if (IsFMulAdd) {
        // If the instruction is a call to the llvm.fmuladd intrinsic then we
        // need to create an fmul recipe to use as the vector operand for the
        // fadd reduction.
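        // That is, an in-loop reduction "acc = llvm.fmuladd(a, b, acc)" is
        // split here into a widened fmul of a and b, which then becomes the
        // vector operand of the fadd VPReductionRecipe created below.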
9118 VPInstruction *FMulRecipe = new VPInstruction( 9119 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9120 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9121 WidenRecipe->getParent()->insert(FMulRecipe, 9122 WidenRecipe->getIterator()); 9123 VecOp = FMulRecipe; 9124 } 9125 VPReductionRecipe *RedRecipe = 9126 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9127 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9128 Plan->removeVPValueFor(R); 9129 Plan->addVPValue(R, RedRecipe); 9130 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9131 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9132 WidenRecipe->eraseFromParent(); 9133 9134 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9135 VPRecipeBase *CompareRecipe = 9136 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9137 assert(isa<VPWidenRecipe>(CompareRecipe) && 9138 "Expected to replace a VPWidenSC"); 9139 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9140 "Expected no remaining users"); 9141 CompareRecipe->eraseFromParent(); 9142 } 9143 Chain = R; 9144 } 9145 } 9146 9147 // If tail is folded by masking, introduce selects between the phi 9148 // and the live-out instruction of each reduction, at the beginning of the 9149 // dedicated latch block. 9150 if (CM.foldTailByMasking()) { 9151 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9152 for (VPRecipeBase &R : 9153 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9154 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9155 if (!PhiR || PhiR->isInLoop()) 9156 continue; 9157 VPValue *Cond = 9158 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9159 VPValue *Red = PhiR->getBackedgeValue(); 9160 assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB && 9161 "reduction recipe must be defined before latch"); 9162 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9163 } 9164 } 9165 } 9166 9167 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9168 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9169 VPSlotTracker &SlotTracker) const { 9170 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9171 IG->getInsertPos()->printAsOperand(O, false); 9172 O << ", "; 9173 getAddr()->printAsOperand(O, SlotTracker); 9174 VPValue *Mask = getMask(); 9175 if (Mask) { 9176 O << ", "; 9177 Mask->printAsOperand(O, SlotTracker); 9178 } 9179 9180 unsigned OpIdx = 0; 9181 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9182 if (!IG->getMember(i)) 9183 continue; 9184 if (getNumStoreOperands() > 0) { 9185 O << "\n" << Indent << " store "; 9186 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9187 O << " to index " << i; 9188 } else { 9189 O << "\n" << Indent << " "; 9190 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9191 O << " = load from index " << i; 9192 } 9193 ++OpIdx; 9194 } 9195 } 9196 #endif 9197 9198 void VPWidenCallRecipe::execute(VPTransformState &State) { 9199 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9200 *this, State); 9201 } 9202 9203 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9204 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9205 State.ILV->setDebugLocFromInst(&I); 9206 9207 // The condition can be loop invariant but still defined inside the 9208 // loop. This means that we can't just use the original 'cond' value. 9209 // We have to take the 'vectorized' value and pick the first lane. 
9210 // Instcombine will make this a no-op. 9211 auto *InvarCond = 9212 InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; 9213 9214 for (unsigned Part = 0; Part < State.UF; ++Part) { 9215 Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part); 9216 Value *Op0 = State.get(getOperand(1), Part); 9217 Value *Op1 = State.get(getOperand(2), Part); 9218 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 9219 State.set(this, Sel, Part); 9220 State.ILV->addMetadata(Sel, &I); 9221 } 9222 } 9223 9224 void VPWidenRecipe::execute(VPTransformState &State) { 9225 auto &I = *cast<Instruction>(getUnderlyingValue()); 9226 auto &Builder = State.Builder; 9227 switch (I.getOpcode()) { 9228 case Instruction::Call: 9229 case Instruction::Br: 9230 case Instruction::PHI: 9231 case Instruction::GetElementPtr: 9232 case Instruction::Select: 9233 llvm_unreachable("This instruction is handled by a different recipe."); 9234 case Instruction::UDiv: 9235 case Instruction::SDiv: 9236 case Instruction::SRem: 9237 case Instruction::URem: 9238 case Instruction::Add: 9239 case Instruction::FAdd: 9240 case Instruction::Sub: 9241 case Instruction::FSub: 9242 case Instruction::FNeg: 9243 case Instruction::Mul: 9244 case Instruction::FMul: 9245 case Instruction::FDiv: 9246 case Instruction::FRem: 9247 case Instruction::Shl: 9248 case Instruction::LShr: 9249 case Instruction::AShr: 9250 case Instruction::And: 9251 case Instruction::Or: 9252 case Instruction::Xor: { 9253 // Just widen unops and binops. 9254 State.ILV->setDebugLocFromInst(&I); 9255 9256 for (unsigned Part = 0; Part < State.UF; ++Part) { 9257 SmallVector<Value *, 2> Ops; 9258 for (VPValue *VPOp : operands()) 9259 Ops.push_back(State.get(VPOp, Part)); 9260 9261 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 9262 9263 if (auto *VecOp = dyn_cast<Instruction>(V)) { 9264 VecOp->copyIRFlags(&I); 9265 9266 // If the instruction is vectorized and was in a basic block that needed 9267 // predication, we can't propagate poison-generating flags (nuw/nsw, 9268 // exact, etc.). The control flow has been linearized and the 9269 // instruction is no longer guarded by the predicate, which could make 9270 // the flag properties to no longer hold. 9271 if (State.MayGeneratePoisonRecipes.contains(this)) 9272 VecOp->dropPoisonGeneratingFlags(); 9273 } 9274 9275 // Use this vector value for all users of the original instruction. 9276 State.set(this, V, Part); 9277 State.ILV->addMetadata(V, &I); 9278 } 9279 9280 break; 9281 } 9282 case Instruction::Freeze: { 9283 State.ILV->setDebugLocFromInst(&I); 9284 9285 for (unsigned Part = 0; Part < State.UF; ++Part) { 9286 Value *Op = State.get(getOperand(0), Part); 9287 9288 Value *Freeze = Builder.CreateFreeze(Op); 9289 State.set(this, Freeze, Part); 9290 } 9291 break; 9292 } 9293 case Instruction::ICmp: 9294 case Instruction::FCmp: { 9295 // Widen compares. Generate vector compares. 9296 bool FCmp = (I.getOpcode() == Instruction::FCmp); 9297 auto *Cmp = cast<CmpInst>(&I); 9298 State.ILV->setDebugLocFromInst(Cmp); 9299 for (unsigned Part = 0; Part < State.UF; ++Part) { 9300 Value *A = State.get(getOperand(0), Part); 9301 Value *B = State.get(getOperand(1), Part); 9302 Value *C = nullptr; 9303 if (FCmp) { 9304 // Propagate fast math flags. 
9305 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9306 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9307 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9308 } else { 9309 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9310 } 9311 State.set(this, C, Part); 9312 State.ILV->addMetadata(C, &I); 9313 } 9314 9315 break; 9316 } 9317 9318 case Instruction::ZExt: 9319 case Instruction::SExt: 9320 case Instruction::FPToUI: 9321 case Instruction::FPToSI: 9322 case Instruction::FPExt: 9323 case Instruction::PtrToInt: 9324 case Instruction::IntToPtr: 9325 case Instruction::SIToFP: 9326 case Instruction::UIToFP: 9327 case Instruction::Trunc: 9328 case Instruction::FPTrunc: 9329 case Instruction::BitCast: { 9330 auto *CI = cast<CastInst>(&I); 9331 State.ILV->setDebugLocFromInst(CI); 9332 9333 /// Vectorize casts. 9334 Type *DestTy = (State.VF.isScalar()) 9335 ? CI->getType() 9336 : VectorType::get(CI->getType(), State.VF); 9337 9338 for (unsigned Part = 0; Part < State.UF; ++Part) { 9339 Value *A = State.get(getOperand(0), Part); 9340 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9341 State.set(this, Cast, Part); 9342 State.ILV->addMetadata(Cast, &I); 9343 } 9344 break; 9345 } 9346 default: 9347 // This instruction is not vectorized by simple widening. 9348 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9349 llvm_unreachable("Unhandled instruction!"); 9350 } // end of switch. 9351 } 9352 9353 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9354 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9355 // Construct a vector GEP by widening the operands of the scalar GEP as 9356 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9357 // results in a vector of pointers when at least one operand of the GEP 9358 // is vector-typed. Thus, to keep the representation compact, we only use 9359 // vector-typed operands for loop-varying values. 9360 9361 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9362 // If we are vectorizing, but the GEP has only loop-invariant operands, 9363 // the GEP we build (by only using vector-typed operands for 9364 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9365 // produce a vector of pointers, we need to either arbitrarily pick an 9366 // operand to broadcast, or broadcast a clone of the original GEP. 9367 // Here, we broadcast a clone of the original. 9368 // 9369 // TODO: If at some point we decide to scalarize instructions having 9370 // loop-invariant operands, this special case will no longer be 9371 // required. We would add the scalarization decision to 9372 // collectLoopScalars() and teach getVectorValue() to broadcast 9373 // the lane-zero scalar value. 9374 auto *Clone = State.Builder.Insert(GEP->clone()); 9375 for (unsigned Part = 0; Part < State.UF; ++Part) { 9376 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9377 State.set(this, EntryPart, Part); 9378 State.ILV->addMetadata(EntryPart, GEP); 9379 } 9380 } else { 9381 // If the GEP has at least one loop-varying operand, we are sure to 9382 // produce a vector of pointers. But if we are only unrolling, we want 9383 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9384 // produce with the code below will be scalar (if VF == 1) or vector 9385 // (otherwise). Note that for the unroll-only case, we still maintain 9386 // values in the vector mapping with initVector, as we do for other 9387 // instructions. 
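    // For example, with VF=4 and a loop-varying index, the GEP built below is
    // something like
    //   %gep = getelementptr inbounds i32, i32* %base, <4 x i64> %vec.ind
    // i.e. a scalar base with a vector index, producing a <4 x i32*> result
    // (types here are only illustrative).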
9388 for (unsigned Part = 0; Part < State.UF; ++Part) { 9389 // The pointer operand of the new GEP. If it's loop-invariant, we 9390 // won't broadcast it. 9391 auto *Ptr = IsPtrLoopInvariant 9392 ? State.get(getOperand(0), VPIteration(0, 0)) 9393 : State.get(getOperand(0), Part); 9394 9395 // Collect all the indices for the new GEP. If any index is 9396 // loop-invariant, we won't broadcast it. 9397 SmallVector<Value *, 4> Indices; 9398 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 9399 VPValue *Operand = getOperand(I); 9400 if (IsIndexLoopInvariant[I - 1]) 9401 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 9402 else 9403 Indices.push_back(State.get(Operand, Part)); 9404 } 9405 9406 // If the GEP instruction is vectorized and was in a basic block that 9407 // needed predication, we can't propagate the poison-generating 'inbounds' 9408 // flag. The control flow has been linearized and the GEP is no longer 9409 // guarded by the predicate, which could make the 'inbounds' properties to 9410 // no longer hold. 9411 bool IsInBounds = 9412 GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; 9413 9414 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 9415 // but it should be a vector, otherwise. 9416 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, 9417 Indices, "", IsInBounds); 9418 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 9419 "NewGEP is not a pointer vector"); 9420 State.set(this, NewGEP, Part); 9421 State.ILV->addMetadata(NewGEP, GEP); 9422 } 9423 } 9424 } 9425 9426 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9427 assert(!State.Instance && "Int or FP induction being replicated."); 9428 9429 Value *Start = getStartValue()->getLiveInIRValue(); 9430 const InductionDescriptor &ID = getInductionDescriptor(); 9431 TruncInst *Trunc = getTruncInst(); 9432 IRBuilderBase &Builder = State.Builder; 9433 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 9434 assert(State.VF.isVector() && "must have vector VF"); 9435 9436 // The value from the original loop to which we are mapping the new induction 9437 // variable. 9438 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 9439 9440 // Fast-math-flags propagate from the original induction instruction. 9441 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9442 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 9443 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 9444 9445 // Now do the actual transformations, and start with fetching the step value. 
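  // For example, for an integer IV with start S, step 1 and VF=4, SteppedStart
  // below is <S, S+1, S+2, S+3>; each further unroll part is produced by a
  // "step.add" that adds a splat of VF * step, and the last of those feeds the
  // vec.ind phi on the backedge.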
9446 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9447 9448 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 9449 "Expected either an induction phi-node or a truncate of it!"); 9450 9451 // Construct the initial value of the vector IV in the vector loop preheader 9452 auto CurrIP = Builder.saveIP(); 9453 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9454 Builder.SetInsertPoint(VectorPH->getTerminator()); 9455 if (isa<TruncInst>(EntryVal)) { 9456 assert(Start->getType()->isIntegerTy() && 9457 "Truncation requires an integer type"); 9458 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 9459 Step = Builder.CreateTrunc(Step, TruncType); 9460 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 9461 } 9462 9463 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 9464 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 9465 Value *SteppedStart = getStepVector( 9466 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); 9467 9468 // We create vector phi nodes for both integer and floating-point induction 9469 // variables. Here, we determine the kind of arithmetic we will perform. 9470 Instruction::BinaryOps AddOp; 9471 Instruction::BinaryOps MulOp; 9472 if (Step->getType()->isIntegerTy()) { 9473 AddOp = Instruction::Add; 9474 MulOp = Instruction::Mul; 9475 } else { 9476 AddOp = ID.getInductionOpcode(); 9477 MulOp = Instruction::FMul; 9478 } 9479 9480 // Multiply the vectorization factor by the step using integer or 9481 // floating-point arithmetic as appropriate. 9482 Type *StepType = Step->getType(); 9483 Value *RuntimeVF; 9484 if (Step->getType()->isFloatingPointTy()) 9485 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 9486 else 9487 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 9488 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 9489 9490 // Create a vector splat to use in the induction update. 9491 // 9492 // FIXME: If the step is non-constant, we create the vector splat with 9493 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 9494 // handle a constant vector splat. 9495 Value *SplatVF = isa<Constant>(Mul) 9496 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 9497 : Builder.CreateVectorSplat(State.VF, Mul); 9498 Builder.restoreIP(CurrIP); 9499 9500 // We may need to add the step a number of times, depending on the unroll 9501 // factor. The last of those goes into the PHI. 9502 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 9503 &*State.CFG.PrevBB->getFirstInsertionPt()); 9504 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 9505 Instruction *LastInduction = VecInd; 9506 for (unsigned Part = 0; Part < State.UF; ++Part) { 9507 State.set(this, LastInduction, Part); 9508 9509 if (isa<TruncInst>(EntryVal)) 9510 State.ILV->addMetadata(LastInduction, EntryVal); 9511 9512 LastInduction = cast<Instruction>( 9513 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 9514 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 9515 } 9516 9517 LastInduction->setName("vec.ind.next"); 9518 VecInd->addIncoming(SteppedStart, VectorPH); 9519 // Add induction update using an incorrect block temporarily. The phi node 9520 // will be fixed after VPlan execution. Note that at this point the latch 9521 // block cannot be used, as it does not exist yet. 9522 // TODO: Model increment value in VPlan, by turning the recipe into a 9523 // multi-def and a subclass of VPHeaderPHIRecipe. 
9524 VecInd->addIncoming(LastInduction, VectorPH); 9525 } 9526 9527 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 9528 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 9529 "Not a pointer induction according to InductionDescriptor!"); 9530 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 9531 "Unexpected type."); 9532 9533 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 9534 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 9535 9536 if (onlyScalarsGenerated(State.VF)) { 9537 // This is the normalized GEP that starts counting at zero. 9538 Value *PtrInd = State.Builder.CreateSExtOrTrunc( 9539 CanonicalIV, IndDesc.getStep()->getType()); 9540 // Determine the number of scalars we need to generate for each unroll 9541 // iteration. If the instruction is uniform, we only need to generate the 9542 // first lane. Otherwise, we generate all VF values. 9543 bool IsUniform = vputils::onlyFirstLaneUsed(this); 9544 assert((IsUniform || !State.VF.isScalable()) && 9545 "Cannot scalarize a scalable VF"); 9546 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 9547 9548 for (unsigned Part = 0; Part < State.UF; ++Part) { 9549 Value *PartStart = 9550 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); 9551 9552 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 9553 Value *Idx = State.Builder.CreateAdd( 9554 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 9555 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); 9556 9557 Value *Step = CreateStepValue(IndDesc.getStep(), SE, 9558 State.CFG.PrevBB->getTerminator()); 9559 Value *SclrGep = emitTransformedIndex( 9560 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); 9561 SclrGep->setName("next.gep"); 9562 State.set(this, SclrGep, VPIteration(Part, Lane)); 9563 } 9564 } 9565 return; 9566 } 9567 9568 assert(isa<SCEVConstant>(IndDesc.getStep()) && 9569 "Induction step not a SCEV constant!"); 9570 Type *PhiType = IndDesc.getStep()->getType(); 9571 9572 // Build a pointer phi 9573 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 9574 Type *ScStValueType = ScalarStartValue->getType(); 9575 PHINode *NewPointerPhi = 9576 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 9577 9578 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9579 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 9580 9581 // A pointer induction, performed by using a gep 9582 const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout(); 9583 Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); 9584 9585 const SCEV *ScalarStep = IndDesc.getStep(); 9586 SCEVExpander Exp(SE, DL, "induction"); 9587 Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 9588 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 9589 Value *NumUnrolledElems = 9590 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 9591 Value *InductionGEP = GetElementPtrInst::Create( 9592 IndDesc.getElementType(), NewPointerPhi, 9593 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 9594 InductionLoc); 9595 // Add induction update using an incorrect block temporarily. The phi node 9596 // will be fixed after VPlan execution. Note that at this point the latch 9597 // block cannot be used, as it does not exist yet. 9598 // TODO: Model increment value in VPlan, by turning the recipe into a 9599 // multi-def and a subclass of VPHeaderPHIRecipe. 
9600 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 9601 9602 // Create UF many actual address geps that use the pointer 9603 // phi as base and a vectorized version of the step value 9604 // (<step*0, ..., step*N>) as offset. 9605 for (unsigned Part = 0; Part < State.UF; ++Part) { 9606 Type *VecPhiType = VectorType::get(PhiType, State.VF); 9607 Value *StartOffsetScalar = 9608 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 9609 Value *StartOffset = 9610 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 9611 // Create a vector of consecutive numbers from zero to VF. 9612 StartOffset = State.Builder.CreateAdd( 9613 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 9614 9615 Value *GEP = State.Builder.CreateGEP( 9616 IndDesc.getElementType(), NewPointerPhi, 9617 State.Builder.CreateMul( 9618 StartOffset, 9619 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), 9620 "vector.gep")); 9621 State.set(this, GEP, Part); 9622 } 9623 } 9624 9625 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 9626 assert(!State.Instance && "VPScalarIVStepsRecipe being replicated."); 9627 9628 // Fast-math-flags propagate from the original induction instruction. 9629 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9630 if (IndDesc.getInductionBinOp() && 9631 isa<FPMathOperator>(IndDesc.getInductionBinOp())) 9632 State.Builder.setFastMathFlags( 9633 IndDesc.getInductionBinOp()->getFastMathFlags()); 9634 9635 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9636 auto CreateScalarIV = [&](Value *&Step) -> Value * { 9637 Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0)); 9638 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); 9639 if (!isCanonical() || CanonicalIV->getType() != Ty) { 9640 ScalarIV = 9641 Ty->isIntegerTy() 9642 ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty) 9643 : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty); 9644 ScalarIV = emitTransformedIndex(State.Builder, ScalarIV, 9645 getStartValue()->getLiveInIRValue(), Step, 9646 IndDesc); 9647 ScalarIV->setName("offset.idx"); 9648 } 9649 if (TruncToTy) { 9650 assert(Step->getType()->isIntegerTy() && 9651 "Truncation requires an integer step"); 9652 ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy); 9653 Step = State.Builder.CreateTrunc(Step, TruncToTy); 9654 } 9655 return ScalarIV; 9656 }; 9657 9658 Value *ScalarIV = CreateScalarIV(Step); 9659 if (State.VF.isVector()) { 9660 buildScalarSteps(ScalarIV, Step, IndDesc, this, State); 9661 return; 9662 } 9663 9664 for (unsigned Part = 0; Part < State.UF; ++Part) { 9665 assert(!State.VF.isScalable() && "scalable vectors not yet supported."); 9666 Value *EntryPart; 9667 if (Step->getType()->isFloatingPointTy()) { 9668 Value *StartIdx = 9669 getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part); 9670 // Floating-point operations inherit FMF via the builder's flags. 
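      // This computes ScalarIV op ((VF * Part) * Step) using the floating-point
      // induction opcode (e.g. fadd), mirroring the integer path below.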
9671 Value *MulOp = State.Builder.CreateFMul(StartIdx, Step); 9672 EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(), 9673 ScalarIV, MulOp); 9674 } else { 9675 Value *StartIdx = 9676 getRuntimeVF(State.Builder, Step->getType(), State.VF * Part); 9677 EntryPart = State.Builder.CreateAdd( 9678 ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction"); 9679 } 9680 State.set(this, EntryPart, Part); 9681 } 9682 } 9683 9684 void VPBlendRecipe::execute(VPTransformState &State) { 9685 State.ILV->setDebugLocFromInst(Phi); 9686 // We know that all PHIs in non-header blocks are converted into 9687 // selects, so we don't have to worry about the insertion order and we 9688 // can just use the builder. 9689 // At this point we generate the predication tree. There may be 9690 // duplications since this is a simple recursive scan, but future 9691 // optimizations will clean it up. 9692 9693 unsigned NumIncoming = getNumIncomingValues(); 9694 9695 // Generate a sequence of selects of the form: 9696 // SELECT(Mask3, In3, 9697 // SELECT(Mask2, In2, 9698 // SELECT(Mask1, In1, 9699 // In0))) 9700 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9701 // are essentially undef are taken from In0. 9702 InnerLoopVectorizer::VectorParts Entry(State.UF); 9703 for (unsigned In = 0; In < NumIncoming; ++In) { 9704 for (unsigned Part = 0; Part < State.UF; ++Part) { 9705 // We might have single edge PHIs (blocks) - use an identity 9706 // 'select' for the first PHI operand. 9707 Value *In0 = State.get(getIncomingValue(In), Part); 9708 if (In == 0) 9709 Entry[Part] = In0; // Initialize with the first incoming value. 9710 else { 9711 // Select between the current value and the previous incoming edge 9712 // based on the incoming mask. 9713 Value *Cond = State.get(getMask(In), Part); 9714 Entry[Part] = 9715 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9716 } 9717 } 9718 } 9719 for (unsigned Part = 0; Part < State.UF; ++Part) 9720 State.set(this, Entry[Part], Part); 9721 } 9722 9723 void VPInterleaveRecipe::execute(VPTransformState &State) { 9724 assert(!State.Instance && "Interleave group being replicated."); 9725 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9726 getStoredValues(), getMask()); 9727 } 9728 9729 void VPReductionRecipe::execute(VPTransformState &State) { 9730 assert(!State.Instance && "Reduction being replicated."); 9731 Value *PrevInChain = State.get(getChainOp(), 0); 9732 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9733 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9734 // Propagate the fast-math flags carried by the underlying instruction. 
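  // FastMathFlagGuard is an RAII helper: the builder's previous fast-math flags
  // are restored automatically when this function returns.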
9735 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9736 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9737 for (unsigned Part = 0; Part < State.UF; ++Part) {
9738 Value *NewVecOp = State.get(getVecOp(), Part);
9739 if (VPValue *Cond = getCondOp()) {
9740 Value *NewCond = State.get(Cond, Part);
9741 VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9742 Value *Iden = RdxDesc->getRecurrenceIdentity(
9743 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9744 Value *IdenVec =
9745 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9746 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9747 NewVecOp = Select;
9748 }
9749 Value *NewRed;
9750 Value *NextInChain;
9751 if (IsOrdered) {
9752 if (State.VF.isVector())
9753 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9754 PrevInChain);
9755 else
9756 NewRed = State.Builder.CreateBinOp(
9757 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9758 NewVecOp);
9759 PrevInChain = NewRed;
9760 } else {
9761 PrevInChain = State.get(getChainOp(), Part);
9762 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9763 }
9764 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9765 NextInChain =
9766 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9767 NewRed, PrevInChain);
9768 } else if (IsOrdered)
9769 NextInChain = NewRed;
9770 else
9771 NextInChain = State.Builder.CreateBinOp(
9772 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9773 PrevInChain);
9774 State.set(this, NextInChain, Part);
9775 }
9776 }
9777
9778 void VPReplicateRecipe::execute(VPTransformState &State) {
9779 if (State.Instance) { // Generate a single instance.
9780 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9781 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9782 IsPredicated, State);
9783 // Insert the scalar instance, packing it into a vector.
9784 if (AlsoPack && State.VF.isVector()) {
9785 // If we're constructing lane 0, initialize to start from poison.
9786 if (State.Instance->Lane.isFirstLane()) {
9787 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9788 Value *Poison = PoisonValue::get(
9789 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9790 State.set(this, Poison, State.Instance->Part);
9791 }
9792 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9793 }
9794 return;
9795 }
9796
9797 // Generate scalar instances for all VF lanes of all UF parts, unless the
9798 // instruction is uniform in which case generate only the first lane for each
9799 // of the UF parts.
9800 unsigned EndLane = IsUniform ?
1 : State.VF.getKnownMinValue(); 9801 assert((!State.VF.isScalable() || IsUniform) && 9802 "Can't scalarize a scalable vector"); 9803 for (unsigned Part = 0; Part < State.UF; ++Part) 9804 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9805 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, 9806 VPIteration(Part, Lane), IsPredicated, 9807 State); 9808 } 9809 9810 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 9811 assert(State.Instance && "Branch on Mask works only on single instance."); 9812 9813 unsigned Part = State.Instance->Part; 9814 unsigned Lane = State.Instance->Lane.getKnownLane(); 9815 9816 Value *ConditionBit = nullptr; 9817 VPValue *BlockInMask = getMask(); 9818 if (BlockInMask) { 9819 ConditionBit = State.get(BlockInMask, Part); 9820 if (ConditionBit->getType()->isVectorTy()) 9821 ConditionBit = State.Builder.CreateExtractElement( 9822 ConditionBit, State.Builder.getInt32(Lane)); 9823 } else // Block in mask is all-one. 9824 ConditionBit = State.Builder.getTrue(); 9825 9826 // Replace the temporary unreachable terminator with a new conditional branch, 9827 // whose two destinations will be set later when they are created. 9828 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 9829 assert(isa<UnreachableInst>(CurrentTerminator) && 9830 "Expected to replace unreachable terminator with conditional branch."); 9831 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 9832 CondBr->setSuccessor(0, nullptr); 9833 ReplaceInstWithInst(CurrentTerminator, CondBr); 9834 } 9835 9836 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 9837 assert(State.Instance && "Predicated instruction PHI works per instance."); 9838 Instruction *ScalarPredInst = 9839 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 9840 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 9841 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 9842 assert(PredicatingBB && "Predicated block has no single predecessor."); 9843 assert(isa<VPReplicateRecipe>(getOperand(0)) && 9844 "operand must be VPReplicateRecipe"); 9845 9846 // By current pack/unpack logic we need to generate only a single phi node: if 9847 // a vector value for the predicated instruction exists at this point it means 9848 // the instruction has vector users only, and a phi for the vector value is 9849 // needed. In this case the recipe of the predicated instruction is marked to 9850 // also do that packing, thereby "hoisting" the insert-element sequence. 9851 // Otherwise, a phi node for the scalar value is needed. 9852 unsigned Part = State.Instance->Part; 9853 if (State.hasVectorValue(getOperand(0), Part)) { 9854 Value *VectorValue = State.get(getOperand(0), Part); 9855 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9856 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9857 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9858 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9859 if (State.hasVectorValue(this, Part)) 9860 State.reset(this, VPhi, Part); 9861 else 9862 State.set(this, VPhi, Part); 9863 // NOTE: Currently we need to update the value of the operand, so the next 9864 // predicated iteration inserts its generated value in the correct vector. 
9865 State.reset(getOperand(0), VPhi, Part); 9866 } else { 9867 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9868 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9869 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9870 PredicatingBB); 9871 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9872 if (State.hasScalarValue(this, *State.Instance)) 9873 State.reset(this, Phi, *State.Instance); 9874 else 9875 State.set(this, Phi, *State.Instance); 9876 // NOTE: Currently we need to update the value of the operand, so the next 9877 // predicated iteration inserts its generated value in the correct vector. 9878 State.reset(getOperand(0), Phi, *State.Instance); 9879 } 9880 } 9881 9882 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9883 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9884 9885 // Attempt to issue a wide load. 9886 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9887 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9888 9889 assert((LI || SI) && "Invalid Load/Store instruction"); 9890 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9891 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9892 9893 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9894 9895 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9896 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9897 bool CreateGatherScatter = !Consecutive; 9898 9899 auto &Builder = State.Builder; 9900 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9901 bool isMaskRequired = getMask(); 9902 if (isMaskRequired) 9903 for (unsigned Part = 0; Part < State.UF; ++Part) 9904 BlockInMaskParts[Part] = State.get(getMask(), Part); 9905 9906 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9907 // Calculate the pointer for the specific unroll-part. 9908 GetElementPtrInst *PartPtr = nullptr; 9909 9910 bool InBounds = false; 9911 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9912 InBounds = gep->isInBounds(); 9913 if (Reverse) { 9914 // If the address is consecutive but reversed, then the 9915 // wide store needs to start at the last vector element. 9916 // RunTimeVF = VScale * VF.getKnownMinValue() 9917 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9918 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 9919 // NumElt = -Part * RunTimeVF 9920 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 9921 // LastLane = 1 - RunTimeVF 9922 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 9923 PartPtr = 9924 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 9925 PartPtr->setIsInBounds(InBounds); 9926 PartPtr = cast<GetElementPtrInst>( 9927 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 9928 PartPtr->setIsInBounds(InBounds); 9929 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
9930 BlockInMaskParts[Part] = 9931 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9932 } else { 9933 Value *Increment = 9934 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 9935 PartPtr = cast<GetElementPtrInst>( 9936 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 9937 PartPtr->setIsInBounds(InBounds); 9938 } 9939 9940 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 9941 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 9942 }; 9943 9944 // Handle Stores: 9945 if (SI) { 9946 State.ILV->setDebugLocFromInst(SI); 9947 9948 for (unsigned Part = 0; Part < State.UF; ++Part) { 9949 Instruction *NewSI = nullptr; 9950 Value *StoredVal = State.get(StoredValue, Part); 9951 if (CreateGatherScatter) { 9952 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9953 Value *VectorGep = State.get(getAddr(), Part); 9954 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9955 MaskPart); 9956 } else { 9957 if (Reverse) { 9958 // If we store to reverse consecutive memory locations, then we need 9959 // to reverse the order of elements in the stored value. 9960 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9961 // We don't want to update the value in the map as it might be used in 9962 // another expression. So don't call resetVectorValue(StoredVal). 9963 } 9964 auto *VecPtr = 9965 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9966 if (isMaskRequired) 9967 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9968 BlockInMaskParts[Part]); 9969 else 9970 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9971 } 9972 State.ILV->addMetadata(NewSI, SI); 9973 } 9974 return; 9975 } 9976 9977 // Handle loads. 9978 assert(LI && "Must have a load instruction"); 9979 State.ILV->setDebugLocFromInst(LI); 9980 for (unsigned Part = 0; Part < State.UF; ++Part) { 9981 Value *NewLI; 9982 if (CreateGatherScatter) { 9983 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9984 Value *VectorGep = State.get(getAddr(), Part); 9985 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 9986 nullptr, "wide.masked.gather"); 9987 State.ILV->addMetadata(NewLI, LI); 9988 } else { 9989 auto *VecPtr = 9990 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9991 if (isMaskRequired) 9992 NewLI = Builder.CreateMaskedLoad( 9993 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 9994 PoisonValue::get(DataTy), "wide.masked.load"); 9995 else 9996 NewLI = 9997 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 9998 9999 // Add metadata to the load, but setVectorValue to the reverse shuffle. 10000 State.ILV->addMetadata(NewLI, LI); 10001 if (Reverse) 10002 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 10003 } 10004 10005 State.set(getVPSingleValue(), NewLI, Part); 10006 } 10007 } 10008 10009 // Determine how to lower the scalar epilogue, which depends on 1) optimising 10010 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 10011 // predication, and 4) a TTI hook that analyses whether the loop is suitable 10012 // for predication. 
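// The checks below are applied in that order, and the first one that matches
// decides the lowering; e.g. an optsize function returns
// CM_ScalarEpilogueNotAllowedOptSize before any hint or TTI preference is
// consulted.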
10013 static ScalarEpilogueLowering getScalarEpilogueLowering( 10014 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 10015 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10016 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 10017 LoopVectorizationLegality &LVL) { 10018 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10019 // don't look at hints or options, and don't request a scalar epilogue. 10020 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10021 // LoopAccessInfo (due to code dependency and not being able to reliably get 10022 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10023 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10024 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10025 // back to the old way and vectorize with versioning when forced. See D81345.) 10026 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10027 PGSOQueryType::IRPass) && 10028 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10029 return CM_ScalarEpilogueNotAllowedOptSize; 10030 10031 // 2) If set, obey the directives 10032 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10033 switch (PreferPredicateOverEpilogue) { 10034 case PreferPredicateTy::ScalarEpilogue: 10035 return CM_ScalarEpilogueAllowed; 10036 case PreferPredicateTy::PredicateElseScalarEpilogue: 10037 return CM_ScalarEpilogueNotNeededUsePredicate; 10038 case PreferPredicateTy::PredicateOrDontVectorize: 10039 return CM_ScalarEpilogueNotAllowedUsePredicate; 10040 }; 10041 } 10042 10043 // 3) If set, obey the hints 10044 switch (Hints.getPredicate()) { 10045 case LoopVectorizeHints::FK_Enabled: 10046 return CM_ScalarEpilogueNotNeededUsePredicate; 10047 case LoopVectorizeHints::FK_Disabled: 10048 return CM_ScalarEpilogueAllowed; 10049 }; 10050 10051 // 4) if the TTI hook indicates this is profitable, request predication. 10052 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10053 LVL.getLAI())) 10054 return CM_ScalarEpilogueNotNeededUsePredicate; 10055 10056 return CM_ScalarEpilogueAllowed; 10057 } 10058 10059 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10060 // If Values have been set for this Def return the one relevant for \p Part. 10061 if (hasVectorValue(Def, Part)) 10062 return Data.PerPartOutput[Def][Part]; 10063 10064 if (!hasScalarValue(Def, {Part, 0})) { 10065 Value *IRV = Def->getLiveInIRValue(); 10066 Value *B = ILV->getBroadcastInstrs(IRV); 10067 set(Def, B, Part); 10068 return B; 10069 } 10070 10071 Value *ScalarValue = get(Def, {Part, 0}); 10072 // If we aren't vectorizing, we can just copy the scalar map values over 10073 // to the vector map. 10074 if (VF.isScalar()) { 10075 set(Def, ScalarValue, Part); 10076 return ScalarValue; 10077 } 10078 10079 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10080 bool IsUniform = RepR && RepR->isUniform(); 10081 10082 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 10083 // Check if there is a scalar value for the selected lane. 10084 if (!hasScalarValue(Def, {Part, LastLane})) { 10085 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 
10086 assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) || 10087 isa<VPScalarIVStepsRecipe>(Def->getDef())) && 10088 "unexpected recipe found to be invariant"); 10089 IsUniform = true; 10090 LastLane = 0; 10091 } 10092 10093 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10094 // Set the insert point after the last scalarized instruction or after the 10095 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10096 // will directly follow the scalar definitions. 10097 auto OldIP = Builder.saveIP(); 10098 auto NewIP = 10099 isa<PHINode>(LastInst) 10100 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10101 : std::next(BasicBlock::iterator(LastInst)); 10102 Builder.SetInsertPoint(&*NewIP); 10103 10104 // However, if we are vectorizing, we need to construct the vector values. 10105 // If the value is known to be uniform after vectorization, we can just 10106 // broadcast the scalar value corresponding to lane zero for each unroll 10107 // iteration. Otherwise, we construct the vector values using 10108 // insertelement instructions. Since the resulting vectors are stored in 10109 // State, we will only generate the insertelements once. 10110 Value *VectorValue = nullptr; 10111 if (IsUniform) { 10112 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10113 set(Def, VectorValue, Part); 10114 } else { 10115 // Initialize packing with insertelements to start from undef. 10116 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10117 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10118 set(Def, Undef, Part); 10119 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10120 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10121 VectorValue = get(Def, Part); 10122 } 10123 Builder.restoreIP(OldIP); 10124 return VectorValue; 10125 } 10126 10127 // Process the loop in the VPlan-native vectorization path. This path builds 10128 // VPlan upfront in the vectorization pipeline, which allows to apply 10129 // VPlan-to-VPlan transformations from the very beginning without modifying the 10130 // input LLVM IR. 10131 static bool processLoopInVPlanNativePath( 10132 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10133 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10134 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10135 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10136 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10137 LoopVectorizationRequirements &Requirements) { 10138 10139 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10140 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10141 return false; 10142 } 10143 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10144 Function *F = L->getHeader()->getParent(); 10145 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10146 10147 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10148 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10149 10150 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10151 &Hints, IAI); 10152 // Use the planner for outer loop vectorization. 10153 // TODO: CM is not used at this point inside the planner. Turn CM into an 10154 // optional argument if we don't need it in the future. 10155 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10156 Requirements, ORE); 10157 10158 // Get user vectorization factor. 
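  // For example, a source-level
  //   #pragma clang loop vectorize_width(4)
  // reaches this point as UserVF = 4 via the loop's metadata.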
10159 ElementCount UserVF = Hints.getWidth(); 10160 10161 CM.collectElementTypesForWidening(); 10162 10163 // Plan how to best vectorize, return the best VF and its cost. 10164 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10165 10166 // If we are stress testing VPlan builds, do not attempt to generate vector 10167 // code. Masked vector code generation support will follow soon. 10168 // Also, do not attempt to vectorize if no vector code will be produced. 10169 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) 10170 return false; 10171 10172 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10173 10174 { 10175 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10176 F->getParent()->getDataLayout()); 10177 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10178 &CM, BFI, PSI, Checks); 10179 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10180 << L->getHeader()->getParent()->getName() << "\"\n"); 10181 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); 10182 } 10183 10184 // Mark the loop as already vectorized to avoid vectorizing again. 10185 Hints.setAlreadyVectorized(); 10186 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10187 return true; 10188 } 10189 10190 // Emit a remark if there are stores to floats that required a floating point 10191 // extension. If the vectorized loop was generated with floating point there 10192 // will be a performance penalty from the conversion overhead and the change in 10193 // the vector width. 10194 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10195 SmallVector<Instruction *, 4> Worklist; 10196 for (BasicBlock *BB : L->getBlocks()) { 10197 for (Instruction &Inst : *BB) { 10198 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10199 if (S->getValueOperand()->getType()->isFloatTy()) 10200 Worklist.push_back(S); 10201 } 10202 } 10203 } 10204 10205 // Traverse the floating point stores upwards searching, for floating point 10206 // conversions. 10207 SmallPtrSet<const Instruction *, 4> Visited; 10208 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10209 while (!Worklist.empty()) { 10210 auto *I = Worklist.pop_back_val(); 10211 if (!L->contains(I)) 10212 continue; 10213 if (!Visited.insert(I).second) 10214 continue; 10215 10216 // Emit a remark if the floating point store required a floating 10217 // point conversion. 10218 // TODO: More work could be done to identify the root cause such as a 10219 // constant or a function return type and point the user to it. 10220 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10221 ORE->emit([&]() { 10222 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10223 I->getDebugLoc(), L->getHeader()) 10224 << "floating point conversion changes vector width. " 10225 << "Mixed floating point precision requires an up/down " 10226 << "cast that will negatively impact performance."; 10227 }); 10228 10229 for (Use &Op : I->operands()) 10230 if (auto *OpI = dyn_cast<Instruction>(Op)) 10231 Worklist.push_back(OpI); 10232 } 10233 } 10234 10235 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10236 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10237 !EnableLoopInterleaving), 10238 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10239 !EnableLoopVectorization) {} 10240 10241 bool LoopVectorizePass::processLoop(Loop *L) { 10242 assert((EnableVPlanNativePath || L->isInnermost()) && 10243 "VPlan-native path is not enabled. 
Only process inner loops."); 10244 10245 #ifndef NDEBUG 10246 const std::string DebugLocStr = getDebugLocString(L); 10247 #endif /* NDEBUG */ 10248 10249 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 10250 << L->getHeader()->getParent()->getName() << "' from " 10251 << DebugLocStr << "\n"); 10252 10253 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10254 10255 LLVM_DEBUG( 10256 dbgs() << "LV: Loop hints:" 10257 << " force=" 10258 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10259 ? "disabled" 10260 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10261 ? "enabled" 10262 : "?")) 10263 << " width=" << Hints.getWidth() 10264 << " interleave=" << Hints.getInterleave() << "\n"); 10265 10266 // Function containing loop 10267 Function *F = L->getHeader()->getParent(); 10268 10269 // Looking at the diagnostic output is the only way to determine if a loop 10270 // was vectorized (other than looking at the IR or machine code), so it 10271 // is important to generate an optimization remark for each loop. Most of 10272 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10273 // generated as OptimizationRemark and OptimizationRemarkMissed are 10274 // less verbose reporting vectorized loops and unvectorized loops that may 10275 // benefit from vectorization, respectively. 10276 10277 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10278 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10279 return false; 10280 } 10281 10282 PredicatedScalarEvolution PSE(*SE, *L); 10283 10284 // Check if it is legal to vectorize the loop. 10285 LoopVectorizationRequirements Requirements; 10286 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10287 &Requirements, &Hints, DB, AC, BFI, PSI); 10288 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10289 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10290 Hints.emitRemarkWithHints(); 10291 return false; 10292 } 10293 10294 // Check the function attributes and profiles to find out if this function 10295 // should be optimized for size. 10296 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10297 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10298 10299 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10300 // here. They may require CFG and instruction level transformations before 10301 // even evaluating whether vectorization is profitable. Since we cannot modify 10302 // the incoming IR, we need to build VPlan upfront in the vectorization 10303 // pipeline. 10304 if (!L->isInnermost()) 10305 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10306 ORE, BFI, PSI, Hints, Requirements); 10307 10308 assert(L->isInnermost() && "Inner loop expected."); 10309 10310 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10311 // count by optimizing for size, to minimize overheads. 10312 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10313 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10314 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. 
" 10315 << "This loop is worth vectorizing only if no scalar " 10316 << "iteration overheads are incurred."); 10317 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10318 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10319 else { 10320 LLVM_DEBUG(dbgs() << "\n"); 10321 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10322 } 10323 } 10324 10325 // Check the function attributes to see if implicit floats are allowed. 10326 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10327 // an integer loop and the vector instructions selected are purely integer 10328 // vector instructions? 10329 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10330 reportVectorizationFailure( 10331 "Can't vectorize when the NoImplicitFloat attribute is used", 10332 "loop not vectorized due to NoImplicitFloat attribute", 10333 "NoImplicitFloat", ORE, L); 10334 Hints.emitRemarkWithHints(); 10335 return false; 10336 } 10337 10338 // Check if the target supports potentially unsafe FP vectorization. 10339 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10340 // for the target we're vectorizing for, to make sure none of the 10341 // additional fp-math flags can help. 10342 if (Hints.isPotentiallyUnsafe() && 10343 TTI->isFPVectorizationPotentiallyUnsafe()) { 10344 reportVectorizationFailure( 10345 "Potentially unsafe FP op prevents vectorization", 10346 "loop not vectorized due to unsafe FP support.", 10347 "UnsafeFP", ORE, L); 10348 Hints.emitRemarkWithHints(); 10349 return false; 10350 } 10351 10352 bool AllowOrderedReductions; 10353 // If the flag is set, use that instead and override the TTI behaviour. 10354 if (ForceOrderedReductions.getNumOccurrences() > 0) 10355 AllowOrderedReductions = ForceOrderedReductions; 10356 else 10357 AllowOrderedReductions = TTI->enableOrderedReductions(); 10358 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10359 ORE->emit([&]() { 10360 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10361 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10362 ExactFPMathInst->getDebugLoc(), 10363 ExactFPMathInst->getParent()) 10364 << "loop not vectorized: cannot prove it is safe to reorder " 10365 "floating-point operations"; 10366 }); 10367 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10368 "reorder floating-point operations\n"); 10369 Hints.emitRemarkWithHints(); 10370 return false; 10371 } 10372 10373 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10374 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10375 10376 // If an override option has been passed in for interleaved accesses, use it. 10377 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10378 UseInterleaved = EnableInterleavedMemAccesses; 10379 10380 // Analyze interleaved memory accesses. 10381 if (UseInterleaved) { 10382 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10383 } 10384 10385 // Use the cost model. 10386 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10387 F, &Hints, IAI); 10388 CM.collectValuesToIgnore(); 10389 CM.collectElementTypesForWidening(); 10390 10391 // Use the planner for vectorization. 10392 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 10393 Requirements, ORE); 10394 10395 // Get user vectorization factor and interleave count. 
10396 ElementCount UserVF = Hints.getWidth();
10397 unsigned UserIC = Hints.getInterleave();
10398
10399 // Plan how to best vectorize, return the best VF and its cost.
10400 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10401
10402 VectorizationFactor VF = VectorizationFactor::Disabled();
10403 unsigned IC = 1;
10404
10405 if (MaybeVF) {
10406 if (LVP.requiresTooManyRuntimeChecks()) {
10407 ORE->emit([&]() {
10408 return OptimizationRemarkAnalysisAliasing(
10409 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10410 L->getHeader())
10411 << "loop not vectorized: cannot prove it is safe to reorder "
10412 "memory operations";
10413 });
10414 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10415 Hints.emitRemarkWithHints();
10416 return false;
10417 }
10418 VF = *MaybeVF;
10419 // Select the interleave count.
10420 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10421 }
10422
10423 // Identify the diagnostic messages that should be produced.
10424 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10425 bool VectorizeLoop = true, InterleaveLoop = true;
10426 if (VF.Width.isScalar()) {
10427 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10428 VecDiagMsg = std::make_pair(
10429 "VectorizationNotBeneficial",
10430 "the cost-model indicates that vectorization is not beneficial");
10431 VectorizeLoop = false;
10432 }
10433
10434 if (!MaybeVF && UserIC > 1) {
10435 // Tell the user interleaving was avoided up-front, despite being explicitly
10436 // requested.
10437 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10438 "interleaving should be avoided up front\n");
10439 IntDiagMsg = std::make_pair(
10440 "InterleavingAvoided",
10441 "Ignoring UserIC, because interleaving was avoided up front");
10442 InterleaveLoop = false;
10443 } else if (IC == 1 && UserIC <= 1) {
10444 // Tell the user interleaving is not beneficial.
10445 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10446 IntDiagMsg = std::make_pair(
10447 "InterleavingNotBeneficial",
10448 "the cost-model indicates that interleaving is not beneficial");
10449 InterleaveLoop = false;
10450 if (UserIC == 1) {
10451 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10452 IntDiagMsg.second +=
10453 " and is explicitly disabled or interleave count is set to 1";
10454 }
10455 } else if (IC > 1 && UserIC == 1) {
10456 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10457 LLVM_DEBUG(
10458 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10459 IntDiagMsg = std::make_pair(
10460 "InterleavingBeneficialButDisabled",
10461 "the cost-model indicates that interleaving is beneficial "
10462 "but is explicitly disabled or interleave count is set to 1");
10463 InterleaveLoop = false;
10464 }
10465
10466 // Override IC if user provided an interleave count.
10467 IC = UserIC > 0 ? UserIC : IC;
10468
10469 // Emit diagnostic messages, if any.
10470 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10471 if (!VectorizeLoop && !InterleaveLoop) {
10472 // Do not vectorize or interleave the loop.
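    // Emit both remarks so the user sees why each transformation was rejected.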
10473 ORE->emit([&]() {
10474 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10475 L->getStartLoc(), L->getHeader())
10476 << VecDiagMsg.second;
10477 });
10478 ORE->emit([&]() {
10479 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10480 L->getStartLoc(), L->getHeader())
10481 << IntDiagMsg.second;
10482 });
10483 return false;
10484 } else if (!VectorizeLoop && InterleaveLoop) {
10485 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10486 ORE->emit([&]() {
10487 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10488 L->getStartLoc(), L->getHeader())
10489 << VecDiagMsg.second;
10490 });
10491 } else if (VectorizeLoop && !InterleaveLoop) {
10492 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10493 << ") in " << DebugLocStr << '\n');
10494 ORE->emit([&]() {
10495 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10496 L->getStartLoc(), L->getHeader())
10497 << IntDiagMsg.second;
10498 });
10499 } else if (VectorizeLoop && InterleaveLoop) {
10500 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10501 << ") in " << DebugLocStr << '\n');
10502 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10503 }
10504
10505 bool DisableRuntimeUnroll = false;
10506 MDNode *OrigLoopID = L->getLoopID();
10507 {
10508 // Optimistically generate runtime checks. Drop them if they turn out not to
10509 // be profitable. Limit the scope of Checks, so the cleanup happens
10510 // immediately after vector code generation is done.
10511 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10512 F->getParent()->getDataLayout());
10513 if (!VF.Width.isScalar() || IC > 1)
10514 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, IC);
10515
10516 using namespace ore;
10517 if (!VectorizeLoop) {
10518 assert(IC > 1 && "interleave count should not be 1 or 0");
10519 // If we decided that it is not profitable to vectorize the loop, then
10520 // interleave it.
10521 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10522 &CM, BFI, PSI, Checks);
10523
10524 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10525 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
10526
10527 ORE->emit([&]() {
10528 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10529 L->getHeader())
10530 << "interleaved loop (interleaved count: "
10531 << NV("InterleaveCount", IC) << ")";
10532 });
10533 } else {
10534 // If we decided that it is *profitable* to vectorize the loop, then do it.
10535
10536 // Consider vectorizing the epilogue too if it's profitable.
10537 VectorizationFactor EpilogueVF =
10538 CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10539 if (EpilogueVF.Width.isVector()) {
10540
10541 // The first pass vectorizes the main loop and creates a scalar epilogue
10542 // to be vectorized by executing the plan (potentially with a different
10543 // factor) again shortly afterwards.
10544 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10545 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10546 EPI, &LVL, &CM, BFI, PSI, Checks);
10547
10548 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10549 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10550 DT);
10551 ++LoopsVectorized;
10552
10553 // Second pass vectorizes the epilogue and adjusts the control flow
10554 // edges from the first pass.
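          // Conceptually this produces three loops: the main vector loop
          // (VF.Width, interleaved IC times), a narrower vector epilogue
          // (EpilogueVF.Width, UF = 1), and a scalar remainder for any leftover
          // iterations.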
10555 EPI.MainLoopVF = EPI.EpilogueVF; 10556 EPI.MainLoopUF = EPI.EpilogueUF; 10557 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10558 ORE, EPI, &LVL, &CM, BFI, PSI, 10559 Checks); 10560 10561 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10562 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); 10563 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); 10564 Header->setName("vec.epilog.vector.body"); 10565 10566 // Ensure that the start values for any VPReductionPHIRecipes are 10567 // updated before vectorising the epilogue loop. 10568 for (VPRecipeBase &R : Header->phis()) { 10569 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10570 if (auto *Resume = MainILV.getReductionResumeValue( 10571 ReductionPhi->getRecurrenceDescriptor())) { 10572 VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume); 10573 ReductionPhi->setOperand(0, StartVal); 10574 } 10575 } 10576 } 10577 10578 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10579 DT); 10580 ++LoopsEpilogueVectorized; 10581 10582 if (!MainILV.areSafetyChecksAdded()) 10583 DisableRuntimeUnroll = true; 10584 } else { 10585 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10586 &LVL, &CM, BFI, PSI, Checks); 10587 10588 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10589 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10590 ++LoopsVectorized; 10591 10592 // Add metadata to disable runtime unrolling a scalar loop when there 10593 // are no runtime checks about strides and memory. A scalar loop that is 10594 // rarely used is not worth unrolling. 10595 if (!LB.areSafetyChecksAdded()) 10596 DisableRuntimeUnroll = true; 10597 } 10598 // Report the vectorization decision. 10599 ORE->emit([&]() { 10600 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10601 L->getHeader()) 10602 << "vectorized loop (vectorization width: " 10603 << NV("VectorizationFactor", VF.Width) 10604 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10605 }); 10606 } 10607 10608 if (ORE->allowExtraAnalysis(LV_NAME)) 10609 checkMixedPrecision(L, ORE); 10610 } 10611 10612 Optional<MDNode *> RemainderLoopID = 10613 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10614 LLVMLoopVectorizeFollowupEpilogue}); 10615 if (RemainderLoopID.hasValue()) { 10616 L->setLoopID(RemainderLoopID.getValue()); 10617 } else { 10618 if (DisableRuntimeUnroll) 10619 AddRuntimeUnrollDisableMetaData(L); 10620 10621 // Mark the loop as already vectorized to avoid vectorizing again. 10622 Hints.setAlreadyVectorized(); 10623 } 10624 10625 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10626 return true; 10627 } 10628 10629 LoopVectorizeResult LoopVectorizePass::runImpl( 10630 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10631 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10632 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10633 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10634 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10635 SE = &SE_; 10636 LI = &LI_; 10637 TTI = &TTI_; 10638 DT = &DT_; 10639 BFI = &BFI_; 10640 TLI = TLI_; 10641 AA = &AA_; 10642 AC = &AC_; 10643 GetLAA = &GetLAA_; 10644 DB = &DB_; 10645 ORE = &ORE_; 10646 PSI = PSI_; 10647 10648 // Don't attempt if 10649 // 1. the target claims to have no vector registers, and 10650 // 2. interleaving won't help ILP. 
10651 //
10652 // The second condition is necessary because, even if the target has no
10653 // vector registers, loop vectorization may still enable scalar
10654 // interleaving.
10655 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10656 TTI->getMaxInterleaveFactor(1) < 2)
10657 return LoopVectorizeResult(false, false);
10658
10659 bool Changed = false, CFGChanged = false;
10660
10661 // The vectorizer requires loops to be in simplified form.
10662 // Since simplification may add new inner loops, it has to run before the
10663 // legality and profitability checks. This means running the loop vectorizer
10664 // will simplify all loops, regardless of whether anything ends up being
10665 // vectorized.
10666 for (auto &L : *LI)
10667 Changed |= CFGChanged |=
10668 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10669
10670 // Build up a worklist of inner-loops to vectorize. This is necessary as
10671 // the act of vectorizing or partially unrolling a loop creates new loops
10672 // and can invalidate iterators across the loops.
10673 SmallVector<Loop *, 8> Worklist;
10674
10675 for (Loop *L : *LI)
10676 collectSupportedLoops(*L, LI, ORE, Worklist);
10677
10678 LoopsAnalyzed += Worklist.size();
10679
10680 // Now walk the identified inner loops.
10681 while (!Worklist.empty()) {
10682 Loop *L = Worklist.pop_back_val();
10683
10684 // For the inner loops we actually process, form LCSSA to simplify the
10685 // transform.
10686 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10687
10688 Changed |= CFGChanged |= processLoop(L);
10689 }
10690
10691 // Process each loop nest in the function.
10692 return LoopVectorizeResult(Changed, CFGChanged);
10693 }
10694
10695 PreservedAnalyses LoopVectorizePass::run(Function &F,
10696 FunctionAnalysisManager &AM) {
10697 auto &LI = AM.getResult<LoopAnalysis>(F);
10698 // There are no loops in the function. Return before computing other expensive
10699 // analyses.
10700 if (LI.empty())
10701 return PreservedAnalyses::all();
10702 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10703 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10704 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10705 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10706 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10707 auto &AA = AM.getResult<AAManager>(F);
10708 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10709 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10710 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10711
10712 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10713 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10714 [&](Loop &L) -> const LoopAccessInfo & {
10715 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
10716 TLI, TTI, nullptr, nullptr, nullptr};
10717 return LAM.getResult<LoopAccessAnalysis>(L, AR);
10718 };
10719 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10720 ProfileSummaryInfo *PSI =
10721 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10722 LoopVectorizeResult Result =
10723 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10724 if (!Result.MadeAnyChange)
10725 return PreservedAnalyses::all();
10726 PreservedAnalyses PA;
10727
10728 // We currently do not preserve loopinfo/dominator analyses with outer loop
10729 // vectorization. Until this is addressed, mark these analyses as preserved
10730 // only for non-VPlan-native path.
10731 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10732 if (!EnableVPlanNativePath) {
10733 PA.preserve<LoopAnalysis>();
10734 PA.preserve<DominatorTreeAnalysis>();
10735 }
10736
10737 if (Result.MadeCFGChange) {
10738 // Making CFG changes likely means a loop got vectorized. Indicate that
10739 // extra simplification passes should be run.
10740 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10741 // be run if runtime checks have been added.
10742 AM.getResult<ShouldRunExtraVectorPasses>(F);
10743 PA.preserve<ShouldRunExtraVectorPasses>();
10744 } else {
10745 PA.preserveSet<CFGAnalyses>();
10746 }
10747 return PA;
10748 }
10749
10750 void LoopVectorizePass::printPipeline(
10751 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10752 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10753 OS, MapClassName2PassName);
10754
10755 OS << "<";
10756 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10757 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10758 OS << ">";
10759 }
10760