//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
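//
// As a conceptual sketch of the transformation described above (illustrative
// pseudo-code only, not actual vectorizer output), a scalar loop such as
//
//   for (i = 0; i < n; i += 1)
//     a[i] = b[i] + 42;
//
// is rewritten so that each 'wide' iteration processes VF consecutive
// elements with SIMD instructions:
//
//   for (i = 0; i + VF <= n; i += VF)
//     a[i .. i+VF-1] = b[i .. i+VF-1] + 42;  // one wide add and one wide store
//   // leftover iterations are handled by a scalar epilogue loop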
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

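// These hidden options can be passed to 'opt' like any other command-line
// flag. For example (an illustrative invocation, not a required usage):
//   opt -passes=loop-vectorize -vectorizer-min-trip-count=4 -S input.ll
// lowers the tiny-trip-count threshold above from its default of 16 to 4.
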
// The option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; the enum below lists the
// possible choices. I.e., the vectorizer will try to fold the tail-loop
// (epilogue) into the vector body and predicate the instructions accordingly.
// If tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. "
Mostly " 273 "useful for getting consistent testing.")); 274 275 static cl::opt<bool> ForceTargetSupportsScalableVectors( 276 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 277 cl::desc( 278 "Pretend that scalable vectors are supported, even if the target does " 279 "not support them. This flag should only be used for testing.")); 280 281 static cl::opt<unsigned> SmallLoopCost( 282 "small-loop-cost", cl::init(20), cl::Hidden, 283 cl::desc( 284 "The cost of a loop that is considered 'small' by the interleaver.")); 285 286 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 287 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 288 cl::desc("Enable the use of the block frequency analysis to access PGO " 289 "heuristics minimizing code growth in cold regions and being more " 290 "aggressive in hot regions.")); 291 292 // Runtime interleave loops for load/store throughput. 293 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 294 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 295 cl::desc( 296 "Enable runtime interleaving until load/store ports are saturated")); 297 298 /// Interleave small loops with scalar reductions. 299 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 300 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 301 cl::desc("Enable interleaving for loops with small iteration counts that " 302 "contain scalar reductions to expose ILP.")); 303 304 /// The number of stores in a loop that are allowed to need predication. 305 static cl::opt<unsigned> NumberOfStoresToPredicate( 306 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 307 cl::desc("Max number of stores to be predicated behind an if.")); 308 309 static cl::opt<bool> EnableIndVarRegisterHeur( 310 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 311 cl::desc("Count the induction variable only once when interleaving")); 312 313 static cl::opt<bool> EnableCondStoresVectorization( 314 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 315 cl::desc("Enable if predication of stores during vectorization.")); 316 317 static cl::opt<unsigned> MaxNestedScalarReductionIC( 318 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 319 cl::desc("The maximum interleave count to use when interleaving a scalar " 320 "reduction in a nested loop.")); 321 322 static cl::opt<bool> 323 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 324 cl::Hidden, 325 cl::desc("Prefer in-loop vector reductions, " 326 "overriding the targets preference.")); 327 328 static cl::opt<bool> PreferPredicatedReductionSelect( 329 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 330 cl::desc( 331 "Prefer predicating a reduction operation over an after loop select.")); 332 333 cl::opt<bool> EnableVPlanNativePath( 334 "enable-vplan-native-path", cl::init(false), cl::Hidden, 335 cl::desc("Enable VPlan-native vectorization path with " 336 "support for outer loop vectorization.")); 337 338 // FIXME: Remove this switch once we have divergence analysis. Currently we 339 // assume divergent non-backedge branches when this switch is true. 340 cl::opt<bool> EnableVPlanPredication( 341 "enable-vplan-predication", cl::init(false), cl::Hidden, 342 cl::desc("Enable VPlan-native vectorization path predicator with " 343 "support for outer loop vectorization.")); 344 345 // This flag enables the stress testing of the VPlan H-CFG construction in the 346 // VPlan-native vectorization path. 
// It must be used in conjunction with -enable-vplan-native-path.
// -vplan-verify-hcfg can also be used to enable the verification of the
// H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop.
  /// Each value from the original loop, when vectorized, is represented by UF
  /// vector values in the new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
                           VPValue *StartV, VPValue *Def,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a scalar instance of \p Instr for the part and lane given
  /// by \p Instance. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                               Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);

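  // As an illustration (not a normative example): for a widened integer IV
  // with VF = 4, a start index of 0 and a step of 1, getStepVector applied to
  // the broadcast value <i, i, i, i> yields <i, i+1, i+2, i+3>, while
  // buildScalarSteps produces the per-lane scalar values i, i+1, i+2, i+3 for
  // scalarized users.
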
  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (eg. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The (unique) ExitBlock of the scalar loop. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                       Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

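  // For illustration (hypothetical values): an EpilogueLoopVectorizationInfo
  // constructed as EpilogueLoopVectorizationInfo(16, 2, 8, 1) describes a main
  // vector loop using VF=16 and UF=2 followed by a vector epilogue using VF=8
  // and UF=1; the struct's constructor asserts that the epilogue UF is 1.
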
  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints for how the scalar epilogue loop should
// be lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons.
/// In this class we mainly attempt to predict the expected speedup/slowdowns
/// due to the supported instruction set. We use the TargetTransformInfo to
/// query the different backends for the cost of different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form it takes after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

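  // As an illustrative note (terminology from the InstWidening enum defined
  // below): a unit-stride access may be widened into a single wide load/store
  // (CM_Widen, or CM_Widen_Reverse for stride -1), grouped accesses may be
  // handled as an interleave group (CM_Interleave), irregular accesses may
  // become gathers/scatters (CM_GatherScatter), or the access may simply be
  // scalarized (CM_Scalarize); setCostBasedWideningDecision records the
  // cheapest of the legal choices for each VF.
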
  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

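  // For example (an illustrative scenario, assuming MinBWs is populated from
  // the demanded-bits analysis passed in as the DB constructor argument): if
  // an i32 operation in the loop only ever produces values that fit in 8 bits,
  // its vector form can be emitted on <VF x i8> and extended only where a
  // wider user requires it, which truncateToMinimalBitwidths() exploits later.
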
1342 CM_Interleave, 1343 CM_GatherScatter, 1344 CM_Scalarize 1345 }; 1346 1347 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1348 /// instruction \p I and vector width \p VF. 1349 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, 1350 InstructionCost Cost) { 1351 assert(VF.isVector() && "Expected VF >=2"); 1352 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1353 } 1354 1355 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1356 /// interleaving group \p Grp and vector width \p VF. 1357 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, 1358 ElementCount VF, InstWidening W, 1359 InstructionCost Cost) { 1360 assert(VF.isVector() && "Expected VF >=2"); 1361 /// Broadcast this decision to all instructions inside the group. 1362 /// But the cost will be assigned to one instruction only. 1363 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1364 if (auto *I = Grp->getMember(i)) { 1365 if (Grp->getInsertPos() == I) 1366 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1367 else 1368 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1369 } 1370 } 1371 } 1372 1373 /// Return the cost model decision for the given instruction \p I and vector 1374 /// width \p VF. Return CM_Unknown if this instruction did not pass 1375 /// through the cost modeling. 1376 InstWidening getWideningDecision(Instruction *I, ElementCount VF) { 1377 assert(VF.isVector() && "Expected VF to be a vector VF"); 1378 // Cost model is not run in the VPlan-native path - return conservative 1379 // result until this changes. 1380 if (EnableVPlanNativePath) 1381 return CM_GatherScatter; 1382 1383 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1384 auto Itr = WideningDecisions.find(InstOnVF); 1385 if (Itr == WideningDecisions.end()) 1386 return CM_Unknown; 1387 return Itr->second.first; 1388 } 1389 1390 /// Return the vectorization cost for the given instruction \p I and vector 1391 /// width \p VF. 1392 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1393 assert(VF.isVector() && "Expected VF >=2"); 1394 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1395 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1396 "The cost is not calculated"); 1397 return WideningDecisions[InstOnVF].second; 1398 } 1399 1400 /// Return True if instruction \p I is an optimizable truncate whose operand 1401 /// is an induction variable. Such a truncate will be removed by adding a new 1402 /// induction variable with the destination type. 1403 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1404 // If the instruction is not a truncate, return false. 1405 auto *Trunc = dyn_cast<TruncInst>(I); 1406 if (!Trunc) 1407 return false; 1408 1409 // Get the source and destination types of the truncate. 1410 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1411 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1412 1413 // If the truncate is free for the given types, return false. Replacing a 1414 // free truncate with an induction variable would add an induction variable 1415 // update instruction to each iteration of the loop. We exclude from this 1416 // check the primary induction variable since it will need an update 1417 // instruction regardless.
1418 Value *Op = Trunc->getOperand(0); 1419 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1420 return false; 1421 1422 // If the truncated value is not an induction variable, return false. 1423 return Legal->isInductionPhi(Op); 1424 } 1425 1426 /// Collects the instructions to scalarize for each predicated instruction in 1427 /// the loop. 1428 void collectInstsToScalarize(ElementCount VF); 1429 1430 /// Collect Uniform and Scalar values for the given \p VF. 1431 /// The sets depend on the CM decision for Load/Store instructions 1432 /// that may be vectorized as interleave, gather-scatter or scalarized. 1433 void collectUniformsAndScalars(ElementCount VF) { 1434 // Do the analysis once. 1435 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1436 return; 1437 setCostBasedWideningDecision(VF); 1438 collectLoopUniforms(VF); 1439 collectLoopScalars(VF); 1440 } 1441 1442 /// Returns true if the target machine supports a masked store operation 1443 /// for the given \p DataType and kind of access to \p Ptr. 1444 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) { 1445 return Legal->isConsecutivePtr(Ptr) && 1446 TTI.isLegalMaskedStore(DataType, Alignment); 1447 } 1448 1449 /// Returns true if the target machine supports a masked load operation 1450 /// for the given \p DataType and kind of access to \p Ptr. 1451 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) { 1452 return Legal->isConsecutivePtr(Ptr) && 1453 TTI.isLegalMaskedLoad(DataType, Alignment); 1454 } 1455 1456 /// Returns true if the target machine supports a masked scatter operation 1457 /// for the given \p DataType. 1458 bool isLegalMaskedScatter(Type *DataType, Align Alignment) { 1459 return TTI.isLegalMaskedScatter(DataType, Alignment); 1460 } 1461 1462 /// Returns true if the target machine supports a masked gather operation 1463 /// for the given \p DataType. 1464 bool isLegalMaskedGather(Type *DataType, Align Alignment) { 1465 return TTI.isLegalMaskedGather(DataType, Alignment); 1466 } 1467 1468 /// Returns true if the target machine can represent \p V as a masked gather 1469 /// or scatter operation. 1470 bool isLegalGatherOrScatter(Value *V) { 1471 bool LI = isa<LoadInst>(V); 1472 bool SI = isa<StoreInst>(V); 1473 if (!LI && !SI) 1474 return false; 1475 auto *Ty = getMemInstValueType(V); 1476 Align Align = getLoadStoreAlignment(V); 1477 return (LI && isLegalMaskedGather(Ty, Align)) || 1478 (SI && isLegalMaskedScatter(Ty, Align)); 1479 } 1480 1481 /// Returns true if the target machine supports all of the reduction 1482 /// variables found for the given VF. 1483 bool canVectorizeReductions(ElementCount VF) { 1484 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1485 RecurrenceDescriptor RdxDesc = Reduction.second; 1486 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1487 })); 1488 } 1489 1490 /// Returns true if \p I is an instruction that will be scalarized with 1491 /// predication. Such instructions include conditional stores and 1492 /// instructions that may divide by zero. 1493 /// If a non-zero VF has been calculated, we check if \p I will be scalarized 1494 /// with predication for that VF. 1495 bool isScalarWithPredication(Instruction *I, 1496 ElementCount VF = ElementCount::getFixed(1)); 1497 1498 // Returns true if \p I is an instruction that will be predicated either 1499 // through scalar predication or masked load/store or masked gather/scatter. 1500 // Superset of instructions that return true for isScalarWithPredication.
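// Illustrative example (not from the original source): for a loop body like
//   if (b[i] != 0)
//     a[i] = a[i] / b[i];
// the guarded store requires a mask (Legal->isMaskRequired) and the division
// is scalarized with predication because it may divide by zero, so both are
// reported as predicated instructions here.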
1501 bool isPredicatedInst(Instruction *I) { 1502 if (!blockNeedsPredication(I->getParent())) 1503 return false; 1504 // Loads and stores that need some form of masked operation are predicated 1505 // instructions. 1506 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1507 return Legal->isMaskRequired(I); 1508 return isScalarWithPredication(I); 1509 } 1510 1511 /// Returns true if \p I is a memory instruction with consecutive memory 1512 /// access that can be widened. 1513 bool 1514 memoryInstructionCanBeWidened(Instruction *I, 1515 ElementCount VF = ElementCount::getFixed(1)); 1516 1517 /// Returns true if \p I is a memory instruction in an interleaved-group 1518 /// of memory accesses that can be vectorized with wide vector loads/stores 1519 /// and shuffles. 1520 bool 1521 interleavedAccessCanBeWidened(Instruction *I, 1522 ElementCount VF = ElementCount::getFixed(1)); 1523 1524 /// Check if \p Instr belongs to any interleaved access group. 1525 bool isAccessInterleaved(Instruction *Instr) { 1526 return InterleaveInfo.isInterleaved(Instr); 1527 } 1528 1529 /// Get the interleaved access group that \p Instr belongs to. 1530 const InterleaveGroup<Instruction> * 1531 getInterleavedAccessGroup(Instruction *Instr) { 1532 return InterleaveInfo.getInterleaveGroup(Instr); 1533 } 1534 1535 /// Returns true if we're required to use a scalar epilogue for at least 1536 /// the final iteration of the original loop. 1537 bool requiresScalarEpilogue() const { 1538 if (!isScalarEpilogueAllowed()) 1539 return false; 1540 // If we might exit from anywhere but the latch, must run the exiting 1541 // iteration in scalar form. 1542 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1543 return true; 1544 return InterleaveInfo.requiresScalarEpilogue(); 1545 } 1546 1547 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1548 /// loop hint annotation. 1549 bool isScalarEpilogueAllowed() const { 1550 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1551 } 1552 1553 /// Returns true if all loop blocks should be masked to fold tail loop. 1554 bool foldTailByMasking() const { return FoldTailByMasking; } 1555 1556 bool blockNeedsPredication(BasicBlock *BB) { 1557 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1558 } 1559 1560 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1561 /// nodes to the chain of instructions representing the reductions. Uses a 1562 /// MapVector to ensure deterministic iteration order. 1563 using ReductionChainMap = 1564 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1565 1566 /// Return the chain of instructions representing an inloop reduction. 1567 const ReductionChainMap &getInLoopReductionChains() const { 1568 return InLoopReductionChains; 1569 } 1570 1571 /// Returns true if the Phi is part of an inloop reduction. 1572 bool isInLoopReduction(PHINode *Phi) const { 1573 return InLoopReductionChains.count(Phi); 1574 } 1575 1576 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1577 /// with factor VF. Return the cost of the instruction, including 1578 /// scalarization overhead if it's needed. 1579 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF); 1580 1581 /// Estimate cost of a call instruction CI if it were vectorized with factor 1582 /// VF. Return the cost of the instruction, including scalarization overhead 1583 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1584 /// scalarized - 1585 /// i.e. 
either vector version isn't available, or is too expensive. 1586 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1587 bool &NeedToScalarize); 1588 1589 /// Invalidates decisions already taken by the cost model. 1590 void invalidateCostModelingDecisions() { 1591 WideningDecisions.clear(); 1592 Uniforms.clear(); 1593 Scalars.clear(); 1594 } 1595 1596 private: 1597 unsigned NumPredStores = 0; 1598 1599 /// \return An upper bound for the vectorization factor, a power-of-2 larger 1600 /// than zero. One is returned if vectorization should best be avoided due 1601 /// to cost. 1602 ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, 1603 ElementCount UserVF); 1604 1605 /// The vectorization cost is a combination of the cost itself and a boolean 1606 /// indicating whether any of the contributing operations will actually 1607 /// operate on 1608 /// vector values after type legalization in the backend. If this latter value 1609 /// is 1610 /// false, then all operations will be scalarized (i.e. no vectorization has 1611 /// actually taken place). 1612 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1613 1614 /// Returns the expected execution cost. The unit of the cost does 1615 /// not matter because we use the 'cost' units to compare different 1616 /// vector widths. The cost that is returned is *not* normalized by 1617 /// the factor width. 1618 VectorizationCostTy expectedCost(ElementCount VF); 1619 1620 /// Returns the execution time cost of an instruction for a given vector 1621 /// width. Vector width of one means scalar. 1622 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1623 1624 /// The cost-computation logic from getInstructionCost which provides 1625 /// the vector type as an output parameter. 1626 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1627 Type *&VectorTy); 1628 1629 /// Return the cost of instructions in an inloop reduction pattern, if I is 1630 /// part of that pattern. 1631 InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF, 1632 Type *VectorTy, 1633 TTI::TargetCostKind CostKind); 1634 1635 /// Calculate vectorization cost of memory instruction \p I. 1636 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1637 1638 /// The cost computation for scalarized memory instruction. 1639 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1640 1641 /// The cost computation for interleaving group of memory instructions. 1642 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1643 1644 /// The cost computation for Gather/Scatter instruction. 1645 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1646 1647 /// The cost computation for widening instruction \p I with consecutive 1648 /// memory access. 1649 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1650 1651 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1652 /// Load: scalar load + broadcast. 1653 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1654 /// element) 1655 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1656 1657 /// Estimate the overhead of scalarizing an instruction. This is a 1658 /// convenience wrapper for the type-based getScalarizationOverhead API. 
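// Rough intuition (illustrative, not from the original source): scalarizing an
// instruction at VF = 4 emits four scalar copies, so the overhead covers the
// extractelement of each vector operand per lane plus, when the scalar results
// are consumed as a vector again, the insertelements needed to rebuild it.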
1659 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF); 1660 1661 /// Returns whether the instruction is a load or store and will be emitted 1662 /// as a vector operation. 1663 bool isConsecutiveLoadOrStore(Instruction *I); 1664 1665 /// Returns true if an artificially high cost for emulated masked memrefs 1666 /// should be used. 1667 bool useEmulatedMaskMemRefHack(Instruction *I); 1668 1669 /// Map of scalar integer values to the smallest bitwidth they can be legally 1670 /// represented as. The vector equivalents of these values should be truncated 1671 /// to this type. 1672 MapVector<Instruction *, uint64_t> MinBWs; 1673 1674 /// A type representing the costs for instructions if they were to be 1675 /// scalarized rather than vectorized. The entries are Instruction-Cost 1676 /// pairs. 1677 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; 1678 1679 /// A set containing all BasicBlocks that are known to be present after 1680 /// vectorization as predicated blocks. 1681 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; 1682 1683 /// Records whether it is allowed to have the original scalar loop execute at 1684 /// least once. This may be needed as a fallback loop in case runtime 1685 /// aliasing/dependence checks fail, or to handle the tail/remainder 1686 /// iterations when the trip count is unknown or is not divisible by the VF, 1687 /// or as a peel-loop to handle gaps in interleave-groups. 1688 /// Under optsize and when the trip count is very small we don't allow any 1689 /// iterations to execute in the scalar loop. 1690 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1691 1692 /// All blocks of the loop are to be masked to fold the tail of scalar iterations. 1693 bool FoldTailByMasking = false; 1694 1695 /// A map holding scalar costs for different vectorization factors. The 1696 /// presence of a cost for an instruction in the mapping indicates that the 1697 /// instruction will be scalarized when vectorizing with the associated 1698 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1699 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1700 1701 /// Holds the instructions known to be uniform after vectorization. 1702 /// The data is collected per VF. 1703 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1704 1705 /// Holds the instructions known to be scalar after vectorization. 1706 /// The data is collected per VF. 1707 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1708 1709 /// Holds the instructions (address computations) that are forced to be 1710 /// scalarized. 1711 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1712 1713 /// PHINodes of the reductions that should be expanded in-loop along with 1714 /// their associated chains of reduction operations, in program order from top 1715 /// (PHI) to bottom. 1716 ReductionChainMap InLoopReductionChains; 1717 1718 /// A map of inloop reduction operations and their immediate chain operand. 1719 /// FIXME: This can be removed once reductions can be costed correctly in 1720 /// vplan. This was added to allow quick lookup of the inloop operations, 1721 /// without having to loop through InLoopReductionChains. 1722 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1723 1724 /// Returns the expected difference in cost from scalarizing the expression 1725 /// feeding a predicated instruction \p PredInst.
The instructions to 1726 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1727 /// non-negative return value implies the expression will be scalarized. 1728 /// Currently, only single-use chains are considered for scalarization. 1729 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1730 ElementCount VF); 1731 1732 /// Collect the instructions that are uniform after vectorization. An 1733 /// instruction is uniform if we represent it with a single scalar value in 1734 /// the vectorized loop corresponding to each vector iteration. Examples of 1735 /// uniform instructions include pointer operands of consecutive or 1736 /// interleaved memory accesses. Note that although uniformity implies an 1737 /// instruction will be scalar, the reverse is not true. In general, a 1738 /// scalarized instruction will be represented by VF scalar values in the 1739 /// vectorized loop, each corresponding to an iteration of the original 1740 /// scalar loop. 1741 void collectLoopUniforms(ElementCount VF); 1742 1743 /// Collect the instructions that are scalar after vectorization. An 1744 /// instruction is scalar if it is known to be uniform or will be scalarized 1745 /// during vectorization. Non-uniform scalarized instructions will be 1746 /// represented by VF values in the vectorized loop, each corresponding to an 1747 /// iteration of the original scalar loop. 1748 void collectLoopScalars(ElementCount VF); 1749 1750 /// Keeps cost model vectorization decision and cost for instructions. 1751 /// Right now it is used for memory instructions only. 1752 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1753 std::pair<InstWidening, InstructionCost>>; 1754 1755 DecisionList WideningDecisions; 1756 1757 /// Returns true if \p V is expected to be vectorized and it needs to be 1758 /// extracted. 1759 bool needsExtract(Value *V, ElementCount VF) const { 1760 Instruction *I = dyn_cast<Instruction>(V); 1761 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1762 TheLoop->isLoopInvariant(I)) 1763 return false; 1764 1765 // Assume we can vectorize V (and hence we need extraction) if the 1766 // scalars are not computed yet. This can happen, because it is called 1767 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1768 // the scalars are collected. That should be a safe assumption in most 1769 // cases, because we check if the operands have vectorizable types 1770 // beforehand in LoopVectorizationLegality. 1771 return Scalars.find(VF) == Scalars.end() || 1772 !isScalarAfterVectorization(I, VF); 1773 }; 1774 1775 /// Returns a range containing only operands needing to be extracted. 1776 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1777 ElementCount VF) { 1778 return SmallVector<Value *, 4>(make_filter_range( 1779 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1780 } 1781 1782 /// Determines if we have the infrastructure to vectorize loop \p L and its 1783 /// epilogue, assuming the main loop is vectorized by \p VF. 1784 bool isCandidateForEpilogueVectorization(const Loop &L, 1785 const ElementCount VF) const; 1786 1787 /// Returns true if epilogue vectorization is considered profitable, and 1788 /// false otherwise. 1789 /// \p VF is the vectorization factor chosen for the original loop. 1790 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1791 1792 public: 1793 /// The loop that we evaluate. 
1794 Loop *TheLoop; 1795 1796 /// Predicated scalar evolution analysis. 1797 PredicatedScalarEvolution &PSE; 1798 1799 /// Loop Info analysis. 1800 LoopInfo *LI; 1801 1802 /// Vectorization legality. 1803 LoopVectorizationLegality *Legal; 1804 1805 /// Vector target information. 1806 const TargetTransformInfo &TTI; 1807 1808 /// Target Library Info. 1809 const TargetLibraryInfo *TLI; 1810 1811 /// Demanded bits analysis. 1812 DemandedBits *DB; 1813 1814 /// Assumption cache. 1815 AssumptionCache *AC; 1816 1817 /// Interface to emit optimization remarks. 1818 OptimizationRemarkEmitter *ORE; 1819 1820 const Function *TheFunction; 1821 1822 /// Loop Vectorize Hint. 1823 const LoopVectorizeHints *Hints; 1824 1825 /// The interleaved access information contains groups of interleaved accesses 1826 /// with the same stride that are close to each other. 1827 InterleavedAccessInfo &InterleaveInfo; 1828 1829 /// Values to ignore in the cost model. 1830 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1831 1832 /// Values to ignore in the cost model when VF > 1. 1833 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1834 1835 /// Profitable vector factors. 1836 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1837 }; 1838 } // end namespace llvm 1839 1840 /// Helper struct to manage generating runtime checks for vectorization. 1841 /// 1842 /// The runtime checks are created up-front in temporary blocks to allow better 1843 /// cost estimation, and are un-linked from the existing IR. After deciding to 1844 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1845 /// temporary blocks are completely removed. 1846 class GeneratedRTChecks { 1847 /// Basic block which contains the generated SCEV checks, if any. 1848 BasicBlock *SCEVCheckBlock = nullptr; 1849 1850 /// The value representing the result of the generated SCEV checks. If it is 1851 /// nullptr, either no SCEV checks have been generated or they have been used. 1852 Value *SCEVCheckCond = nullptr; 1853 1854 /// Basic block which contains the generated memory runtime checks, if any. 1855 BasicBlock *MemCheckBlock = nullptr; 1856 1857 /// The value representing the result of the generated memory runtime checks. 1858 /// If it is nullptr, either no memory runtime checks have been generated or 1859 /// they have been used. 1860 Instruction *MemRuntimeCheckCond = nullptr; 1861 1862 DominatorTree *DT; 1863 LoopInfo *LI; 1864 1865 SCEVExpander SCEVExp; 1866 SCEVExpander MemCheckExp; 1867 1868 public: 1869 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1870 const DataLayout &DL) 1871 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), 1872 MemCheckExp(SE, DL, "scev.check") {} 1873 1874 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1875 /// accurately estimate the cost of the runtime checks. The blocks are 1876 /// un-linked from the IR and are added back during vector code generation. If 1877 /// there is no vector code generation, the check blocks are removed 1878 /// completely. 1879 void Create(Loop *L, const LoopAccessInfo &LAI, 1880 const SCEVUnionPredicate &UnionPred) { 1881 1882 BasicBlock *LoopHeader = L->getHeader(); 1883 BasicBlock *Preheader = L->getLoopPreheader(); 1884 1885 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1886 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1887 // may be used by SCEVExpander. The blocks will be un-linked from their 1888 // predecessors and removed from LI & DT at the end of the function.
1889 if (!UnionPred.isAlwaysTrue()) { 1890 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1891 nullptr, "vector.scevcheck"); 1892 1893 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1894 &UnionPred, SCEVCheckBlock->getTerminator()); 1895 } 1896 1897 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1898 if (RtPtrChecking.Need) { 1899 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1900 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1901 "vector.memcheck"); 1902 1903 std::tie(std::ignore, MemRuntimeCheckCond) = 1904 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1905 RtPtrChecking.getChecks(), MemCheckExp); 1906 assert(MemRuntimeCheckCond && 1907 "no RT checks generated although RtPtrChecking " 1908 "claimed checks are required"); 1909 } 1910 1911 if (!MemCheckBlock && !SCEVCheckBlock) 1912 return; 1913 1914 // Unhook the temporary block with the checks, update various places 1915 // accordingly. 1916 if (SCEVCheckBlock) 1917 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1918 if (MemCheckBlock) 1919 MemCheckBlock->replaceAllUsesWith(Preheader); 1920 1921 if (SCEVCheckBlock) { 1922 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1923 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1924 Preheader->getTerminator()->eraseFromParent(); 1925 } 1926 if (MemCheckBlock) { 1927 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1928 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1929 Preheader->getTerminator()->eraseFromParent(); 1930 } 1931 1932 DT->changeImmediateDominator(LoopHeader, Preheader); 1933 if (MemCheckBlock) { 1934 DT->eraseNode(MemCheckBlock); 1935 LI->removeBlock(MemCheckBlock); 1936 } 1937 if (SCEVCheckBlock) { 1938 DT->eraseNode(SCEVCheckBlock); 1939 LI->removeBlock(SCEVCheckBlock); 1940 } 1941 } 1942 1943 /// Remove the created SCEV & memory runtime check blocks & instructions, if 1944 /// unused. 1945 ~GeneratedRTChecks() { 1946 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 1947 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 1948 if (!SCEVCheckCond) 1949 SCEVCleaner.markResultUsed(); 1950 1951 if (!MemRuntimeCheckCond) 1952 MemCheckCleaner.markResultUsed(); 1953 1954 if (MemRuntimeCheckCond) { 1955 auto &SE = *MemCheckExp.getSE(); 1956 // Memory runtime check generation creates compares that use expanded 1957 // values. Remove them before running the SCEVExpanderCleaners. 1958 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 1959 if (MemCheckExp.isInsertedInstruction(&I)) 1960 continue; 1961 SE.forgetValue(&I); 1962 SE.eraseValueFromMap(&I); 1963 I.eraseFromParent(); 1964 } 1965 } 1966 MemCheckCleaner.cleanup(); 1967 SCEVCleaner.cleanup(); 1968 1969 if (SCEVCheckCond) 1970 SCEVCheckBlock->eraseFromParent(); 1971 if (MemRuntimeCheckCond) 1972 MemCheckBlock->eraseFromParent(); 1973 } 1974 1975 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 1976 /// adjusts the branches to branch to the vector preheader or \p Bypass, 1977 /// depending on the generated condition. 
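// Rough CFG sketch of the result (illustrative only):
//
//   Pred (single predecessor of the vector preheader)
//        |
//   SCEVCheckBlock --(SCEVCheckCond)--> Bypass (scalar loop)
//        |
//   LoopVectorPreHeader
//
// The predecessor's branch is rewired to the check block, which conditionally
// jumps to the bypass block or falls through to the vector preheader.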
1978 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 1979 BasicBlock *LoopVectorPreHeader, 1980 BasicBlock *LoopExitBlock) { 1981 if (!SCEVCheckCond) 1982 return nullptr; 1983 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 1984 if (C->isZero()) 1985 return nullptr; 1986 1987 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 1988 1989 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 1990 // Create new preheader for vector loop. 1991 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 1992 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 1993 1994 SCEVCheckBlock->getTerminator()->eraseFromParent(); 1995 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 1996 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 1997 SCEVCheckBlock); 1998 1999 DT->addNewBlock(SCEVCheckBlock, Pred); 2000 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2001 2002 ReplaceInstWithInst( 2003 SCEVCheckBlock->getTerminator(), 2004 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2005 // Mark the check as used, to prevent it from being removed during cleanup. 2006 SCEVCheckCond = nullptr; 2007 return SCEVCheckBlock; 2008 } 2009 2010 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2011 /// the branches to branch to the vector preheader or \p Bypass, depending on 2012 /// the generated condition. 2013 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2014 BasicBlock *LoopVectorPreHeader) { 2015 // Check if we generated code that checks in runtime if arrays overlap. 2016 if (!MemRuntimeCheckCond) 2017 return nullptr; 2018 2019 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2020 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2021 MemCheckBlock); 2022 2023 DT->addNewBlock(MemCheckBlock, Pred); 2024 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2025 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2026 2027 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2028 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2029 2030 ReplaceInstWithInst( 2031 MemCheckBlock->getTerminator(), 2032 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2033 MemCheckBlock->getTerminator()->setDebugLoc( 2034 Pred->getTerminator()->getDebugLoc()); 2035 2036 // Mark the check as used, to prevent it from being removed during cleanup. 2037 MemRuntimeCheckCond = nullptr; 2038 return MemCheckBlock; 2039 } 2040 }; 2041 2042 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2043 // vectorization. The loop needs to be annotated with #pragma omp simd 2044 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2045 // vector length information is not provided, vectorization is not considered 2046 // explicit. Interleave hints are not allowed either. These limitations will be 2047 // relaxed in the future. 2048 // Please, note that we are currently forced to abuse the pragma 'clang 2049 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2050 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2051 // provides *explicit vectorization hints* (LV can bypass legal checks and 2052 // assume that vectorization is legal). However, both hints are implemented 2053 // using the same metadata (llvm.loop.vectorize, processed by 2054 // LoopVectorizeHints). This will be fixed in the future when the native IR 2055 // representation for pragma 'omp simd' is introduced. 
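// Illustrative source-level example of such an annotation (hypothetical):
//   #pragma omp simd simdlen(4)
//   for (int i = 0; i < N; ++i)     // outer loop, explicit VF of 4
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];
// Without the explicit vector-length clause the outer loop is not considered
// explicitly vectorized and is skipped here.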
2056 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2057 OptimizationRemarkEmitter *ORE) { 2058 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2059 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2060 2061 // Only outer loops with an explicit vectorization hint are supported. 2062 // Unannotated outer loops are ignored. 2063 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2064 return false; 2065 2066 Function *Fn = OuterLp->getHeader()->getParent(); 2067 if (!Hints.allowVectorization(Fn, OuterLp, 2068 true /*VectorizeOnlyWhenForced*/)) { 2069 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2070 return false; 2071 } 2072 2073 if (Hints.getInterleave() > 1) { 2074 // TODO: Interleave support is future work. 2075 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2076 "outer loops.\n"); 2077 Hints.emitRemarkWithHints(); 2078 return false; 2079 } 2080 2081 return true; 2082 } 2083 2084 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2085 OptimizationRemarkEmitter *ORE, 2086 SmallVectorImpl<Loop *> &V) { 2087 // Collect inner loops and outer loops without irreducible control flow. For 2088 // now, only collect outer loops that have explicit vectorization hints. If we 2089 // are stress testing the VPlan H-CFG construction, we collect the outermost 2090 // loop of every loop nest. 2091 if (L.isInnermost() || VPlanBuildStressTest || 2092 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2093 LoopBlocksRPO RPOT(&L); 2094 RPOT.perform(LI); 2095 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2096 V.push_back(&L); 2097 // TODO: Collect inner loops inside marked outer loops in case 2098 // vectorization fails for the outer loop. Do not invoke 2099 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2100 // already known to be reducible. We can use an inherited attribute for 2101 // that. 2102 return; 2103 } 2104 } 2105 for (Loop *InnerL : L) 2106 collectSupportedLoops(*InnerL, LI, ORE, V); 2107 } 2108 2109 namespace { 2110 2111 /// The LoopVectorize Pass. 2112 struct LoopVectorize : public FunctionPass { 2113 /// Pass identification, replacement for typeid 2114 static char ID; 2115 2116 LoopVectorizePass Impl; 2117 2118 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2119 bool VectorizeOnlyWhenForced = false) 2120 : FunctionPass(ID), 2121 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2122 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2123 } 2124 2125 bool runOnFunction(Function &F) override { 2126 if (skipFunction(F)) 2127 return false; 2128 2129 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2130 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2131 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2132 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2133 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2134 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2135 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2136 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2137 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2138 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2139 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2140 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2141 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2142 2143 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2144 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2145 2146 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2147 GetLAA, *ORE, PSI).MadeAnyChange; 2148 } 2149 2150 void getAnalysisUsage(AnalysisUsage &AU) const override { 2151 AU.addRequired<AssumptionCacheTracker>(); 2152 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2153 AU.addRequired<DominatorTreeWrapperPass>(); 2154 AU.addRequired<LoopInfoWrapperPass>(); 2155 AU.addRequired<ScalarEvolutionWrapperPass>(); 2156 AU.addRequired<TargetTransformInfoWrapperPass>(); 2157 AU.addRequired<AAResultsWrapperPass>(); 2158 AU.addRequired<LoopAccessLegacyAnalysis>(); 2159 AU.addRequired<DemandedBitsWrapperPass>(); 2160 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2161 AU.addRequired<InjectTLIMappingsLegacy>(); 2162 2163 // We currently do not preserve loopinfo/dominator analyses with outer loop 2164 // vectorization. Until this is addressed, mark these analyses as preserved 2165 // only for non-VPlan-native path. 2166 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2167 if (!EnableVPlanNativePath) { 2168 AU.addPreserved<LoopInfoWrapperPass>(); 2169 AU.addPreserved<DominatorTreeWrapperPass>(); 2170 } 2171 2172 AU.addPreserved<BasicAAWrapperPass>(); 2173 AU.addPreserved<GlobalsAAWrapperPass>(); 2174 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2175 } 2176 }; 2177 2178 } // end anonymous namespace 2179 2180 //===----------------------------------------------------------------------===// 2181 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2182 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2183 //===----------------------------------------------------------------------===// 2184 2185 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2186 // We need to place the broadcast of invariant variables outside the loop, 2187 // but only if it's proven safe to do so. Else, broadcast will be inside 2188 // vector loop body. 2189 Instruction *Instr = dyn_cast<Instruction>(V); 2190 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2191 (!Instr || 2192 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2193 // Place the code for broadcasting invariant variables in the new preheader. 2194 IRBuilder<>::InsertPointGuard Guard(Builder); 2195 if (SafeToHoist) 2196 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2197 2198 // Broadcast the scalar into all locations in the vector. 
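// With a fixed VF this typically expands to an insertelement into lane 0
// followed by a zero-mask shufflevector, roughly (illustrative, VF = 4, i32):
//   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %v, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> poison, <4 x i32> zeroinitializer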
2199 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2200 2201 return Shuf; 2202 } 2203 2204 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2205 const InductionDescriptor &II, Value *Step, Value *Start, 2206 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2207 VPTransformState &State) { 2208 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2209 "Expected either an induction phi-node or a truncate of it!"); 2210 2211 // Construct the initial value of the vector IV in the vector loop preheader 2212 auto CurrIP = Builder.saveIP(); 2213 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2214 if (isa<TruncInst>(EntryVal)) { 2215 assert(Start->getType()->isIntegerTy() && 2216 "Truncation requires an integer type"); 2217 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2218 Step = Builder.CreateTrunc(Step, TruncType); 2219 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2220 } 2221 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2222 Value *SteppedStart = 2223 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2224 2225 // We create vector phi nodes for both integer and floating-point induction 2226 // variables. Here, we determine the kind of arithmetic we will perform. 2227 Instruction::BinaryOps AddOp; 2228 Instruction::BinaryOps MulOp; 2229 if (Step->getType()->isIntegerTy()) { 2230 AddOp = Instruction::Add; 2231 MulOp = Instruction::Mul; 2232 } else { 2233 AddOp = II.getInductionOpcode(); 2234 MulOp = Instruction::FMul; 2235 } 2236 2237 // Multiply the vectorization factor by the step using integer or 2238 // floating-point arithmetic as appropriate. 2239 Value *ConstVF = 2240 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 2241 Value *Mul = Builder.CreateBinOp(MulOp, Step, ConstVF); 2242 2243 // Create a vector splat to use in the induction update. 2244 // 2245 // FIXME: If the step is non-constant, we create the vector splat with 2246 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2247 // handle a constant vector splat. 2248 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2249 Value *SplatVF = isa<Constant>(Mul) 2250 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2251 : Builder.CreateVectorSplat(VF, Mul); 2252 Builder.restoreIP(CurrIP); 2253 2254 // We may need to add the step a number of times, depending on the unroll 2255 // factor. The last of those goes into the PHI. 2256 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2257 &*LoopVectorBody->getFirstInsertionPt()); 2258 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2259 Instruction *LastInduction = VecInd; 2260 for (unsigned Part = 0; Part < UF; ++Part) { 2261 State.set(Def, LastInduction, Part); 2262 2263 if (isa<TruncInst>(EntryVal)) 2264 addMetadata(LastInduction, EntryVal); 2265 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2266 State, Part); 2267 2268 LastInduction = cast<Instruction>( 2269 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2270 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2271 } 2272 2273 // Move the last step to the end of the latch block. This ensures consistent 2274 // placement of all induction updates. 
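// Illustrative shape of the result for VF = 4, UF = 2 and an i32 step of 1
// (hypothetical block/value names, not taken from a real test):
//   %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
//                                 [ %vec.ind.next, %vector.body ]
//   %step.add     = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
//   %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>
// where %vec.ind.next is the last step that gets moved to the latch below.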
2275 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2276 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2277 auto *ICmp = cast<Instruction>(Br->getCondition()); 2278 LastInduction->moveBefore(ICmp); 2279 LastInduction->setName("vec.ind.next"); 2280 2281 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2282 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2283 } 2284 2285 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2286 return Cost->isScalarAfterVectorization(I, VF) || 2287 Cost->isProfitableToScalarize(I, VF); 2288 } 2289 2290 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2291 if (shouldScalarizeInstruction(IV)) 2292 return true; 2293 auto isScalarInst = [&](User *U) -> bool { 2294 auto *I = cast<Instruction>(U); 2295 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2296 }; 2297 return llvm::any_of(IV->users(), isScalarInst); 2298 } 2299 2300 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2301 const InductionDescriptor &ID, const Instruction *EntryVal, 2302 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2303 unsigned Part, unsigned Lane) { 2304 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2305 "Expected either an induction phi-node or a truncate of it!"); 2306 2307 // This induction variable is not the phi from the original loop but the 2308 // newly-created IV based on the proof that casted Phi is equal to the 2309 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2310 // re-uses the same InductionDescriptor that original IV uses but we don't 2311 // have to do any recording in this case - that is done when original IV is 2312 // processed. 2313 if (isa<TruncInst>(EntryVal)) 2314 return; 2315 2316 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2317 if (Casts.empty()) 2318 return; 2319 // Only the first Cast instruction in the Casts vector is of interest. 2320 // The rest of the Casts (if exist) have no uses outside the 2321 // induction update chain itself. 2322 if (Lane < UINT_MAX) 2323 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2324 else 2325 State.set(CastDef, VectorLoopVal, Part); 2326 } 2327 2328 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2329 TruncInst *Trunc, VPValue *Def, 2330 VPValue *CastDef, 2331 VPTransformState &State) { 2332 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2333 "Primary induction variable must have an integer type"); 2334 2335 auto II = Legal->getInductionVars().find(IV); 2336 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2337 2338 auto ID = II->second; 2339 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2340 2341 // The value from the original loop to which we are mapping the new induction 2342 // variable. 2343 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2344 2345 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2346 2347 // Generate code for the induction step. 
Note that induction steps are 2348 // required to be loop-invariant 2349 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2350 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2351 "Induction step should be loop invariant"); 2352 if (PSE.getSE()->isSCEVable(IV->getType())) { 2353 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2354 return Exp.expandCodeFor(Step, Step->getType(), 2355 LoopVectorPreHeader->getTerminator()); 2356 } 2357 return cast<SCEVUnknown>(Step)->getValue(); 2358 }; 2359 2360 // The scalar value to broadcast. This is derived from the canonical 2361 // induction variable. If a truncation type is given, truncate the canonical 2362 // induction variable and step. Otherwise, derive these values from the 2363 // induction descriptor. 2364 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2365 Value *ScalarIV = Induction; 2366 if (IV != OldInduction) { 2367 ScalarIV = IV->getType()->isIntegerTy() 2368 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2369 : Builder.CreateCast(Instruction::SIToFP, Induction, 2370 IV->getType()); 2371 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2372 ScalarIV->setName("offset.idx"); 2373 } 2374 if (Trunc) { 2375 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2376 assert(Step->getType()->isIntegerTy() && 2377 "Truncation requires an integer step"); 2378 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2379 Step = Builder.CreateTrunc(Step, TruncType); 2380 } 2381 return ScalarIV; 2382 }; 2383 2384 // Create the vector values from the scalar IV, in the absence of creating a 2385 // vector IV. 2386 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2387 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2388 for (unsigned Part = 0; Part < UF; ++Part) { 2389 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2390 Value *EntryPart = 2391 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2392 ID.getInductionOpcode()); 2393 State.set(Def, EntryPart, Part); 2394 if (Trunc) 2395 addMetadata(EntryPart, Trunc); 2396 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2397 State, Part); 2398 } 2399 }; 2400 2401 // Fast-math-flags propagate from the original induction instruction. 2402 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2403 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2404 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2405 2406 // Now do the actual transformations, and start with creating the step value. 2407 Value *Step = CreateStepValue(ID.getStep()); 2408 if (VF.isZero() || VF.isScalar()) { 2409 Value *ScalarIV = CreateScalarIV(Step); 2410 CreateSplatIV(ScalarIV, Step); 2411 return; 2412 } 2413 2414 // Determine if we want a scalar version of the induction variable. This is 2415 // true if the induction variable itself is not widened, or if it has at 2416 // least one user in the loop that is not widened. 2417 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2418 if (!NeedsScalarIV) { 2419 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2420 State); 2421 return; 2422 } 2423 2424 // Try to create a new independent vector induction variable. If we can't 2425 // create the phi node, we will splat the scalar induction variable in each 2426 // loop iteration. 
2427 if (!shouldScalarizeInstruction(EntryVal)) { 2428 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2429 State); 2430 Value *ScalarIV = CreateScalarIV(Step); 2431 // Create scalar steps that can be used by instructions we will later 2432 // scalarize. Note that the addition of the scalar steps will not increase 2433 // the number of instructions in the loop in the common case prior to 2434 // InstCombine. We will be trading one vector extract for each scalar step. 2435 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2436 return; 2437 } 2438 2439 // All IV users are scalar instructions, so only emit a scalar IV, not a 2440 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2441 // predicate used by the masked loads/stores. 2442 Value *ScalarIV = CreateScalarIV(Step); 2443 if (!Cost->isScalarEpilogueAllowed()) 2444 CreateSplatIV(ScalarIV, Step); 2445 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2446 } 2447 2448 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2449 Instruction::BinaryOps BinOp) { 2450 // Create and check the types. 2451 auto *ValVTy = cast<FixedVectorType>(Val->getType()); 2452 int VLen = ValVTy->getNumElements(); 2453 2454 Type *STy = Val->getType()->getScalarType(); 2455 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2456 "Induction Step must be an integer or FP"); 2457 assert(Step->getType() == STy && "Step has wrong type"); 2458 2459 SmallVector<Constant *, 8> Indices; 2460 2461 if (STy->isIntegerTy()) { 2462 // Create a vector of consecutive numbers from zero to VF. 2463 for (int i = 0; i < VLen; ++i) 2464 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2465 2466 // Add the consecutive indices to the vector value. 2467 Constant *Cv = ConstantVector::get(Indices); 2468 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2469 Step = Builder.CreateVectorSplat(VLen, Step); 2470 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2471 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2472 // which can be found from the original scalar operations. 2473 Step = Builder.CreateMul(Cv, Step); 2474 return Builder.CreateAdd(Val, Step, "induction"); 2475 } 2476 2477 // Floating point induction. 2478 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2479 "Binary Opcode should be specified for FP induction"); 2480 // Create a vector of consecutive numbers from zero to VF. 2481 for (int i = 0; i < VLen; ++i) 2482 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2483 2484 // Add the consecutive indices to the vector value. 2485 // Floating-point operations inherit FMF via the builder's flags. 2486 Constant *Cv = ConstantVector::get(Indices); 2487 Step = Builder.CreateVectorSplat(VLen, Step); 2488 Value *MulOp = Builder.CreateFMul(Cv, Step); 2489 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2490 } 2491 2492 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2493 Instruction *EntryVal, 2494 const InductionDescriptor &ID, 2495 VPValue *Def, VPValue *CastDef, 2496 VPTransformState &State) { 2497 // We shouldn't have to build scalar steps if we aren't vectorizing. 2498 assert(VF.isVector() && "VF should be greater than one"); 2499 // Get the value type and ensure it and the step have the same integer type. 
2500 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2501 assert(ScalarIVTy == Step->getType() && 2502 "Val and Step should have the same type"); 2503 2504 // We build scalar steps for both integer and floating-point induction 2505 // variables. Here, we determine the kind of arithmetic we will perform. 2506 Instruction::BinaryOps AddOp; 2507 Instruction::BinaryOps MulOp; 2508 if (ScalarIVTy->isIntegerTy()) { 2509 AddOp = Instruction::Add; 2510 MulOp = Instruction::Mul; 2511 } else { 2512 AddOp = ID.getInductionOpcode(); 2513 MulOp = Instruction::FMul; 2514 } 2515 2516 // Determine the number of scalars we need to generate for each unroll 2517 // iteration. If EntryVal is uniform, we only need to generate the first 2518 // lane. Otherwise, we generate all VF values. 2519 unsigned Lanes = 2520 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2521 ? 1 2522 : VF.getKnownMinValue(); 2523 assert((!VF.isScalable() || Lanes == 1) && 2524 "Should never scalarize a scalable vector"); 2525 // Compute the scalar steps and save the results in State. 2526 for (unsigned Part = 0; Part < UF; ++Part) { 2527 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2528 auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2529 ScalarIVTy->getScalarSizeInBits()); 2530 Value *StartIdx = 2531 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2532 if (ScalarIVTy->isFloatingPointTy()) 2533 StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); 2534 StartIdx = Builder.CreateBinOp( 2535 AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2536 // The step returned by `createStepForVF` is a runtime-evaluated value 2537 // when VF is scalable. Otherwise, it should be folded into a Constant. 2538 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2539 "Expected StartIdx to be folded to a constant when VF is not " 2540 "scalable"); 2541 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2542 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2543 State.set(Def, Add, VPIteration(Part, Lane)); 2544 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2545 Part, Lane); 2546 } 2547 } 2548 } 2549 2550 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2551 const VPIteration &Instance, 2552 VPTransformState &State) { 2553 Value *ScalarInst = State.get(Def, Instance); 2554 Value *VectorValue = State.get(Def, Instance.Part); 2555 VectorValue = Builder.CreateInsertElement( 2556 VectorValue, ScalarInst, 2557 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2558 State.set(Def, VectorValue, Instance.Part); 2559 } 2560 2561 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2562 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2563 return Builder.CreateVectorReverse(Vec, "reverse"); 2564 } 2565 2566 // Return whether we allow using masked interleave-groups (for dealing with 2567 // strided loads/stores that reside in predicated blocks, or for dealing 2568 // with gaps). 2569 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2570 // If an override option has been passed in for interleaved accesses, use it. 2571 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2572 return EnableMaskedInterleavedMemAccesses; 2573 2574 return TTI.enableMaskedInterleavedAccessVectorization(); 2575 } 2576 2577 // Try to vectorize the interleave group that \p Instr belongs to. 2578 // 2579 // E.g. 
Translate the following interleaved load group (factor = 3): 2580 // for (i = 0; i < N; i+=3) { 2581 // R = Pic[i]; // Member of index 0 2582 // G = Pic[i+1]; // Member of index 1 2583 // B = Pic[i+2]; // Member of index 2 2584 // ... // do something to R, G, B 2585 // } 2586 // To: 2587 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2588 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2589 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2590 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2591 // 2592 // Or translate the following interleaved store group (factor = 3): 2593 // for (i = 0; i < N; i+=3) { 2594 // ... do something to R, G, B 2595 // Pic[i] = R; // Member of index 0 2596 // Pic[i+1] = G; // Member of index 1 2597 // Pic[i+2] = B; // Member of index 2 2598 // } 2599 // To: 2600 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2601 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2602 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2603 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2604 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2605 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2606 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2607 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2608 VPValue *BlockInMask) { 2609 Instruction *Instr = Group->getInsertPos(); 2610 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2611 2612 // Prepare for the vector type of the interleaved load/store. 2613 Type *ScalarTy = getMemInstValueType(Instr); 2614 unsigned InterleaveFactor = Group->getFactor(); 2615 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2616 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2617 2618 // Prepare for the new pointers. 2619 SmallVector<Value *, 2> AddrParts; 2620 unsigned Index = Group->getIndex(Instr); 2621 2622 // TODO: extend the masked interleaved-group support to reversed access. 2623 assert((!BlockInMask || !Group->isReverse()) && 2624 "Reversed masked interleave-group not supported."); 2625 2626 // If the group is reverse, adjust the index to refer to the last vector lane 2627 // instead of the first. We adjust the index from the first vector lane, 2628 // rather than directly getting the pointer for lane VF - 1, because the 2629 // pointer operand of the interleaved access is supposed to be uniform. For 2630 // uniform instructions, we're only required to generate a value for the 2631 // first vector lane in each unroll iteration. 2632 assert(!VF.isScalable() && 2633 "scalable vector reverse operation is not implemented"); 2634 if (Group->isReverse()) 2635 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2636 2637 for (unsigned Part = 0; Part < UF; Part++) { 2638 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2639 setDebugLocFromInst(Builder, AddrPart); 2640 2641 // Note that the current instruction could be at any member index, so the 2642 // address needs to be adjusted to that of the member at index 0. 2643 // 2644 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2645 // b = A[i]; // Member of index 0 2646 // The current pointer points to A[i+1]; adjust it to point to A[i]. 2647 // 2648 // E.g. A[i+1] = a; // Member of index 1 2649 // A[i] = b; // Member of index 0 2650 // A[i+2] = c; // Member of index 2 (Current instruction) 2651 // The current pointer points to A[i+2]; adjust it to point to A[i].
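// Illustrative IR for the adjustment below (hypothetical): with Index = 2 and
// an i32 element type, the member-0 address is computed as
//   %adjusted = getelementptr i32, i32* %addr, i32 -2
// before being bitcast to the wide vector pointer type.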
2652 2653 bool InBounds = false; 2654 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2655 InBounds = gep->isInBounds(); 2656 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2657 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2658 2659 // Cast to the vector pointer type. 2660 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2661 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2662 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2663 } 2664 2665 setDebugLocFromInst(Builder, Instr); 2666 Value *PoisonVec = PoisonValue::get(VecTy); 2667 2668 Value *MaskForGaps = nullptr; 2669 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2670 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2671 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2672 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2673 } 2674 2675 // Vectorize the interleaved load group. 2676 if (isa<LoadInst>(Instr)) { 2677 // For each unroll part, create a wide load for the group. 2678 SmallVector<Value *, 2> NewLoads; 2679 for (unsigned Part = 0; Part < UF; Part++) { 2680 Instruction *NewLoad; 2681 if (BlockInMask || MaskForGaps) { 2682 assert(useMaskedInterleavedAccesses(*TTI) && 2683 "masked interleaved groups are not allowed."); 2684 Value *GroupMask = MaskForGaps; 2685 if (BlockInMask) { 2686 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2687 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2688 Value *ShuffledMask = Builder.CreateShuffleVector( 2689 BlockInMaskPart, 2690 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2691 "interleaved.mask"); 2692 GroupMask = MaskForGaps 2693 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2694 MaskForGaps) 2695 : ShuffledMask; 2696 } 2697 NewLoad = 2698 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2699 GroupMask, PoisonVec, "wide.masked.vec"); 2700 } 2701 else 2702 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2703 Group->getAlign(), "wide.vec"); 2704 Group->addMetadata(NewLoad); 2705 NewLoads.push_back(NewLoad); 2706 } 2707 2708 // For each member in the group, shuffle out the appropriate data from the 2709 // wide loads. 2710 unsigned J = 0; 2711 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2712 Instruction *Member = Group->getMember(I); 2713 2714 // Skip the gaps in the group. 2715 if (!Member) 2716 continue; 2717 2718 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2719 auto StrideMask = 2720 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2721 for (unsigned Part = 0; Part < UF; Part++) { 2722 Value *StridedVec = Builder.CreateShuffleVector( 2723 NewLoads[Part], StrideMask, "strided.vec"); 2724 2725 // If this member has different type, cast the result type. 2726 if (Member->getType() != ScalarTy) { 2727 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2728 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2729 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2730 } 2731 2732 if (Group->isReverse()) 2733 StridedVec = reverseVector(StridedVec); 2734 2735 State.set(VPDefs[J], StridedVec, Part); 2736 } 2737 ++J; 2738 } 2739 return; 2740 } 2741 2742 // The sub vector type for current instruction. 
2743 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2744 auto *SubVT = VectorType::get(ScalarTy, VF); 2745 2746 // Vectorize the interleaved store group. 2747 for (unsigned Part = 0; Part < UF; Part++) { 2748 // Collect the stored vector from each member. 2749 SmallVector<Value *, 4> StoredVecs; 2750 for (unsigned i = 0; i < InterleaveFactor; i++) { 2751 // Interleaved store group doesn't allow a gap, so each index has a member 2752 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2753 2754 Value *StoredVec = State.get(StoredValues[i], Part); 2755 2756 if (Group->isReverse()) 2757 StoredVec = reverseVector(StoredVec); 2758 2759 // If this member has different type, cast it to a unified type. 2760 2761 if (StoredVec->getType() != SubVT) 2762 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2763 2764 StoredVecs.push_back(StoredVec); 2765 } 2766 2767 // Concatenate all vectors into a wide vector. 2768 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2769 2770 // Interleave the elements in the wide vector. 2771 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2772 Value *IVec = Builder.CreateShuffleVector( 2773 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2774 "interleaved.vec"); 2775 2776 Instruction *NewStoreInstr; 2777 if (BlockInMask) { 2778 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2779 Value *ShuffledMask = Builder.CreateShuffleVector( 2780 BlockInMaskPart, 2781 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2782 "interleaved.mask"); 2783 NewStoreInstr = Builder.CreateMaskedStore( 2784 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2785 } 2786 else 2787 NewStoreInstr = 2788 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2789 2790 Group->addMetadata(NewStoreInstr); 2791 } 2792 } 2793 2794 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2795 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2796 VPValue *StoredValue, VPValue *BlockInMask) { 2797 // Attempt to issue a wide load. 2798 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2799 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2800 2801 assert((LI || SI) && "Invalid Load/Store instruction"); 2802 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2803 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2804 2805 LoopVectorizationCostModel::InstWidening Decision = 2806 Cost->getWideningDecision(Instr, VF); 2807 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2808 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2809 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2810 "CM decision is not to widen the memory instruction"); 2811 2812 Type *ScalarDataTy = getMemInstValueType(Instr); 2813 2814 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2815 const Align Alignment = getLoadStoreAlignment(Instr); 2816 2817 // Determine if the pointer operand of the access is either consecutive or 2818 // reverse consecutive. 2819 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2820 bool ConsecutiveStride = 2821 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2822 bool CreateGatherScatter = 2823 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2824 2825 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2826 // gather/scatter. Otherwise Decision should have been to Scalarize. 
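  // For illustration (a hypothetical loop body): an access like a[i] yields a
  // consecutive pointer and is widened below into a single wide load/store,
  // whereas an access like a[b[i]] yields a widened GEP (a vector of pointers)
  // and is emitted as a masked gather/scatter instead.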
2827 assert((ConsecutiveStride || CreateGatherScatter) && 2828 "The instruction should be scalarized"); 2829 (void)ConsecutiveStride; 2830 2831 VectorParts BlockInMaskParts(UF); 2832 bool isMaskRequired = BlockInMask; 2833 if (isMaskRequired) 2834 for (unsigned Part = 0; Part < UF; ++Part) 2835 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2836 2837 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2838 // Calculate the pointer for the specific unroll-part. 2839 GetElementPtrInst *PartPtr = nullptr; 2840 2841 bool InBounds = false; 2842 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2843 InBounds = gep->isInBounds(); 2844 if (Reverse) { 2845 // If the address is consecutive but reversed, then the 2846 // wide store needs to start at the last vector element. 2847 // RunTimeVF = VScale * VF.getKnownMinValue() 2848 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2849 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2850 // NumElt = -Part * RunTimeVF 2851 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2852 // LastLane = 1 - RunTimeVF 2853 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2854 PartPtr = 2855 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2856 PartPtr->setIsInBounds(InBounds); 2857 PartPtr = cast<GetElementPtrInst>( 2858 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2859 PartPtr->setIsInBounds(InBounds); 2860 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2861 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2862 } else { 2863 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2864 PartPtr = cast<GetElementPtrInst>( 2865 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2866 PartPtr->setIsInBounds(InBounds); 2867 } 2868 2869 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2870 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2871 }; 2872 2873 // Handle Stores: 2874 if (SI) { 2875 setDebugLocFromInst(Builder, SI); 2876 2877 for (unsigned Part = 0; Part < UF; ++Part) { 2878 Instruction *NewSI = nullptr; 2879 Value *StoredVal = State.get(StoredValue, Part); 2880 if (CreateGatherScatter) { 2881 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2882 Value *VectorGep = State.get(Addr, Part); 2883 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2884 MaskPart); 2885 } else { 2886 if (Reverse) { 2887 // If we store to reverse consecutive memory locations, then we need 2888 // to reverse the order of elements in the stored value. 2889 StoredVal = reverseVector(StoredVal); 2890 // We don't want to update the value in the map as it might be used in 2891 // another expression. So don't call resetVectorValue(StoredVal). 2892 } 2893 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2894 if (isMaskRequired) 2895 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2896 BlockInMaskParts[Part]); 2897 else 2898 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2899 } 2900 addMetadata(NewSI, SI); 2901 } 2902 return; 2903 } 2904 2905 // Handle loads. 2906 assert(LI && "Must have a load instruction"); 2907 setDebugLocFromInst(Builder, LI); 2908 for (unsigned Part = 0; Part < UF; ++Part) { 2909 Value *NewLI; 2910 if (CreateGatherScatter) { 2911 Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; 2912 Value *VectorGep = State.get(Addr, Part); 2913 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2914 nullptr, "wide.masked.gather"); 2915 addMetadata(NewLI, LI); 2916 } else { 2917 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2918 if (isMaskRequired) 2919 NewLI = Builder.CreateMaskedLoad( 2920 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2921 "wide.masked.load"); 2922 else 2923 NewLI = 2924 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2925 2926 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2927 addMetadata(NewLI, LI); 2928 if (Reverse) 2929 NewLI = reverseVector(NewLI); 2930 } 2931 2932 State.set(Def, NewLI, Part); 2933 } 2934 } 2935 2936 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 2937 VPUser &User, 2938 const VPIteration &Instance, 2939 bool IfPredicateInstr, 2940 VPTransformState &State) { 2941 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2942 2943 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2944 // the first lane and part. 2945 if (isa<NoAliasScopeDeclInst>(Instr)) 2946 if (!Instance.isFirstIteration()) 2947 return; 2948 2949 setDebugLocFromInst(Builder, Instr); 2950 2951 // Does this instruction return a value ? 2952 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2953 2954 Instruction *Cloned = Instr->clone(); 2955 if (!IsVoidRetTy) 2956 Cloned->setName(Instr->getName() + ".cloned"); 2957 2958 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 2959 Builder.GetInsertPoint()); 2960 // Replace the operands of the cloned instructions with their scalar 2961 // equivalents in the new loop. 2962 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2963 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 2964 auto InputInstance = Instance; 2965 if (!Operand || !OrigLoop->contains(Operand) || 2966 (Cost->isUniformAfterVectorization(Operand, State.VF))) 2967 InputInstance.Lane = VPLane::getFirstLane(); 2968 auto *NewOp = State.get(User.getOperand(op), InputInstance); 2969 Cloned->setOperand(op, NewOp); 2970 } 2971 addNewMetadata(Cloned, Instr); 2972 2973 // Place the cloned scalar in the new loop. 2974 Builder.Insert(Cloned); 2975 2976 State.set(Def, Cloned, Instance); 2977 2978 // If we just cloned a new assumption, add it the assumption cache. 2979 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2980 if (II->getIntrinsicID() == Intrinsic::assume) 2981 AC->registerAssumption(II); 2982 2983 // End if-block. 2984 if (IfPredicateInstr) 2985 PredicatedInstructions.push_back(Cloned); 2986 } 2987 2988 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2989 Value *End, Value *Step, 2990 Instruction *DL) { 2991 BasicBlock *Header = L->getHeader(); 2992 BasicBlock *Latch = L->getLoopLatch(); 2993 // As we're just creating this loop, it's possible no latch exists 2994 // yet. If so, use the header as this will be a single block loop. 2995 if (!Latch) 2996 Latch = Header; 2997 2998 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2999 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3000 setDebugLocFromInst(Builder, OldInst); 3001 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 3002 3003 Builder.SetInsertPoint(Latch->getTerminator()); 3004 setDebugLocFromInst(Builder, OldInst); 3005 3006 // Create i+1 and fill the PHINode. 
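  // For illustration, with hypothetical values Start = 0, End = %n.vec and
  // Step = VF * UF = 8, the skeleton produced by this function is roughly:
  //
  //   vector.body:
  //     %index = phi i64 [ 0, %preheader ], [ %index.next, %vector.body ]
  //     ...
  //     %index.next = add i64 %index, 8
  //     %cmp = icmp eq i64 %index.next, %n.vec
  //     br i1 %cmp, label %exit, label %vector.body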
3007 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 3008 Induction->addIncoming(Start, L->getLoopPreheader()); 3009 Induction->addIncoming(Next, Latch); 3010 // Create the compare. 3011 Value *ICmp = Builder.CreateICmpEQ(Next, End); 3012 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3013 3014 // Now we have two terminators. Remove the old one from the block. 3015 Latch->getTerminator()->eraseFromParent(); 3016 3017 return Induction; 3018 } 3019 3020 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3021 if (TripCount) 3022 return TripCount; 3023 3024 assert(L && "Create Trip Count for null loop."); 3025 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3026 // Find the loop boundaries. 3027 ScalarEvolution *SE = PSE.getSE(); 3028 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3029 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3030 "Invalid loop count"); 3031 3032 Type *IdxTy = Legal->getWidestInductionType(); 3033 assert(IdxTy && "No type for induction"); 3034 3035 // The exit count might have the type of i64 while the phi is i32. This can 3036 // happen if we have an induction variable that is sign extended before the 3037 // compare. The only way that we get a backedge taken count is that the 3038 // induction variable was signed and as such will not overflow. In such a case 3039 // truncation is legal. 3040 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3041 IdxTy->getPrimitiveSizeInBits()) 3042 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3043 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3044 3045 // Get the total trip count from the count by adding 1. 3046 const SCEV *ExitCount = SE->getAddExpr( 3047 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3048 3049 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3050 3051 // Expand the trip count and place the new instructions in the preheader. 3052 // Notice that the pre-header does not change, only the loop body. 3053 SCEVExpander Exp(*SE, DL, "induction"); 3054 3055 // Count holds the overall loop count (N). 3056 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3057 L->getLoopPreheader()->getTerminator()); 3058 3059 if (TripCount->getType()->isPointerTy()) 3060 TripCount = 3061 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3062 L->getLoopPreheader()->getTerminator()); 3063 3064 return TripCount; 3065 } 3066 3067 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3068 if (VectorTripCount) 3069 return VectorTripCount; 3070 3071 Value *TC = getOrCreateTripCount(L); 3072 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3073 3074 Type *Ty = TC->getType(); 3075 // This is where we can make the step a runtime constant. 3076 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3077 3078 // If the tail is to be folded by masking, round the number of iterations N 3079 // up to a multiple of Step instead of rounding down. This is done by first 3080 // adding Step-1 and then rounding down. Note that it's ok if this addition 3081 // overflows: the vector induction variable will eventually wrap to zero given 3082 // that it starts at zero and its Step is a power of two; the loop will then 3083 // exit, with the last early-exit vector comparison also producing all-true. 
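  // For example (hypothetical numbers): with N = 10 and Step = VF * UF = 8,
  // folding the tail rounds N up to 10 + 7 = 17, the remainder 17 % 8 = 1 is
  // subtracted below, and the vector trip count becomes 16, i.e. two masked
  // vector iterations that together cover all 10 original iterations. Without
  // tail folding the same N would give 10 % 8 = 2 and a vector trip count of
  // 8, leaving two iterations for the scalar remainder loop.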
3084 if (Cost->foldTailByMasking()) { 3085 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3086 "VF*UF must be a power of 2 when folding tail by masking"); 3087 assert(!VF.isScalable() && 3088 "Tail folding not yet supported for scalable vectors"); 3089 TC = Builder.CreateAdd( 3090 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3091 } 3092 3093 // Now we need to generate the expression for the part of the loop that the 3094 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3095 // iterations are not required for correctness, or N - Step, otherwise. Step 3096 // is equal to the vectorization factor (number of SIMD elements) times the 3097 // unroll factor (number of SIMD instructions). 3098 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3099 3100 // There are two cases where we need to ensure (at least) the last iteration 3101 // runs in the scalar remainder loop. Thus, if the step evenly divides 3102 // the trip count, we set the remainder to be equal to the step. If the step 3103 // does not evenly divide the trip count, no adjustment is necessary since 3104 // there will already be scalar iterations. Note that the minimum iterations 3105 // check ensures that N >= Step. The cases are: 3106 // 1) If there is a non-reversed interleaved group that may speculatively 3107 // access memory out-of-bounds. 3108 // 2) If any instruction may follow a conditionally taken exit. That is, if 3109 // the loop contains multiple exiting blocks, or a single exiting block 3110 // which is not the latch. 3111 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3112 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3113 R = Builder.CreateSelect(IsZero, Step, R); 3114 } 3115 3116 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3117 3118 return VectorTripCount; 3119 } 3120 3121 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3122 const DataLayout &DL) { 3123 // Verify that V is a vector type with same number of elements as DstVTy. 3124 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3125 unsigned VF = DstFVTy->getNumElements(); 3126 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3127 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3128 Type *SrcElemTy = SrcVecTy->getElementType(); 3129 Type *DstElemTy = DstFVTy->getElementType(); 3130 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3131 "Vector elements must have same size"); 3132 3133 // Do a direct cast if element types are castable. 3134 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3135 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3136 } 3137 // V cannot be directly casted to desired vector type. 3138 // May happen when V is a floating point vector but DstVTy is a vector of 3139 // pointers or vice-versa. Handle this using a two-step bitcast using an 3140 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 
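  // For illustration (a hypothetical case, assuming 64-bit pointers): casting
  // <4 x double> to <4 x i8*> goes through <4 x i64>:
  //   %tmp = bitcast <4 x double> %v to <4 x i64>
  //   %res = inttoptr <4 x i64> %tmp to <4 x i8*>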
3141 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3142 "Only one type should be a pointer type"); 3143 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3144 "Only one type should be a floating point type"); 3145 Type *IntTy = 3146 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3147 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3148 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3149 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3150 } 3151 3152 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3153 BasicBlock *Bypass) { 3154 Value *Count = getOrCreateTripCount(L); 3155 // Reuse existing vector loop preheader for TC checks. 3156 // Note that new preheader block is generated for vector loop. 3157 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3158 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3159 3160 // Generate code to check if the loop's trip count is less than VF * UF, or 3161 // equal to it in case a scalar epilogue is required; this implies that the 3162 // vector trip count is zero. This check also covers the case where adding one 3163 // to the backedge-taken count overflowed leading to an incorrect trip count 3164 // of zero. In this case we will also jump to the scalar loop. 3165 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3166 : ICmpInst::ICMP_ULT; 3167 3168 // If tail is to be folded, vector loop takes care of all iterations. 3169 Value *CheckMinIters = Builder.getFalse(); 3170 if (!Cost->foldTailByMasking()) { 3171 Value *Step = 3172 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3173 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3174 } 3175 // Create new preheader for vector loop. 3176 LoopVectorPreHeader = 3177 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3178 "vector.ph"); 3179 3180 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3181 DT->getNode(Bypass)->getIDom()) && 3182 "TC check is expected to dominate Bypass"); 3183 3184 // Update dominator for Bypass & LoopExit. 3185 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3186 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3187 3188 ReplaceInstWithInst( 3189 TCCheckBlock->getTerminator(), 3190 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3191 LoopBypassBlocks.push_back(TCCheckBlock); 3192 } 3193 3194 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3195 3196 BasicBlock *const SCEVCheckBlock = 3197 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3198 if (!SCEVCheckBlock) 3199 return nullptr; 3200 3201 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3202 (OptForSizeBasedOnProfile && 3203 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3204 "Cannot SCEV check stride or overflow when optimizing for size"); 3205 3206 3207 // Update dominator only if this is first RT check. 3208 if (LoopBypassBlocks.empty()) { 3209 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3210 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3211 } 3212 3213 LoopBypassBlocks.push_back(SCEVCheckBlock); 3214 AddedSafetyChecks = true; 3215 return SCEVCheckBlock; 3216 } 3217 3218 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3219 BasicBlock *Bypass) { 3220 // VPlan-native path does not do any analysis for runtime checks currently. 
3221 if (EnableVPlanNativePath) 3222 return nullptr; 3223 3224 BasicBlock *const MemCheckBlock = 3225 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3226 3227 // Check if we generated code that checks in runtime if arrays overlap. We put 3228 // the checks into a separate block to make the more common case of few 3229 // elements faster. 3230 if (!MemCheckBlock) 3231 return nullptr; 3232 3233 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3234 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3235 "Cannot emit memory checks when optimizing for size, unless forced " 3236 "to vectorize."); 3237 ORE->emit([&]() { 3238 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3239 L->getStartLoc(), L->getHeader()) 3240 << "Code-size may be reduced by not forcing " 3241 "vectorization, or by source-code modifications " 3242 "eliminating the need for runtime checks " 3243 "(e.g., adding 'restrict')."; 3244 }); 3245 } 3246 3247 LoopBypassBlocks.push_back(MemCheckBlock); 3248 3249 AddedSafetyChecks = true; 3250 3251 // We currently don't use LoopVersioning for the actual loop cloning but we 3252 // still use it to add the noalias metadata. 3253 LVer = std::make_unique<LoopVersioning>( 3254 *Legal->getLAI(), 3255 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3256 DT, PSE.getSE()); 3257 LVer->prepareNoAliasMetadata(); 3258 return MemCheckBlock; 3259 } 3260 3261 Value *InnerLoopVectorizer::emitTransformedIndex( 3262 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3263 const InductionDescriptor &ID) const { 3264 3265 SCEVExpander Exp(*SE, DL, "induction"); 3266 auto Step = ID.getStep(); 3267 auto StartValue = ID.getStartValue(); 3268 assert(Index->getType() == Step->getType() && 3269 "Index type does not match StepValue type"); 3270 3271 // Note: the IR at this point is broken. We cannot use SE to create any new 3272 // SCEV and then expand it, hoping that SCEV's simplification will give us 3273 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3274 // lead to various SCEV crashes. So all we can do is to use builder and rely 3275 // on InstCombine for future simplifications. Here we handle some trivial 3276 // cases only. 3277 auto CreateAdd = [&B](Value *X, Value *Y) { 3278 assert(X->getType() == Y->getType() && "Types don't match!"); 3279 if (auto *CX = dyn_cast<ConstantInt>(X)) 3280 if (CX->isZero()) 3281 return Y; 3282 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3283 if (CY->isZero()) 3284 return X; 3285 return B.CreateAdd(X, Y); 3286 }; 3287 3288 auto CreateMul = [&B](Value *X, Value *Y) { 3289 assert(X->getType() == Y->getType() && "Types don't match!"); 3290 if (auto *CX = dyn_cast<ConstantInt>(X)) 3291 if (CX->isOne()) 3292 return Y; 3293 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3294 if (CY->isOne()) 3295 return X; 3296 return B.CreateMul(X, Y); 3297 }; 3298 3299 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3300 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3301 // the DomTree is not kept up-to-date for additional blocks generated in the 3302 // vector loop. By using the header as insertion point, we guarantee that the 3303 // expanded instructions dominate all their uses. 
3304 auto GetInsertPoint = [this, &B]() { 3305 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3306 if (InsertBB != LoopVectorBody && 3307 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3308 return LoopVectorBody->getTerminator(); 3309 return &*B.GetInsertPoint(); 3310 }; 3311 3312 switch (ID.getKind()) { 3313 case InductionDescriptor::IK_IntInduction: { 3314 assert(Index->getType() == StartValue->getType() && 3315 "Index type does not match StartValue type"); 3316 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3317 return B.CreateSub(StartValue, Index); 3318 auto *Offset = CreateMul( 3319 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3320 return CreateAdd(StartValue, Offset); 3321 } 3322 case InductionDescriptor::IK_PtrInduction: { 3323 assert(isa<SCEVConstant>(Step) && 3324 "Expected constant step for pointer induction"); 3325 return B.CreateGEP( 3326 StartValue->getType()->getPointerElementType(), StartValue, 3327 CreateMul(Index, 3328 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3329 } 3330 case InductionDescriptor::IK_FpInduction: { 3331 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3332 auto InductionBinOp = ID.getInductionBinOp(); 3333 assert(InductionBinOp && 3334 (InductionBinOp->getOpcode() == Instruction::FAdd || 3335 InductionBinOp->getOpcode() == Instruction::FSub) && 3336 "Original bin op should be defined for FP induction"); 3337 3338 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3339 Value *MulExp = B.CreateFMul(StepValue, Index); 3340 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3341 "induction"); 3342 } 3343 case InductionDescriptor::IK_NoInduction: 3344 return nullptr; 3345 } 3346 llvm_unreachable("invalid enum"); 3347 } 3348 3349 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3350 LoopScalarBody = OrigLoop->getHeader(); 3351 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3352 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3353 assert(LoopExitBlock && "Must have an exit block"); 3354 assert(LoopVectorPreHeader && "Invalid loop structure"); 3355 3356 LoopMiddleBlock = 3357 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3358 LI, nullptr, Twine(Prefix) + "middle.block"); 3359 LoopScalarPreHeader = 3360 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3361 nullptr, Twine(Prefix) + "scalar.ph"); 3362 3363 // Set up branch from middle block to the exit and scalar preheader blocks. 3364 // completeLoopSkeleton will update the condition to use an iteration check, 3365 // if required to decide whether to execute the remainder. 3366 BranchInst *BrInst = 3367 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3368 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3369 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3370 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3371 3372 // We intentionally don't let SplitBlock to update LoopInfo since 3373 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3374 // LoopVectorBody is explicitly added to the correct place few lines later. 3375 LoopVectorBody = 3376 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3377 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3378 3379 // Update dominator for loop exit. 
3380 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3381 3382 // Create and register the new vector loop. 3383 Loop *Lp = LI->AllocateLoop(); 3384 Loop *ParentLoop = OrigLoop->getParentLoop(); 3385 3386 // Insert the new loop into the loop nest and register the new basic blocks 3387 // before calling any utilities such as SCEV that require valid LoopInfo. 3388 if (ParentLoop) { 3389 ParentLoop->addChildLoop(Lp); 3390 } else { 3391 LI->addTopLevelLoop(Lp); 3392 } 3393 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3394 return Lp; 3395 } 3396 3397 void InnerLoopVectorizer::createInductionResumeValues( 3398 Loop *L, Value *VectorTripCount, 3399 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3400 assert(VectorTripCount && L && "Expected valid arguments"); 3401 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3402 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3403 "Inconsistent information about additional bypass."); 3404 // We are going to resume the execution of the scalar loop. 3405 // Go over all of the induction variables that we found and fix the 3406 // PHIs that are left in the scalar version of the loop. 3407 // The starting values of PHI nodes depend on the counter of the last 3408 // iteration in the vectorized loop. 3409 // If we come from a bypass edge then we need to start from the original 3410 // start value. 3411 for (auto &InductionEntry : Legal->getInductionVars()) { 3412 PHINode *OrigPhi = InductionEntry.first; 3413 InductionDescriptor II = InductionEntry.second; 3414 3415 // Create phi nodes to merge from the backedge-taken check block. 3416 PHINode *BCResumeVal = 3417 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3418 LoopScalarPreHeader->getTerminator()); 3419 // Copy original phi DL over to the new one. 3420 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3421 Value *&EndValue = IVEndValues[OrigPhi]; 3422 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3423 if (OrigPhi == OldInduction) { 3424 // We know what the end value is. 3425 EndValue = VectorTripCount; 3426 } else { 3427 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3428 3429 // Fast-math-flags propagate from the original induction instruction. 3430 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3431 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3432 3433 Type *StepType = II.getStep()->getType(); 3434 Instruction::CastOps CastOp = 3435 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3436 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3437 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3438 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3439 EndValue->setName("ind.end"); 3440 3441 // Compute the end value for the additional bypass (if applicable). 3442 if (AdditionalBypass.first) { 3443 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3444 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3445 StepType, true); 3446 CRD = 3447 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3448 EndValueFromAdditionalBypass = 3449 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3450 EndValueFromAdditionalBypass->setName("ind.end"); 3451 } 3452 } 3453 // The new PHI merges the original incoming value, in case of a bypass, 3454 // or the value at the end of the vectorized loop. 
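    // For illustration (value and block names here are hypothetical), the
    // merged phi in the scalar preheader ends up looking like:
    //   %bc.resume.val = phi i64 [ %ind.end, %middle.block ],
    //                            [ %start, %min.iters.check.block ],
    //                            [ %start, %vector.memcheck ]
    // with the end value used when control comes from the vector loop and the
    // original start value used from every bypass block.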
3455 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3456 3457 // Fix the scalar body counter (PHI node). 3458 // The old induction's phi node in the scalar body needs the truncated 3459 // value. 3460 for (BasicBlock *BB : LoopBypassBlocks) 3461 BCResumeVal->addIncoming(II.getStartValue(), BB); 3462 3463 if (AdditionalBypass.first) 3464 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3465 EndValueFromAdditionalBypass); 3466 3467 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3468 } 3469 } 3470 3471 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3472 MDNode *OrigLoopID) { 3473 assert(L && "Expected valid loop."); 3474 3475 // The trip counts should be cached by now. 3476 Value *Count = getOrCreateTripCount(L); 3477 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3478 3479 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3480 3481 // Add a check in the middle block to see if we have completed 3482 // all of the iterations in the first vector loop. 3483 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3484 // If tail is to be folded, we know we don't need to run the remainder. 3485 if (!Cost->foldTailByMasking()) { 3486 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3487 Count, VectorTripCount, "cmp.n", 3488 LoopMiddleBlock->getTerminator()); 3489 3490 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3491 // of the corresponding compare because they may have ended up with 3492 // different line numbers and we want to avoid awkward line stepping while 3493 // debugging. Eg. if the compare has got a line number inside the loop. 3494 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3495 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3496 } 3497 3498 // Get ready to start creating new instructions into the vectorized body. 3499 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3500 "Inconsistent vector loop preheader"); 3501 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3502 3503 Optional<MDNode *> VectorizedLoopID = 3504 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3505 LLVMLoopVectorizeFollowupVectorized}); 3506 if (VectorizedLoopID.hasValue()) { 3507 L->setLoopID(VectorizedLoopID.getValue()); 3508 3509 // Do not setAlreadyVectorized if loop attributes have been defined 3510 // explicitly. 3511 return LoopVectorPreHeader; 3512 } 3513 3514 // Keep all loop hints from the original loop on the vector loop (we'll 3515 // replace the vectorizer-specific hints below). 3516 if (MDNode *LID = OrigLoop->getLoopID()) 3517 L->setLoopID(LID); 3518 3519 LoopVectorizeHints Hints(L, true, *ORE); 3520 Hints.setAlreadyVectorized(); 3521 3522 #ifdef EXPENSIVE_CHECKS 3523 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3524 LI->verify(*DT); 3525 #endif 3526 3527 return LoopVectorPreHeader; 3528 } 3529 3530 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3531 /* 3532 In this function we generate a new loop. The new loop will contain 3533 the vectorized instructions while the old loop will continue to run the 3534 scalar remainder. 3535 3536 [ ] <-- loop iteration number check. 3537 / | 3538 / v 3539 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3540 | / | 3541 | / v 3542 || [ ] <-- vector pre header. 3543 |/ | 3544 | v 3545 | [ ] \ 3546 | [ ]_| <-- vector loop. 3547 | | 3548 | v 3549 | -[ ] <--- middle-block. 
3550 | / | 3551 | / v 3552 -|- >[ ] <--- new preheader. 3553 | | 3554 | v 3555 | [ ] \ 3556 | [ ]_| <-- old scalar loop to handle remainder. 3557 \ | 3558 \ v 3559 >[ ] <-- exit block. 3560 ... 3561 */ 3562 3563 // Get the metadata of the original loop before it gets modified. 3564 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3565 3566 // Create an empty vector loop, and prepare basic blocks for the runtime 3567 // checks. 3568 Loop *Lp = createVectorLoopSkeleton(""); 3569 3570 // Now, compare the new count to zero. If it is zero skip the vector loop and 3571 // jump to the scalar loop. This check also covers the case where the 3572 // backedge-taken count is uint##_max: adding one to it will overflow leading 3573 // to an incorrect trip count of zero. In this (rare) case we will also jump 3574 // to the scalar loop. 3575 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3576 3577 // Generate the code to check any assumptions that we've made for SCEV 3578 // expressions. 3579 emitSCEVChecks(Lp, LoopScalarPreHeader); 3580 3581 // Generate the code that checks in runtime if arrays overlap. We put the 3582 // checks into a separate block to make the more common case of few elements 3583 // faster. 3584 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3585 3586 // Some loops have a single integer induction variable, while other loops 3587 // don't. One example is c++ iterators that often have multiple pointer 3588 // induction variables. In the code below we also support a case where we 3589 // don't have a single induction variable. 3590 // 3591 // We try to obtain an induction variable from the original loop as hard 3592 // as possible. However if we don't find one that: 3593 // - is an integer 3594 // - counts from zero, stepping by one 3595 // - is the size of the widest induction variable type 3596 // then we create a new one. 3597 OldInduction = Legal->getPrimaryInduction(); 3598 Type *IdxTy = Legal->getWidestInductionType(); 3599 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3600 // The loop step is equal to the vectorization factor (num of SIMD elements) 3601 // times the unroll factor (num of SIMD instructions). 3602 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3603 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3604 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3605 Induction = 3606 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3607 getDebugLocFromInstOrOperands(OldInduction)); 3608 3609 // Emit phis for the new starting index of the scalar loop. 3610 createInductionResumeValues(Lp, CountRoundDown); 3611 3612 return completeLoopSkeleton(Lp, OrigLoopID); 3613 } 3614 3615 // Fix up external users of the induction variable. At this point, we are 3616 // in LCSSA form, with all external PHIs that use the IV having one input value, 3617 // coming from the remainder loop. We need those PHIs to also have a correct 3618 // value for the IV when arriving directly from the middle block. 3619 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3620 const InductionDescriptor &II, 3621 Value *CountRoundDown, Value *EndValue, 3622 BasicBlock *MiddleBlock) { 3623 // There are two kinds of external IV usages - those that use the value 3624 // computed in the last iteration (the PHI) and those that use the penultimate 3625 // value (the value that feeds into the phi from the loop latch). 3626 // We allow both, but they, obviously, have different values. 
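  // For illustration (hypothetical shorthand IR), the two kinds of users look
  // like:
  //
  //   loop:
  //     %iv      = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
  //     %iv.next = add i64 %iv, 1
  //     ...
  //   exit:
  //     %a = phi i64 [ %iv.next, %loop ] ; last value, receives EndValue
  //     %b = phi i64 [ %iv, %loop ]      ; penultimate value, receives
  //                                      ;   Start + Step * (CRD - 1) below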
3627 3628 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3629 3630 DenseMap<Value *, Value *> MissingVals; 3631 3632 // An external user of the last iteration's value should see the value that 3633 // the remainder loop uses to initialize its own IV. 3634 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3635 for (User *U : PostInc->users()) { 3636 Instruction *UI = cast<Instruction>(U); 3637 if (!OrigLoop->contains(UI)) { 3638 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3639 MissingVals[UI] = EndValue; 3640 } 3641 } 3642 3643 // An external user of the penultimate value need to see EndValue - Step. 3644 // The simplest way to get this is to recompute it from the constituent SCEVs, 3645 // that is Start + (Step * (CRD - 1)). 3646 for (User *U : OrigPhi->users()) { 3647 auto *UI = cast<Instruction>(U); 3648 if (!OrigLoop->contains(UI)) { 3649 const DataLayout &DL = 3650 OrigLoop->getHeader()->getModule()->getDataLayout(); 3651 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3652 3653 IRBuilder<> B(MiddleBlock->getTerminator()); 3654 3655 // Fast-math-flags propagate from the original induction instruction. 3656 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3657 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3658 3659 Value *CountMinusOne = B.CreateSub( 3660 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3661 Value *CMO = 3662 !II.getStep()->getType()->isIntegerTy() 3663 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3664 II.getStep()->getType()) 3665 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3666 CMO->setName("cast.cmo"); 3667 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3668 Escape->setName("ind.escape"); 3669 MissingVals[UI] = Escape; 3670 } 3671 } 3672 3673 for (auto &I : MissingVals) { 3674 PHINode *PHI = cast<PHINode>(I.first); 3675 // One corner case we have to handle is two IVs "chasing" each-other, 3676 // that is %IV2 = phi [...], [ %IV1, %latch ] 3677 // In this case, if IV1 has an external use, we need to avoid adding both 3678 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3679 // don't already have an incoming value for the middle block. 3680 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3681 PHI->addIncoming(I.second, MiddleBlock); 3682 } 3683 } 3684 3685 namespace { 3686 3687 struct CSEDenseMapInfo { 3688 static bool canHandle(const Instruction *I) { 3689 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3690 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3691 } 3692 3693 static inline Instruction *getEmptyKey() { 3694 return DenseMapInfo<Instruction *>::getEmptyKey(); 3695 } 3696 3697 static inline Instruction *getTombstoneKey() { 3698 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3699 } 3700 3701 static unsigned getHashValue(const Instruction *I) { 3702 assert(canHandle(I) && "Unknown instruction!"); 3703 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3704 I->value_op_end())); 3705 } 3706 3707 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3708 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3709 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3710 return LHS == RHS; 3711 return LHS->isIdenticalTo(RHS); 3712 } 3713 }; 3714 3715 } // end anonymous namespace 3716 3717 ///Perform cse of induction variable instructions. 
3718 static void cse(BasicBlock *BB) { 3719 // Perform simple cse. 3720 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3721 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3722 Instruction *In = &*I++; 3723 3724 if (!CSEDenseMapInfo::canHandle(In)) 3725 continue; 3726 3727 // Check if we can replace this instruction with any of the 3728 // visited instructions. 3729 if (Instruction *V = CSEMap.lookup(In)) { 3730 In->replaceAllUsesWith(V); 3731 In->eraseFromParent(); 3732 continue; 3733 } 3734 3735 CSEMap[In] = In; 3736 } 3737 } 3738 3739 InstructionCost 3740 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3741 bool &NeedToScalarize) { 3742 Function *F = CI->getCalledFunction(); 3743 Type *ScalarRetTy = CI->getType(); 3744 SmallVector<Type *, 4> Tys, ScalarTys; 3745 for (auto &ArgOp : CI->arg_operands()) 3746 ScalarTys.push_back(ArgOp->getType()); 3747 3748 // Estimate cost of scalarized vector call. The source operands are assumed 3749 // to be vectors, so we need to extract individual elements from there, 3750 // execute VF scalar calls, and then gather the result into the vector return 3751 // value. 3752 InstructionCost ScalarCallCost = 3753 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3754 if (VF.isScalar()) 3755 return ScalarCallCost; 3756 3757 // Compute corresponding vector type for return value and arguments. 3758 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3759 for (Type *ScalarTy : ScalarTys) 3760 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3761 3762 // Compute costs of unpacking argument values for the scalar calls and 3763 // packing the return values to a vector. 3764 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3765 3766 InstructionCost Cost = 3767 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3768 3769 // If we can't emit a vector call for this function, then the currently found 3770 // cost is the cost we need to return. 3771 NeedToScalarize = true; 3772 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3773 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3774 3775 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3776 return Cost; 3777 3778 // If the corresponding vector cost is cheaper, return its cost. 
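  // For illustration, with purely hypothetical costs: VF = 4, a scalar call
  // cost of 10 and a scalarization overhead of 12 give a scalarized cost of
  // 4 * 10 + 12 = 52. If the VFDatabase provides a vector variant whose call
  // cost is 20, NeedToScalarize is cleared and 20 is returned instead.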
3779 InstructionCost VectorCallCost = 3780 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3781 if (VectorCallCost < Cost) { 3782 NeedToScalarize = false; 3783 Cost = VectorCallCost; 3784 } 3785 return Cost; 3786 } 3787 3788 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3789 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3790 return Elt; 3791 return VectorType::get(Elt, VF); 3792 } 3793 3794 InstructionCost 3795 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3796 ElementCount VF) { 3797 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3798 assert(ID && "Expected intrinsic call!"); 3799 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3800 FastMathFlags FMF; 3801 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3802 FMF = FPMO->getFastMathFlags(); 3803 3804 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); 3805 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3806 SmallVector<Type *> ParamTys; 3807 std::transform(FTy->param_begin(), FTy->param_end(), 3808 std::back_inserter(ParamTys), 3809 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3810 3811 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3812 dyn_cast<IntrinsicInst>(CI)); 3813 return TTI.getIntrinsicInstrCost(CostAttrs, 3814 TargetTransformInfo::TCK_RecipThroughput); 3815 } 3816 3817 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3818 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3819 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3820 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3821 } 3822 3823 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3824 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3825 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3826 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3827 } 3828 3829 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3830 // For every instruction `I` in MinBWs, truncate the operands, create a 3831 // truncated version of `I` and reextend its result. InstCombine runs 3832 // later and will remove any ext/trunc pairs. 3833 SmallPtrSet<Value *, 4> Erased; 3834 for (const auto &KV : Cost->getMinimalBitwidths()) { 3835 // If the value wasn't vectorized, we must maintain the original scalar 3836 // type. The absence of the value from State indicates that it 3837 // wasn't vectorized. 3838 VPValue *Def = State.Plan->getVPValue(KV.first); 3839 if (!State.hasAnyVectorValue(Def)) 3840 continue; 3841 for (unsigned Part = 0; Part < UF; ++Part) { 3842 Value *I = State.get(Def, Part); 3843 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3844 continue; 3845 Type *OriginalTy = I->getType(); 3846 Type *ScalarTruncatedTy = 3847 IntegerType::get(OriginalTy->getContext(), KV.second); 3848 auto *TruncatedTy = FixedVectorType::get( 3849 ScalarTruncatedTy, 3850 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3851 if (TruncatedTy == OriginalTy) 3852 continue; 3853 3854 IRBuilder<> B(cast<Instruction>(I)); 3855 auto ShrinkOperand = [&](Value *V) -> Value * { 3856 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3857 if (ZI->getSrcTy() == TruncatedTy) 3858 return ZI->getOperand(0); 3859 return B.CreateZExtOrTrunc(V, TruncatedTy); 3860 }; 3861 3862 // The actual instruction modification depends on the instruction type, 3863 // unfortunately. 
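      // For illustration (hypothetical IR, a minimal bitwidth of 8, VF = 4):
      //   %a.ext = zext <4 x i8> %a to <4 x i32>
      //   %b.ext = zext <4 x i8> %b to <4 x i32>
      //   %sum   = add <4 x i32> %a.ext, %b.ext
      // is rewritten below as
      //   %sum.tr = add <4 x i8> %a, %b
      //   %sum    = zext <4 x i8> %sum.tr to <4 x i32>
      // and InstCombine later cleans up any remaining ext/trunc pairs.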
3864 Value *NewI = nullptr; 3865 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3866 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3867 ShrinkOperand(BO->getOperand(1))); 3868 3869 // Any wrapping introduced by shrinking this operation shouldn't be 3870 // considered undefined behavior. So, we can't unconditionally copy 3871 // arithmetic wrapping flags to NewI. 3872 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3873 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3874 NewI = 3875 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3876 ShrinkOperand(CI->getOperand(1))); 3877 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3878 NewI = B.CreateSelect(SI->getCondition(), 3879 ShrinkOperand(SI->getTrueValue()), 3880 ShrinkOperand(SI->getFalseValue())); 3881 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3882 switch (CI->getOpcode()) { 3883 default: 3884 llvm_unreachable("Unhandled cast!"); 3885 case Instruction::Trunc: 3886 NewI = ShrinkOperand(CI->getOperand(0)); 3887 break; 3888 case Instruction::SExt: 3889 NewI = B.CreateSExtOrTrunc( 3890 CI->getOperand(0), 3891 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3892 break; 3893 case Instruction::ZExt: 3894 NewI = B.CreateZExtOrTrunc( 3895 CI->getOperand(0), 3896 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3897 break; 3898 } 3899 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3900 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3901 ->getNumElements(); 3902 auto *O0 = B.CreateZExtOrTrunc( 3903 SI->getOperand(0), 3904 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3905 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3906 ->getNumElements(); 3907 auto *O1 = B.CreateZExtOrTrunc( 3908 SI->getOperand(1), 3909 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3910 3911 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3912 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3913 // Don't do anything with the operands, just extend the result. 3914 continue; 3915 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3916 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3917 ->getNumElements(); 3918 auto *O0 = B.CreateZExtOrTrunc( 3919 IE->getOperand(0), 3920 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3921 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3922 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3923 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3924 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3925 ->getNumElements(); 3926 auto *O0 = B.CreateZExtOrTrunc( 3927 EE->getOperand(0), 3928 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3929 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3930 } else { 3931 // If we don't know what to do, be conservative and don't do anything. 3932 continue; 3933 } 3934 3935 // Lastly, extend the result. 3936 NewI->takeName(cast<Instruction>(I)); 3937 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3938 I->replaceAllUsesWith(Res); 3939 cast<Instruction>(I)->eraseFromParent(); 3940 Erased.insert(I); 3941 State.reset(Def, Res, Part); 3942 } 3943 } 3944 3945 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3946 for (const auto &KV : Cost->getMinimalBitwidths()) { 3947 // If the value wasn't vectorized, we must maintain the original scalar 3948 // type. The absence of the value from State indicates that it 3949 // wasn't vectorized. 
3950     VPValue *Def = State.Plan->getVPValue(KV.first);
3951     if (!State.hasAnyVectorValue(Def))
3952       continue;
3953     for (unsigned Part = 0; Part < UF; ++Part) {
3954       Value *I = State.get(Def, Part);
3955       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3956       if (Inst && Inst->use_empty()) {
3957         Value *NewI = Inst->getOperand(0);
3958         Inst->eraseFromParent();
3959         State.reset(Def, NewI, Part);
3960       }
3961     }
3962   }
3963 }
3964
3965 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
3966   // Insert truncates and extends for any truncated instructions as hints to
3967   // InstCombine.
3968   if (VF.isVector())
3969     truncateToMinimalBitwidths(State);
3970
3971   // Fix widened non-induction PHIs by setting up the PHI operands.
3972   if (OrigPHIsToFix.size()) {
3973     assert(EnableVPlanNativePath &&
3974            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3975     fixNonInductionPHIs(State);
3976   }
3977
3978   // At this point every instruction in the original loop is widened to a
3979   // vector form. Now we need to fix the recurrences in the loop. These PHI
3980   // nodes are currently empty because we did not want to introduce cycles.
3981   // This is the second stage of vectorizing recurrences.
3982   fixCrossIterationPHIs(State);
3983
3984   // Forget the original basic block.
3985   PSE.getSE()->forgetLoop(OrigLoop);
3986
3987   // Fix-up external users of the induction variables.
3988   for (auto &Entry : Legal->getInductionVars())
3989     fixupIVUsers(Entry.first, Entry.second,
3990                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3991                  IVEndValues[Entry.first], LoopMiddleBlock);
3992
3993   fixLCSSAPHIs(State);
3994   for (Instruction *PI : PredicatedInstructions)
3995     sinkScalarOperands(&*PI);
3996
3997   // Remove redundant induction instructions.
3998   cse(LoopVectorBody);
3999
4000   // Set/update profile weights for the vector and remainder loops as original
4001   // loop iterations are now distributed among them. Note that the original loop,
4002   // represented by LoopScalarBody, becomes the remainder loop after vectorization.
4003   //
4004   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4005   // end up with a slightly roughened result, but that should be OK since the
4006   // profile is not inherently precise anyway. Note also that a possible bypass
4007   // of the vector code caused by legality checks is ignored, optimistically
4008   // assigning all the weight to the vector loop.
4009   //
4010   // For scalable vectorization we can't know at compile time how many
4011   // iterations of the loop are handled in one vector iteration, so instead
4012   // assume a pessimistic vscale of '1'.
4013   setProfileInfoAfterUnrolling(
4014       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4015       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4016 }
4017
4018 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4019   // In order to support recurrences we need to be able to vectorize Phi nodes.
4020   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4021   // stage #2: We now need to fix the recurrences by adding incoming edges to
4022   // the currently empty PHI nodes. At this point every instruction in the
4023   // original loop is widened to a vector form so we can use them to construct
4024   // the incoming edges.
4025   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
4026     // Handle first-order recurrences and reductions that need to be fixed.
4027 if (Legal->isFirstOrderRecurrence(&Phi)) 4028 fixFirstOrderRecurrence(&Phi, State); 4029 else if (Legal->isReductionVariable(&Phi)) 4030 fixReduction(&Phi, State); 4031 } 4032 } 4033 4034 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi, 4035 VPTransformState &State) { 4036 // This is the second phase of vectorizing first-order recurrences. An 4037 // overview of the transformation is described below. Suppose we have the 4038 // following loop. 4039 // 4040 // for (int i = 0; i < n; ++i) 4041 // b[i] = a[i] - a[i - 1]; 4042 // 4043 // There is a first-order recurrence on "a". For this loop, the shorthand 4044 // scalar IR looks like: 4045 // 4046 // scalar.ph: 4047 // s_init = a[-1] 4048 // br scalar.body 4049 // 4050 // scalar.body: 4051 // i = phi [0, scalar.ph], [i+1, scalar.body] 4052 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4053 // s2 = a[i] 4054 // b[i] = s2 - s1 4055 // br cond, scalar.body, ... 4056 // 4057 // In this example, s1 is a recurrence because it's value depends on the 4058 // previous iteration. In the first phase of vectorization, we created a 4059 // temporary value for s1. We now complete the vectorization and produce the 4060 // shorthand vector IR shown below (for VF = 4, UF = 1). 4061 // 4062 // vector.ph: 4063 // v_init = vector(..., ..., ..., a[-1]) 4064 // br vector.body 4065 // 4066 // vector.body 4067 // i = phi [0, vector.ph], [i+4, vector.body] 4068 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4069 // v2 = a[i, i+1, i+2, i+3]; 4070 // v3 = vector(v1(3), v2(0, 1, 2)) 4071 // b[i, i+1, i+2, i+3] = v2 - v3 4072 // br cond, vector.body, middle.block 4073 // 4074 // middle.block: 4075 // x = v2(3) 4076 // br scalar.ph 4077 // 4078 // scalar.ph: 4079 // s_init = phi [x, middle.block], [a[-1], otherwise] 4080 // br scalar.body 4081 // 4082 // After execution completes the vector loop, we extract the next value of 4083 // the recurrence (x) to use as the initial value in the scalar loop. 4084 4085 // Get the original loop preheader and single loop latch. 4086 auto *Preheader = OrigLoop->getLoopPreheader(); 4087 auto *Latch = OrigLoop->getLoopLatch(); 4088 4089 // Get the initial and previous values of the scalar recurrence. 4090 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4091 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4092 4093 // Create a vector from the initial value. 4094 auto *VectorInit = ScalarInit; 4095 if (VF.isVector()) { 4096 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4097 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4098 VectorInit = Builder.CreateInsertElement( 4099 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 4100 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 4101 } 4102 4103 VPValue *PhiDef = State.Plan->getVPValue(Phi); 4104 VPValue *PreviousDef = State.Plan->getVPValue(Previous); 4105 // We constructed a temporary phi node in the first phase of vectorization. 4106 // This phi node will eventually be deleted. 4107 Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0))); 4108 4109 // Create a phi node for the new recurrence. The current value will either be 4110 // the initial value inserted into a vector or loop-varying vector value. 4111 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4112 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4113 4114 // Get the vectorized previous value of the last part UF - 1. 
It appears last 4115 // among all unrolled iterations, due to the order of their construction. 4116 Value *PreviousLastPart = State.get(PreviousDef, UF - 1); 4117 4118 // Find and set the insertion point after the previous value if it is an 4119 // instruction. 4120 BasicBlock::iterator InsertPt; 4121 // Note that the previous value may have been constant-folded so it is not 4122 // guaranteed to be an instruction in the vector loop. 4123 // FIXME: Loop invariant values do not form recurrences. We should deal with 4124 // them earlier. 4125 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4126 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4127 else { 4128 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4129 if (isa<PHINode>(PreviousLastPart)) 4130 // If the previous value is a phi node, we should insert after all the phi 4131 // nodes in the block containing the PHI to avoid breaking basic block 4132 // verification. Note that the basic block may be different to 4133 // LoopVectorBody, in case we predicate the loop. 4134 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4135 else 4136 InsertPt = ++PreviousInst->getIterator(); 4137 } 4138 Builder.SetInsertPoint(&*InsertPt); 4139 4140 // We will construct a vector for the recurrence by combining the values for 4141 // the current and previous iterations. This is the required shuffle mask. 4142 assert(!VF.isScalable()); 4143 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 4144 ShuffleMask[0] = VF.getKnownMinValue() - 1; 4145 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 4146 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 4147 4148 // The vector from which to take the initial value for the current iteration 4149 // (actual or unrolled). Initially, this is the vector phi node. 4150 Value *Incoming = VecPhi; 4151 4152 // Shuffle the current and previous vector and update the vector parts. 4153 for (unsigned Part = 0; Part < UF; ++Part) { 4154 Value *PreviousPart = State.get(PreviousDef, Part); 4155 Value *PhiPart = State.get(PhiDef, Part); 4156 auto *Shuffle = 4157 VF.isVector() 4158 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 4159 : Incoming; 4160 PhiPart->replaceAllUsesWith(Shuffle); 4161 cast<Instruction>(PhiPart)->eraseFromParent(); 4162 State.reset(PhiDef, Shuffle, Part); 4163 Incoming = PreviousPart; 4164 } 4165 4166 // Fix the latch value of the new recurrence in the vector loop. 4167 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4168 4169 // Extract the last vector element in the middle block. This will be the 4170 // initial value for the recurrence when jumping to the scalar loop. 4171 auto *ExtractForScalar = Incoming; 4172 if (VF.isVector()) { 4173 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4174 ExtractForScalar = Builder.CreateExtractElement( 4175 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 4176 "vector.recur.extract"); 4177 } 4178 // Extract the second last element in the middle block if the 4179 // Phi is used outside the loop. We need to extract the phi itself 4180 // and not the last element (the phi update in the current iteration). This 4181 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4182 // when the scalar loop is not run at all. 
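// For example, with VF = 4 the mask built above is <3, 4, 5, 6>, producing
//
//   v3 = < v1[3], v2[0], v2[1], v2[2] >
//
// for the shorthand example at the top of this function (UF = 1). After the
// loop, the middle block extracts v2[3] ("vector.recur.extract", the s_init
// for the scalar remainder loop) and v2[2] ("vector.recur.extract.for.phi",
// the value of the phi itself, one element behind, for users outside the
// loop).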
4183 Value *ExtractForPhiUsedOutsideLoop = nullptr;
4184 if (VF.isVector())
4185 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4186 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4187 "vector.recur.extract.for.phi");
4188 // When the loop is only unrolled and not vectorized, initialize
4189 // ExtractForPhiUsedOutsideLoop with the unrolled part just prior to
4190 // `Incoming`. This is analogous to the vectorized case above: extracting
4191 // the second-last element when VF > 1.
4192 else if (UF > 1)
4193 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4194
4195 // Fix the initial value of the original recurrence in the scalar loop.
4196 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4197 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4198 for (auto *BB : predecessors(LoopScalarPreHeader)) {
4199 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4200 Start->addIncoming(Incoming, BB);
4201 }
4202
4203 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4204 Phi->setName("scalar.recur");
4205
4206 // Finally, fix users of the recurrence outside the loop. The users will need
4207 // either the last value of the scalar recurrence or the last value of the
4208 // vector recurrence we extracted in the middle block. Since the loop is in
4209 // LCSSA form, we just need to find all the phi nodes for the original scalar
4210 // recurrence in the exit block, and then add an edge for the middle block.
4211 // Note that LCSSA does not imply single entry when the original scalar loop
4212 // had multiple exiting edges (as we always run the last iteration in the
4213 // scalar epilogue); in that case, the exiting path through middle will be
4214 // dynamically dead and the value picked for the phi doesn't matter.
4215 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4216 if (any_of(LCSSAPhi.incoming_values(),
4217 [Phi](Value *V) { return V == Phi; }))
4218 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4219 }
4220
4221 void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) {
4222 // Get its reduction variable descriptor.
4223 assert(Legal->isReductionVariable(Phi) &&
4224 "Unable to find the reduction variable");
4225 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4226
4227 RecurKind RK = RdxDesc.getRecurrenceKind();
4228 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4229 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4230 setDebugLocFromInst(Builder, ReductionStartValue);
4231 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4232
4233 VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
4234 // This is the vector-clone of the value that leaves the loop.
4235 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4236
4237 // Wrap flags are in general invalid after vectorization, clear them.
4238 clearReductionWrapFlags(RdxDesc, State);
4239
4240 // Fix the vector-loop phi.
4241
4242 // Reductions do not have to start at zero. They can start with
4243 // any loop invariant values.
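// Illustrative shorthand (VF = 4, UF = 2, integer add reduction whose scalar
// start value is %start; names are for exposition only): the phis created in
// the first stage (widenPHIInstruction) are
//
//   part 0: %vec.phi   = phi <4 x i32> [ %rdx.init, %vector.ph ], ...
//   part 1: %vec.phi.1 = phi <4 x i32> [ zeroinitializer, %vector.ph ], ...
//
// where %rdx.init is %start inserted into lane 0 of the all-zero identity
// vector. Summing every lane of every part after the loop thus contributes
// the start value exactly once.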
4244 BasicBlock *Latch = OrigLoop->getLoopLatch(); 4245 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 4246 4247 for (unsigned Part = 0; Part < UF; ++Part) { 4248 Value *VecRdxPhi = State.get(State.Plan->getVPValue(Phi), Part); 4249 Value *Val = State.get(State.Plan->getVPValue(LoopVal), Part); 4250 cast<PHINode>(VecRdxPhi) 4251 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4252 } 4253 4254 // Before each round, move the insertion point right between 4255 // the PHIs and the values we are going to write. 4256 // This allows us to write both PHINodes and the extractelement 4257 // instructions. 4258 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4259 4260 setDebugLocFromInst(Builder, LoopExitInst); 4261 4262 // If tail is folded by masking, the vector value to leave the loop should be 4263 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4264 // instead of the former. For an inloop reduction the reduction will already 4265 // be predicated, and does not need to be handled here. 4266 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4267 for (unsigned Part = 0; Part < UF; ++Part) { 4268 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4269 Value *Sel = nullptr; 4270 for (User *U : VecLoopExitInst->users()) { 4271 if (isa<SelectInst>(U)) { 4272 assert(!Sel && "Reduction exit feeding two selects"); 4273 Sel = U; 4274 } else 4275 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4276 } 4277 assert(Sel && "Reduction exit feeds no select"); 4278 State.reset(LoopExitInstDef, Sel, Part); 4279 4280 // If the target can create a predicated operator for the reduction at no 4281 // extra cost in the loop (for example a predicated vadd), it can be 4282 // cheaper for the select to remain in the loop than be sunk out of it, 4283 // and so use the select value for the phi instead of the old 4284 // LoopExitValue. 4285 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4286 if (PreferPredicatedReductionSelect || 4287 TTI->preferPredicatedReductionSelect( 4288 RdxDesc.getOpcode(), Phi->getType(), 4289 TargetTransformInfo::ReductionFlags())) { 4290 auto *VecRdxPhi = 4291 cast<PHINode>(State.get(State.Plan->getVPValue(Phi), Part)); 4292 VecRdxPhi->setIncomingValueForBlock( 4293 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4294 } 4295 } 4296 } 4297 4298 // If the vector reduction can be performed in a smaller type, we truncate 4299 // then extend the loop exit value to enable InstCombine to evaluate the 4300 // entire expression in the smaller type. 4301 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4302 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4303 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4304 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4305 Builder.SetInsertPoint( 4306 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4307 VectorParts RdxParts(UF); 4308 for (unsigned Part = 0; Part < UF; ++Part) { 4309 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4310 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4311 Value *Extnd = RdxDesc.isSigned() ? 
Builder.CreateSExt(Trunc, VecTy) 4312 : Builder.CreateZExt(Trunc, VecTy); 4313 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4314 UI != RdxParts[Part]->user_end();) 4315 if (*UI != Trunc) { 4316 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4317 RdxParts[Part] = Extnd; 4318 } else { 4319 ++UI; 4320 } 4321 } 4322 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4323 for (unsigned Part = 0; Part < UF; ++Part) { 4324 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4325 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4326 } 4327 } 4328 4329 // Reduce all of the unrolled parts into a single vector. 4330 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4331 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4332 4333 // The middle block terminator has already been assigned a DebugLoc here (the 4334 // OrigLoop's single latch terminator). We want the whole middle block to 4335 // appear to execute on this line because: (a) it is all compiler generated, 4336 // (b) these instructions are always executed after evaluating the latch 4337 // conditional branch, and (c) other passes may add new predecessors which 4338 // terminate on this line. This is the easiest way to ensure we don't 4339 // accidentally cause an extra step back into the loop while debugging. 4340 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4341 { 4342 // Floating-point operations should have some FMF to enable the reduction. 4343 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4344 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4345 for (unsigned Part = 1; Part < UF; ++Part) { 4346 Value *RdxPart = State.get(LoopExitInstDef, Part); 4347 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4348 ReducedPartRdx = Builder.CreateBinOp( 4349 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4350 } else { 4351 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4352 } 4353 } 4354 } 4355 4356 // Create the reduction after the loop. Note that inloop reductions create the 4357 // target reduction in the loop using a Reduction recipe. 4358 if (VF.isVector() && !IsInLoopReductionPhi) { 4359 ReducedPartRdx = 4360 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); 4361 // If the reduction can be performed in a smaller type, we need to extend 4362 // the reduction to the wider type before we branch to the original loop. 4363 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4364 ReducedPartRdx = 4365 RdxDesc.isSigned() 4366 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4367 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4368 } 4369 4370 // Create a phi node that merges control-flow from the backedge-taken check 4371 // block and the middle block. 4372 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4373 LoopScalarPreHeader->getTerminator()); 4374 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4375 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4376 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4377 4378 // Now, we need to fix the users of the reduction variable 4379 // inside and outside of the scalar remainder loop. 4380 4381 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4382 // in the exit blocks. See comment on analogous loop in 4383 // fixFirstOrderRecurrence for a more complete explaination of the logic. 
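// Illustrative shorthand of the epilogue built above for an integer add
// reduction with VF = 4 and UF = 2 (value names other than bin.rdx are made
// up):
//
//   middle.block:
//     %bin.rdx = add <4 x i32> %sum.part1, %sum.part0
//     %rdx     = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
//
// Min/max recurrences combine the parts with compare/select pairs via
// createMinMaxOp instead of a single binary operator.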
4384 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4385 if (any_of(LCSSAPhi.incoming_values(), 4386 [LoopExitInst](Value *V) { return V == LoopExitInst; })) 4387 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4388 4389 // Fix the scalar loop reduction variable with the incoming reduction sum 4390 // from the vector body and from the backedge value. 4391 int IncomingEdgeBlockIdx = 4392 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4393 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4394 // Pick the other block. 4395 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4396 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4397 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4398 } 4399 4400 void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 4401 VPTransformState &State) { 4402 RecurKind RK = RdxDesc.getRecurrenceKind(); 4403 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4404 return; 4405 4406 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4407 assert(LoopExitInstr && "null loop exit instruction"); 4408 SmallVector<Instruction *, 8> Worklist; 4409 SmallPtrSet<Instruction *, 8> Visited; 4410 Worklist.push_back(LoopExitInstr); 4411 Visited.insert(LoopExitInstr); 4412 4413 while (!Worklist.empty()) { 4414 Instruction *Cur = Worklist.pop_back_val(); 4415 if (isa<OverflowingBinaryOperator>(Cur)) 4416 for (unsigned Part = 0; Part < UF; ++Part) { 4417 Value *V = State.get(State.Plan->getVPValue(Cur), Part); 4418 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4419 } 4420 4421 for (User *U : Cur->users()) { 4422 Instruction *UI = cast<Instruction>(U); 4423 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4424 Visited.insert(UI).second) 4425 Worklist.push_back(UI); 4426 } 4427 } 4428 } 4429 4430 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4431 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4432 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4433 // Some phis were already hand updated by the reduction and recurrence 4434 // code above, leave them alone. 4435 continue; 4436 4437 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4438 // Non-instruction incoming values will have only one value. 4439 4440 VPLane Lane = VPLane::getFirstLane(); 4441 if (isa<Instruction>(IncomingValue) && 4442 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4443 VF)) 4444 Lane = VPLane::getLastLaneForVF(VF); 4445 4446 // Can be a loop invariant incoming value or the last scalar value to be 4447 // extracted from the vectorized loop. 4448 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4449 Value *lastIncomingValue = 4450 OrigLoop->isLoopInvariant(IncomingValue) 4451 ? IncomingValue 4452 : State.get(State.Plan->getVPValue(IncomingValue), 4453 VPIteration(UF - 1, Lane)); 4454 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4455 } 4456 } 4457 4458 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4459 // The basic block and loop containing the predicated instruction. 4460 auto *PredBB = PredInst->getParent(); 4461 auto *VectorLoop = LI->getLoopFor(PredBB); 4462 4463 // Initialize a worklist with the operands of the predicated instruction. 4464 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4465 4466 // Holds instructions that we need to analyze again. An instruction may be 4467 // reanalyzed if we don't yet know if we can sink it or not. 
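// Illustrative example of the sinking performed below (block and value names
// are made up): if a scalarized address computation in the vector body
//
//   vector.body:
//     %addr = getelementptr inbounds i32, i32* %base, i64 %idx
//     ...
//   pred.store.if:
//     store i32 %val, i32* %addr
//     br label %pred.store.continue
//
// is only used inside the predicated block, the getelementptr is moved into
// that block so it executes only when the predicate holds, and its own
// operands are then reconsidered for sinking in turn.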
4468 SmallVector<Instruction *, 8> InstsToReanalyze; 4469 4470 // Returns true if a given use occurs in the predicated block. Phi nodes use 4471 // their operands in their corresponding predecessor blocks. 4472 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4473 auto *I = cast<Instruction>(U.getUser()); 4474 BasicBlock *BB = I->getParent(); 4475 if (auto *Phi = dyn_cast<PHINode>(I)) 4476 BB = Phi->getIncomingBlock( 4477 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4478 return BB == PredBB; 4479 }; 4480 4481 // Iteratively sink the scalarized operands of the predicated instruction 4482 // into the block we created for it. When an instruction is sunk, it's 4483 // operands are then added to the worklist. The algorithm ends after one pass 4484 // through the worklist doesn't sink a single instruction. 4485 bool Changed; 4486 do { 4487 // Add the instructions that need to be reanalyzed to the worklist, and 4488 // reset the changed indicator. 4489 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4490 InstsToReanalyze.clear(); 4491 Changed = false; 4492 4493 while (!Worklist.empty()) { 4494 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4495 4496 // We can't sink an instruction if it is a phi node, is already in the 4497 // predicated block, is not in the loop, or may have side effects. 4498 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4499 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4500 continue; 4501 4502 // It's legal to sink the instruction if all its uses occur in the 4503 // predicated block. Otherwise, there's nothing to do yet, and we may 4504 // need to reanalyze the instruction. 4505 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4506 InstsToReanalyze.push_back(I); 4507 continue; 4508 } 4509 4510 // Move the instruction to the beginning of the predicated block, and add 4511 // it's operands to the worklist. 4512 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4513 Worklist.insert(I->op_begin(), I->op_end()); 4514 4515 // The sinking may have enabled other instructions to be sunk, so we will 4516 // need to iterate. 4517 Changed = true; 4518 } 4519 } while (Changed); 4520 } 4521 4522 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4523 for (PHINode *OrigPhi : OrigPHIsToFix) { 4524 VPWidenPHIRecipe *VPPhi = 4525 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4526 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4527 // Make sure the builder has a valid insert point. 4528 Builder.SetInsertPoint(NewPhi); 4529 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4530 VPValue *Inc = VPPhi->getIncomingValue(i); 4531 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4532 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4533 } 4534 } 4535 } 4536 4537 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4538 VPUser &Operands, unsigned UF, 4539 ElementCount VF, bool IsPtrLoopInvariant, 4540 SmallBitVector &IsIndexLoopInvariant, 4541 VPTransformState &State) { 4542 // Construct a vector GEP by widening the operands of the scalar GEP as 4543 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4544 // results in a vector of pointers when at least one operand of the GEP 4545 // is vector-typed. Thus, to keep the representation compact, we only use 4546 // vector-typed operands for loop-varying values. 
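// Illustrative shorthand (VF = 4): a scalar GEP with a loop-invariant base
// and a loop-varying index
//
//   %p = getelementptr inbounds i32, i32* %a, i64 %i
//
// is widened by keeping the base scalar and widening only the index,
//
//   %vp = getelementptr inbounds i32, i32* %a, <4 x i64> %vec.ind
//
// which produces a value of type <4 x i32*>. If every operand is
// loop-invariant, the scalar GEP is instead cloned and splatted, as handled
// by the first branch below.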
4547 4548 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4549 // If we are vectorizing, but the GEP has only loop-invariant operands, 4550 // the GEP we build (by only using vector-typed operands for 4551 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4552 // produce a vector of pointers, we need to either arbitrarily pick an 4553 // operand to broadcast, or broadcast a clone of the original GEP. 4554 // Here, we broadcast a clone of the original. 4555 // 4556 // TODO: If at some point we decide to scalarize instructions having 4557 // loop-invariant operands, this special case will no longer be 4558 // required. We would add the scalarization decision to 4559 // collectLoopScalars() and teach getVectorValue() to broadcast 4560 // the lane-zero scalar value. 4561 auto *Clone = Builder.Insert(GEP->clone()); 4562 for (unsigned Part = 0; Part < UF; ++Part) { 4563 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4564 State.set(VPDef, EntryPart, Part); 4565 addMetadata(EntryPart, GEP); 4566 } 4567 } else { 4568 // If the GEP has at least one loop-varying operand, we are sure to 4569 // produce a vector of pointers. But if we are only unrolling, we want 4570 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4571 // produce with the code below will be scalar (if VF == 1) or vector 4572 // (otherwise). Note that for the unroll-only case, we still maintain 4573 // values in the vector mapping with initVector, as we do for other 4574 // instructions. 4575 for (unsigned Part = 0; Part < UF; ++Part) { 4576 // The pointer operand of the new GEP. If it's loop-invariant, we 4577 // won't broadcast it. 4578 auto *Ptr = IsPtrLoopInvariant 4579 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4580 : State.get(Operands.getOperand(0), Part); 4581 4582 // Collect all the indices for the new GEP. If any index is 4583 // loop-invariant, we won't broadcast it. 4584 SmallVector<Value *, 4> Indices; 4585 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4586 VPValue *Operand = Operands.getOperand(I); 4587 if (IsIndexLoopInvariant[I - 1]) 4588 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4589 else 4590 Indices.push_back(State.get(Operand, Part)); 4591 } 4592 4593 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4594 // but it should be a vector, otherwise. 4595 auto *NewGEP = 4596 GEP->isInBounds() 4597 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4598 Indices) 4599 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4600 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4601 "NewGEP is not a pointer vector"); 4602 State.set(VPDef, NewGEP, Part); 4603 addMetadata(NewGEP, GEP); 4604 } 4605 } 4606 } 4607 4608 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4609 RecurrenceDescriptor *RdxDesc, 4610 VPValue *StartVPV, VPValue *Def, 4611 VPTransformState &State) { 4612 PHINode *P = cast<PHINode>(PN); 4613 if (EnableVPlanNativePath) { 4614 // Currently we enter here in the VPlan-native path for non-induction 4615 // PHIs where all control flow is uniform. We simply widen these PHIs. 4616 // Create a vector phi with no operands - the vector phi operands will be 4617 // set at the end of vector code generation. 4618 Type *VecTy = (State.VF.isScalar()) 4619 ? 
PN->getType()
4620 : VectorType::get(PN->getType(), State.VF);
4621 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4622 State.set(Def, VecPhi, 0);
4623 OrigPHIsToFix.push_back(P);
4624
4625 return;
4626 }
4627
4628 assert(PN->getParent() == OrigLoop->getHeader() &&
4629 "Non-header phis should have been handled elsewhere");
4630
4631 Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr;
4632 // In order to support recurrences we need to be able to vectorize Phi nodes.
4633 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4634 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4635 // this value when we vectorize all of the instructions that use the PHI.
4636 if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
4637 Value *Iden = nullptr;
4638 bool ScalarPHI =
4639 (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4640 Type *VecTy =
4641 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
4642
4643 if (RdxDesc) {
4644 assert(Legal->isReductionVariable(P) && StartV &&
4645 "RdxDesc should only be set for reduction variables; in that case "
4646 "a StartV is also required");
4647 RecurKind RK = RdxDesc->getRecurrenceKind();
4648 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
4649 // MinMax reductions have the start value as their identity.
4650 if (ScalarPHI) {
4651 Iden = StartV;
4652 } else {
4653 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4654 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4655 StartV = Iden =
4656 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
4657 }
4658 } else {
4659 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
4660 RK, VecTy->getScalarType());
4661 Iden = IdenC;
4662
4663 if (!ScalarPHI) {
4664 Iden = ConstantVector::getSplat(State.VF, IdenC);
4665 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4666 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4667 Constant *Zero = Builder.getInt32(0);
4668 StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
4669 }
4670 }
4671 }
4672
4673 for (unsigned Part = 0; Part < State.UF; ++Part) {
4674 // This is phase one of vectorizing PHIs.
4675 Value *EntryPart = PHINode::Create(
4676 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4677 State.set(Def, EntryPart, Part);
4678 if (StartV) {
4679 // Make sure to add the reduction start value only to the
4680 // first unroll part.
4681 Value *StartVal = (Part == 0) ? StartV : Iden;
4682 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
4683 }
4684 }
4685 return;
4686 }
4687
4688 assert(!Legal->isReductionVariable(P) &&
4689 "reductions should be handled above");
4690
4691 setDebugLocFromInst(Builder, P);
4692
4693 // This PHINode must be an induction variable.
4694 // Make sure that we know about it.
4695 assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4696
4697 InductionDescriptor II = Legal->getInductionVars().lookup(P);
4698 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4699
4700 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4701 // which can be found from the original scalar operations.
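// Worked example for the scalarized pointer-induction case below (VF = 4,
// UF = 2): lane L of unroll part P is addressed with the normalized index
//
//   PtrInd + (P * VF + L)
//
// so part 0 covers offsets 0..3 and part 1 covers offsets 4..7 (or just
// offsets 0 and 4 when the pointer is uniform and only lane 0 is generated).
// Each offset is passed through emitTransformedIndex to form the actual
// "next.gep" pointer.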
4702 switch (II.getKind()) { 4703 case InductionDescriptor::IK_NoInduction: 4704 llvm_unreachable("Unknown induction"); 4705 case InductionDescriptor::IK_IntInduction: 4706 case InductionDescriptor::IK_FpInduction: 4707 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4708 case InductionDescriptor::IK_PtrInduction: { 4709 // Handle the pointer induction variable case. 4710 assert(P->getType()->isPointerTy() && "Unexpected type."); 4711 4712 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4713 // This is the normalized GEP that starts counting at zero. 4714 Value *PtrInd = 4715 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4716 // Determine the number of scalars we need to generate for each unroll 4717 // iteration. If the instruction is uniform, we only need to generate the 4718 // first lane. Otherwise, we generate all VF values. 4719 unsigned Lanes = Cost->isUniformAfterVectorization(P, State.VF) 4720 ? 1 4721 : State.VF.getKnownMinValue(); 4722 for (unsigned Part = 0; Part < UF; ++Part) { 4723 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4724 Constant *Idx = ConstantInt::get( 4725 PtrInd->getType(), Lane + Part * State.VF.getKnownMinValue()); 4726 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4727 Value *SclrGep = 4728 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4729 SclrGep->setName("next.gep"); 4730 State.set(Def, SclrGep, VPIteration(Part, Lane)); 4731 } 4732 } 4733 return; 4734 } 4735 assert(isa<SCEVConstant>(II.getStep()) && 4736 "Induction step not a SCEV constant!"); 4737 Type *PhiType = II.getStep()->getType(); 4738 4739 // Build a pointer phi 4740 Value *ScalarStartValue = II.getStartValue(); 4741 Type *ScStValueType = ScalarStartValue->getType(); 4742 PHINode *NewPointerPhi = 4743 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4744 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4745 4746 // A pointer induction, performed by using a gep 4747 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4748 Instruction *InductionLoc = LoopLatch->getTerminator(); 4749 const SCEV *ScalarStep = II.getStep(); 4750 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4751 Value *ScalarStepValue = 4752 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4753 Value *InductionGEP = GetElementPtrInst::Create( 4754 ScStValueType->getPointerElementType(), NewPointerPhi, 4755 Builder.CreateMul( 4756 ScalarStepValue, 4757 ConstantInt::get(PhiType, State.VF.getKnownMinValue() * State.UF)), 4758 "ptr.ind", InductionLoc); 4759 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4760 4761 // Create UF many actual address geps that use the pointer 4762 // phi as base and a vectorized version of the step value 4763 // (<step*0, ..., step*N>) as offset. 4764 for (unsigned Part = 0; Part < State.UF; ++Part) { 4765 SmallVector<Constant *, 8> Indices; 4766 // Create a vector of consecutive numbers from zero to VF. 
4767 for (unsigned i = 0; i < State.VF.getKnownMinValue(); ++i) 4768 Indices.push_back( 4769 ConstantInt::get(PhiType, i + Part * State.VF.getKnownMinValue())); 4770 Constant *StartOffset = ConstantVector::get(Indices); 4771 4772 Value *GEP = Builder.CreateGEP( 4773 ScStValueType->getPointerElementType(), NewPointerPhi, 4774 Builder.CreateMul(StartOffset, 4775 Builder.CreateVectorSplat( 4776 State.VF.getKnownMinValue(), ScalarStepValue), 4777 "vector.gep")); 4778 State.set(Def, GEP, Part); 4779 } 4780 } 4781 } 4782 } 4783 4784 /// A helper function for checking whether an integer division-related 4785 /// instruction may divide by zero (in which case it must be predicated if 4786 /// executed conditionally in the scalar code). 4787 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4788 /// Non-zero divisors that are non compile-time constants will not be 4789 /// converted into multiplication, so we will still end up scalarizing 4790 /// the division, but can do so w/o predication. 4791 static bool mayDivideByZero(Instruction &I) { 4792 assert((I.getOpcode() == Instruction::UDiv || 4793 I.getOpcode() == Instruction::SDiv || 4794 I.getOpcode() == Instruction::URem || 4795 I.getOpcode() == Instruction::SRem) && 4796 "Unexpected instruction"); 4797 Value *Divisor = I.getOperand(1); 4798 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4799 return !CInt || CInt->isZero(); 4800 } 4801 4802 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4803 VPUser &User, 4804 VPTransformState &State) { 4805 switch (I.getOpcode()) { 4806 case Instruction::Call: 4807 case Instruction::Br: 4808 case Instruction::PHI: 4809 case Instruction::GetElementPtr: 4810 case Instruction::Select: 4811 llvm_unreachable("This instruction is handled by a different recipe."); 4812 case Instruction::UDiv: 4813 case Instruction::SDiv: 4814 case Instruction::SRem: 4815 case Instruction::URem: 4816 case Instruction::Add: 4817 case Instruction::FAdd: 4818 case Instruction::Sub: 4819 case Instruction::FSub: 4820 case Instruction::FNeg: 4821 case Instruction::Mul: 4822 case Instruction::FMul: 4823 case Instruction::FDiv: 4824 case Instruction::FRem: 4825 case Instruction::Shl: 4826 case Instruction::LShr: 4827 case Instruction::AShr: 4828 case Instruction::And: 4829 case Instruction::Or: 4830 case Instruction::Xor: { 4831 // Just widen unops and binops. 4832 setDebugLocFromInst(Builder, &I); 4833 4834 for (unsigned Part = 0; Part < UF; ++Part) { 4835 SmallVector<Value *, 2> Ops; 4836 for (VPValue *VPOp : User.operands()) 4837 Ops.push_back(State.get(VPOp, Part)); 4838 4839 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4840 4841 if (auto *VecOp = dyn_cast<Instruction>(V)) 4842 VecOp->copyIRFlags(&I); 4843 4844 // Use this vector value for all users of the original instruction. 4845 State.set(Def, V, Part); 4846 addMetadata(V, &I); 4847 } 4848 4849 break; 4850 } 4851 case Instruction::ICmp: 4852 case Instruction::FCmp: { 4853 // Widen compares. Generate vector compares. 4854 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4855 auto *Cmp = cast<CmpInst>(&I); 4856 setDebugLocFromInst(Builder, Cmp); 4857 for (unsigned Part = 0; Part < UF; ++Part) { 4858 Value *A = State.get(User.getOperand(0), Part); 4859 Value *B = State.get(User.getOperand(1), Part); 4860 Value *C = nullptr; 4861 if (FCmp) { 4862 // Propagate fast math flags. 
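// For example, if the scalar compare was "fcmp fast olt float %x, %y", the
// widened compare is emitted as "fcmp fast olt <4 x float> %a, %b" (VF = 4):
// the fast-math flags are re-applied through the builder guard below rather
// than copied onto the instruction afterwards.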
4863 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4864 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4865 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4866 } else { 4867 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4868 } 4869 State.set(Def, C, Part); 4870 addMetadata(C, &I); 4871 } 4872 4873 break; 4874 } 4875 4876 case Instruction::ZExt: 4877 case Instruction::SExt: 4878 case Instruction::FPToUI: 4879 case Instruction::FPToSI: 4880 case Instruction::FPExt: 4881 case Instruction::PtrToInt: 4882 case Instruction::IntToPtr: 4883 case Instruction::SIToFP: 4884 case Instruction::UIToFP: 4885 case Instruction::Trunc: 4886 case Instruction::FPTrunc: 4887 case Instruction::BitCast: { 4888 auto *CI = cast<CastInst>(&I); 4889 setDebugLocFromInst(Builder, CI); 4890 4891 /// Vectorize casts. 4892 Type *DestTy = 4893 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4894 4895 for (unsigned Part = 0; Part < UF; ++Part) { 4896 Value *A = State.get(User.getOperand(0), Part); 4897 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4898 State.set(Def, Cast, Part); 4899 addMetadata(Cast, &I); 4900 } 4901 break; 4902 } 4903 default: 4904 // This instruction is not vectorized by simple widening. 4905 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4906 llvm_unreachable("Unhandled instruction!"); 4907 } // end of switch. 4908 } 4909 4910 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4911 VPUser &ArgOperands, 4912 VPTransformState &State) { 4913 assert(!isa<DbgInfoIntrinsic>(I) && 4914 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4915 setDebugLocFromInst(Builder, &I); 4916 4917 Module *M = I.getParent()->getParent()->getParent(); 4918 auto *CI = cast<CallInst>(&I); 4919 4920 SmallVector<Type *, 4> Tys; 4921 for (Value *ArgOperand : CI->arg_operands()) 4922 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4923 4924 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4925 4926 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4927 // version of the instruction. 4928 // Is it beneficial to perform intrinsic call compared to lib call? 4929 bool NeedToScalarize = false; 4930 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4931 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4932 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4933 assert((UseVectorIntrinsic || !NeedToScalarize) && 4934 "Instruction should be scalarized elsewhere."); 4935 assert(IntrinsicCost.isValid() && CallCost.isValid() && 4936 "Cannot have invalid costs while widening"); 4937 4938 for (unsigned Part = 0; Part < UF; ++Part) { 4939 SmallVector<Value *, 4> Args; 4940 for (auto &I : enumerate(ArgOperands.operands())) { 4941 // Some intrinsics have a scalar argument - don't replace it with a 4942 // vector. 4943 Value *Arg; 4944 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4945 Arg = State.get(I.value(), Part); 4946 else 4947 Arg = State.get(I.value(), VPIteration(0, 0)); 4948 Args.push_back(Arg); 4949 } 4950 4951 Function *VectorF; 4952 if (UseVectorIntrinsic) { 4953 // Use vector version of the intrinsic. 
4954 Type *TysForDecl[] = {CI->getType()}; 4955 if (VF.isVector()) 4956 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4957 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4958 assert(VectorF && "Can't retrieve vector intrinsic."); 4959 } else { 4960 // Use vector version of the function call. 4961 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4962 #ifndef NDEBUG 4963 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4964 "Can't create vector function."); 4965 #endif 4966 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4967 } 4968 SmallVector<OperandBundleDef, 1> OpBundles; 4969 CI->getOperandBundlesAsDefs(OpBundles); 4970 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4971 4972 if (isa<FPMathOperator>(V)) 4973 V->copyFastMathFlags(CI); 4974 4975 State.set(Def, V, Part); 4976 addMetadata(V, &I); 4977 } 4978 } 4979 4980 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 4981 VPUser &Operands, 4982 bool InvariantCond, 4983 VPTransformState &State) { 4984 setDebugLocFromInst(Builder, &I); 4985 4986 // The condition can be loop invariant but still defined inside the 4987 // loop. This means that we can't just use the original 'cond' value. 4988 // We have to take the 'vectorized' value and pick the first lane. 4989 // Instcombine will make this a no-op. 4990 auto *InvarCond = InvariantCond 4991 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4992 : nullptr; 4993 4994 for (unsigned Part = 0; Part < UF; ++Part) { 4995 Value *Cond = 4996 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4997 Value *Op0 = State.get(Operands.getOperand(1), Part); 4998 Value *Op1 = State.get(Operands.getOperand(2), Part); 4999 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5000 State.set(VPDef, Sel, Part); 5001 addMetadata(Sel, &I); 5002 } 5003 } 5004 5005 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5006 // We should not collect Scalars more than once per VF. Right now, this 5007 // function is called from collectUniformsAndScalars(), which already does 5008 // this check. Collecting Scalars for VF=1 does not make any sense. 5009 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5010 "This function should not be visited twice for the same VF"); 5011 5012 SmallSetVector<Instruction *, 8> Worklist; 5013 5014 // These sets are used to seed the analysis with pointers used by memory 5015 // accesses that will remain scalar. 5016 SmallSetVector<Instruction *, 8> ScalarPtrs; 5017 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5018 auto *Latch = TheLoop->getLoopLatch(); 5019 5020 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5021 // The pointer operands of loads and stores will be scalar as long as the 5022 // memory access is not a gather or scatter operation. The value operand of a 5023 // store will remain scalar if the store is scalarized. 
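// Illustrative example for the seeding below (names are made up): given
//
//   %gep = getelementptr inbounds i32, i32* %a, i64 %iv
//   %v   = load i32, i32* %gep
//
// where the load will be widened into a consecutive wide load, the pointer
// %gep is only needed as a scalar and, if it feeds nothing but memory
// accesses, it is placed in ScalarPtrs. Had the load been mapped to a gather,
// %gep would have to become a vector of pointers and would be recorded in
// PossibleNonScalarPtrs instead.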
5024 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 5025 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 5026 assert(WideningDecision != CM_Unknown && 5027 "Widening decision should be ready at this moment"); 5028 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 5029 if (Ptr == Store->getValueOperand()) 5030 return WideningDecision == CM_Scalarize; 5031 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 5032 "Ptr is neither a value or pointer operand"); 5033 return WideningDecision != CM_GatherScatter; 5034 }; 5035 5036 // A helper that returns true if the given value is a bitcast or 5037 // getelementptr instruction contained in the loop. 5038 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 5039 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 5040 isa<GetElementPtrInst>(V)) && 5041 !TheLoop->isLoopInvariant(V); 5042 }; 5043 5044 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 5045 if (!isa<PHINode>(Ptr) || 5046 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 5047 return false; 5048 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 5049 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 5050 return false; 5051 return isScalarUse(MemAccess, Ptr); 5052 }; 5053 5054 // A helper that evaluates a memory access's use of a pointer. If the 5055 // pointer is actually the pointer induction of a loop, it is being 5056 // inserted into Worklist. If the use will be a scalar use, and the 5057 // pointer is only used by memory accesses, we place the pointer in 5058 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5059 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5060 if (isScalarPtrInduction(MemAccess, Ptr)) { 5061 Worklist.insert(cast<Instruction>(Ptr)); 5062 Instruction *Update = cast<Instruction>( 5063 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5064 Worklist.insert(Update); 5065 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5066 << "\n"); 5067 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 5068 << "\n"); 5069 return; 5070 } 5071 // We only care about bitcast and getelementptr instructions contained in 5072 // the loop. 5073 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5074 return; 5075 5076 // If the pointer has already been identified as scalar (e.g., if it was 5077 // also identified as uniform), there's nothing to do. 5078 auto *I = cast<Instruction>(Ptr); 5079 if (Worklist.count(I)) 5080 return; 5081 5082 // If the use of the pointer will be a scalar use, and all users of the 5083 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5084 // place the pointer in PossibleNonScalarPtrs. 5085 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5086 return isa<LoadInst>(U) || isa<StoreInst>(U); 5087 })) 5088 ScalarPtrs.insert(I); 5089 else 5090 PossibleNonScalarPtrs.insert(I); 5091 }; 5092 5093 // We seed the scalars analysis with three classes of instructions: (1) 5094 // instructions marked uniform-after-vectorization and (2) bitcast, 5095 // getelementptr and (pointer) phi instructions used by memory accesses 5096 // requiring a scalar use. 5097 // 5098 // (1) Add to the worklist all instructions that have been identified as 5099 // uniform-after-vectorization. 
5100 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5101 5102 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5103 // memory accesses requiring a scalar use. The pointer operands of loads and 5104 // stores will be scalar as long as the memory accesses is not a gather or 5105 // scatter operation. The value operand of a store will remain scalar if the 5106 // store is scalarized. 5107 for (auto *BB : TheLoop->blocks()) 5108 for (auto &I : *BB) { 5109 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5110 evaluatePtrUse(Load, Load->getPointerOperand()); 5111 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5112 evaluatePtrUse(Store, Store->getPointerOperand()); 5113 evaluatePtrUse(Store, Store->getValueOperand()); 5114 } 5115 } 5116 for (auto *I : ScalarPtrs) 5117 if (!PossibleNonScalarPtrs.count(I)) { 5118 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5119 Worklist.insert(I); 5120 } 5121 5122 // Insert the forced scalars. 5123 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5124 // induction variable when the PHI user is scalarized. 5125 auto ForcedScalar = ForcedScalars.find(VF); 5126 if (ForcedScalar != ForcedScalars.end()) 5127 for (auto *I : ForcedScalar->second) 5128 Worklist.insert(I); 5129 5130 // Expand the worklist by looking through any bitcasts and getelementptr 5131 // instructions we've already identified as scalar. This is similar to the 5132 // expansion step in collectLoopUniforms(); however, here we're only 5133 // expanding to include additional bitcasts and getelementptr instructions. 5134 unsigned Idx = 0; 5135 while (Idx != Worklist.size()) { 5136 Instruction *Dst = Worklist[Idx++]; 5137 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5138 continue; 5139 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5140 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5141 auto *J = cast<Instruction>(U); 5142 return !TheLoop->contains(J) || Worklist.count(J) || 5143 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5144 isScalarUse(J, Src)); 5145 })) { 5146 Worklist.insert(Src); 5147 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5148 } 5149 } 5150 5151 // An induction variable will remain scalar if all users of the induction 5152 // variable and induction variable update remain scalar. 5153 for (auto &Induction : Legal->getInductionVars()) { 5154 auto *Ind = Induction.first; 5155 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5156 5157 // If tail-folding is applied, the primary induction variable will be used 5158 // to feed a vector compare. 5159 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5160 continue; 5161 5162 // Determine if all users of the induction variable are scalar after 5163 // vectorization. 5164 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5165 auto *I = cast<Instruction>(U); 5166 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5167 }); 5168 if (!ScalarInd) 5169 continue; 5170 5171 // Determine if all users of the induction variable update instruction are 5172 // scalar after vectorization. 5173 auto ScalarIndUpdate = 5174 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5175 auto *I = cast<Instruction>(U); 5176 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5177 }); 5178 if (!ScalarIndUpdate) 5179 continue; 5180 5181 // The induction variable and its update instruction will remain scalar. 
5182 Worklist.insert(Ind); 5183 Worklist.insert(IndUpdate); 5184 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5185 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5186 << "\n"); 5187 } 5188 5189 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5190 } 5191 5192 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 5193 ElementCount VF) { 5194 if (!blockNeedsPredication(I->getParent())) 5195 return false; 5196 switch(I->getOpcode()) { 5197 default: 5198 break; 5199 case Instruction::Load: 5200 case Instruction::Store: { 5201 if (!Legal->isMaskRequired(I)) 5202 return false; 5203 auto *Ptr = getLoadStorePointerOperand(I); 5204 auto *Ty = getMemInstValueType(I); 5205 // We have already decided how to vectorize this instruction, get that 5206 // result. 5207 if (VF.isVector()) { 5208 InstWidening WideningDecision = getWideningDecision(I, VF); 5209 assert(WideningDecision != CM_Unknown && 5210 "Widening decision should be ready at this moment"); 5211 return WideningDecision == CM_Scalarize; 5212 } 5213 const Align Alignment = getLoadStoreAlignment(I); 5214 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5215 isLegalMaskedGather(Ty, Alignment)) 5216 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5217 isLegalMaskedScatter(Ty, Alignment)); 5218 } 5219 case Instruction::UDiv: 5220 case Instruction::SDiv: 5221 case Instruction::SRem: 5222 case Instruction::URem: 5223 return mayDivideByZero(*I); 5224 } 5225 return false; 5226 } 5227 5228 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5229 Instruction *I, ElementCount VF) { 5230 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5231 assert(getWideningDecision(I, VF) == CM_Unknown && 5232 "Decision should not be set yet."); 5233 auto *Group = getInterleavedAccessGroup(I); 5234 assert(Group && "Must have a group."); 5235 5236 // If the instruction's allocated size doesn't equal it's type size, it 5237 // requires padding and will be scalarized. 5238 auto &DL = I->getModule()->getDataLayout(); 5239 auto *ScalarTy = getMemInstValueType(I); 5240 if (hasIrregularType(ScalarTy, DL)) 5241 return false; 5242 5243 // Check if masking is required. 5244 // A Group may need masking for one of two reasons: it resides in a block that 5245 // needs predication, or it was decided to use masking to deal with gaps. 5246 bool PredicatedAccessRequiresMasking = 5247 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5248 bool AccessWithGapsRequiresMasking = 5249 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5250 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5251 return true; 5252 5253 // If masked interleaving is required, we expect that the user/target had 5254 // enabled it, because otherwise it either wouldn't have been created or 5255 // it should have been invalidated by the CostModel. 5256 assert(useMaskedInterleavedAccesses(TTI) && 5257 "Masked interleave-groups for predicated accesses are not enabled."); 5258 5259 auto *Ty = getMemInstValueType(I); 5260 const Align Alignment = getLoadStoreAlignment(I); 5261 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5262 : TTI.isLegalMaskedStore(Ty, Alignment); 5263 } 5264 5265 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5266 Instruction *I, ElementCount VF) { 5267 // Get and ensure we have a valid memory instruction. 
5268 LoadInst *LI = dyn_cast<LoadInst>(I); 5269 StoreInst *SI = dyn_cast<StoreInst>(I); 5270 assert((LI || SI) && "Invalid memory instruction"); 5271 5272 auto *Ptr = getLoadStorePointerOperand(I); 5273 5274 // In order to be widened, the pointer should be consecutive, first of all. 5275 if (!Legal->isConsecutivePtr(Ptr)) 5276 return false; 5277 5278 // If the instruction is a store located in a predicated block, it will be 5279 // scalarized. 5280 if (isScalarWithPredication(I)) 5281 return false; 5282 5283 // If the instruction's allocated size doesn't equal it's type size, it 5284 // requires padding and will be scalarized. 5285 auto &DL = I->getModule()->getDataLayout(); 5286 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5287 if (hasIrregularType(ScalarTy, DL)) 5288 return false; 5289 5290 return true; 5291 } 5292 5293 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5294 // We should not collect Uniforms more than once per VF. Right now, 5295 // this function is called from collectUniformsAndScalars(), which 5296 // already does this check. Collecting Uniforms for VF=1 does not make any 5297 // sense. 5298 5299 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5300 "This function should not be visited twice for the same VF"); 5301 5302 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5303 // not analyze again. Uniforms.count(VF) will return 1. 5304 Uniforms[VF].clear(); 5305 5306 // We now know that the loop is vectorizable! 5307 // Collect instructions inside the loop that will remain uniform after 5308 // vectorization. 5309 5310 // Global values, params and instructions outside of current loop are out of 5311 // scope. 5312 auto isOutOfScope = [&](Value *V) -> bool { 5313 Instruction *I = dyn_cast<Instruction>(V); 5314 return (!I || !TheLoop->contains(I)); 5315 }; 5316 5317 SetVector<Instruction *> Worklist; 5318 BasicBlock *Latch = TheLoop->getLoopLatch(); 5319 5320 // Instructions that are scalar with predication must not be considered 5321 // uniform after vectorization, because that would create an erroneous 5322 // replicating region where only a single instance out of VF should be formed. 5323 // TODO: optimize such seldom cases if found important, see PR40816. 5324 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5325 if (isOutOfScope(I)) { 5326 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5327 << *I << "\n"); 5328 return; 5329 } 5330 if (isScalarWithPredication(I, VF)) { 5331 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5332 << *I << "\n"); 5333 return; 5334 } 5335 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5336 Worklist.insert(I); 5337 }; 5338 5339 // Start with the conditional branch. If the branch condition is an 5340 // instruction contained in the loop that is only used by the branch, it is 5341 // uniform. 5342 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5343 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5344 addToWorklistIfAllowed(Cmp); 5345 5346 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5347 InstWidening WideningDecision = getWideningDecision(I, VF); 5348 assert(WideningDecision != CM_Unknown && 5349 "Widening decision should be ready at this moment"); 5350 5351 // A uniform memory op is itself uniform. We exclude uniform stores 5352 // here as they demand the last lane, not the first one. 
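// For example, a load from a loop-invariant address executed on every
// iteration, e.g.
//
//   %v = load i32, i32* %p   ; %p defined outside the loop
//
// is a uniform memory op: lane 0 of part 0 already yields the value every
// lane would see. A store to such an address is excluded here because the
// scalar semantics keep the value of the last iteration, i.e. the last lane
// of the last part.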
5353 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5354 assert(WideningDecision == CM_Scalarize); 5355 return true; 5356 } 5357 5358 return (WideningDecision == CM_Widen || 5359 WideningDecision == CM_Widen_Reverse || 5360 WideningDecision == CM_Interleave); 5361 }; 5362 5363 5364 // Returns true if Ptr is the pointer operand of a memory access instruction 5365 // I, and I is known to not require scalarization. 5366 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5367 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5368 }; 5369 5370 // Holds a list of values which are known to have at least one uniform use. 5371 // Note that there may be other uses which aren't uniform. A "uniform use" 5372 // here is something which only demands lane 0 of the unrolled iterations; 5373 // it does not imply that all lanes produce the same value (e.g. this is not 5374 // the usual meaning of uniform) 5375 SmallPtrSet<Value *, 8> HasUniformUse; 5376 5377 // Scan the loop for instructions which are either a) known to have only 5378 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5379 for (auto *BB : TheLoop->blocks()) 5380 for (auto &I : *BB) { 5381 // If there's no pointer operand, there's nothing to do. 5382 auto *Ptr = getLoadStorePointerOperand(&I); 5383 if (!Ptr) 5384 continue; 5385 5386 // A uniform memory op is itself uniform. We exclude uniform stores 5387 // here as they demand the last lane, not the first one. 5388 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5389 addToWorklistIfAllowed(&I); 5390 5391 if (isUniformDecision(&I, VF)) { 5392 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5393 HasUniformUse.insert(Ptr); 5394 } 5395 } 5396 5397 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5398 // demanding) users. Since loops are assumed to be in LCSSA form, this 5399 // disallows uses outside the loop as well. 5400 for (auto *V : HasUniformUse) { 5401 if (isOutOfScope(V)) 5402 continue; 5403 auto *I = cast<Instruction>(V); 5404 auto UsersAreMemAccesses = 5405 llvm::all_of(I->users(), [&](User *U) -> bool { 5406 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5407 }); 5408 if (UsersAreMemAccesses) 5409 addToWorklistIfAllowed(I); 5410 } 5411 5412 // Expand Worklist in topological order: whenever a new instruction 5413 // is added , its users should be already inside Worklist. It ensures 5414 // a uniform instruction will only be used by uniform instructions. 5415 unsigned idx = 0; 5416 while (idx != Worklist.size()) { 5417 Instruction *I = Worklist[idx++]; 5418 5419 for (auto OV : I->operand_values()) { 5420 // isOutOfScope operands cannot be uniform instructions. 5421 if (isOutOfScope(OV)) 5422 continue; 5423 // First order recurrence Phi's should typically be considered 5424 // non-uniform. 5425 auto *OP = dyn_cast<PHINode>(OV); 5426 if (OP && Legal->isFirstOrderRecurrence(OP)) 5427 continue; 5428 // If all the users of the operand are uniform, then add the 5429 // operand into the uniform worklist. 5430 auto *OI = cast<Instruction>(OV); 5431 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5432 auto *J = cast<Instruction>(U); 5433 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5434 })) 5435 addToWorklistIfAllowed(OI); 5436 } 5437 } 5438 5439 // For an instruction to be added into Worklist above, all its users inside 5440 // the loop should also be in Worklist. 
However, this condition cannot be 5441 // true for phi nodes that form a cyclic dependence. We must process phi 5442 // nodes separately. An induction variable will remain uniform if all users 5443 // of the induction variable and induction variable update remain uniform. 5444 // The code below handles both pointer and non-pointer induction variables. 5445 for (auto &Induction : Legal->getInductionVars()) { 5446 auto *Ind = Induction.first; 5447 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5448 5449 // Determine if all users of the induction variable are uniform after 5450 // vectorization. 5451 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5452 auto *I = cast<Instruction>(U); 5453 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5454 isVectorizedMemAccessUse(I, Ind); 5455 }); 5456 if (!UniformInd) 5457 continue; 5458 5459 // Determine if all users of the induction variable update instruction are 5460 // uniform after vectorization. 5461 auto UniformIndUpdate = 5462 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5463 auto *I = cast<Instruction>(U); 5464 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5465 isVectorizedMemAccessUse(I, IndUpdate); 5466 }); 5467 if (!UniformIndUpdate) 5468 continue; 5469 5470 // The induction variable and its update instruction will remain uniform. 5471 addToWorklistIfAllowed(Ind); 5472 addToWorklistIfAllowed(IndUpdate); 5473 } 5474 5475 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5476 } 5477 5478 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5479 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5480 5481 if (Legal->getRuntimePointerChecking()->Need) { 5482 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5483 "runtime pointer checks needed. Enable vectorization of this " 5484 "loop with '#pragma clang loop vectorize(enable)' when " 5485 "compiling with -Os/-Oz", 5486 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5487 return true; 5488 } 5489 5490 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5491 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5492 "runtime SCEV checks needed. Enable vectorization of this " 5493 "loop with '#pragma clang loop vectorize(enable)' when " 5494 "compiling with -Os/-Oz", 5495 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5496 return true; 5497 } 5498 5499 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5500 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5501 reportVectorizationFailure("Runtime stride check for small trip count", 5502 "runtime stride == 1 checks needed. Enable vectorization of " 5503 "this loop without such check by compiling with -Os/-Oz", 5504 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5505 return true; 5506 } 5507 5508 return false; 5509 } 5510 5511 Optional<ElementCount> 5512 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5513 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5514 // TODO: It may by useful to do since it's still likely to be dynamically 5515 // uniform if the target can skip. 5516 reportVectorizationFailure( 5517 "Not inserting runtime ptr check for divergent target", 5518 "runtime pointer checks needed. 
Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return None;
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return None;
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(TC, UserVF);
  case CM_ScalarEpilogueNotAllowedUsePredicate:
    LLVM_FALLTHROUGH;
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return None;

    break;
  }

  // The only loops we can vectorize without a scalar epilogue are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration. This will
  // require a lane mask which varies through the vector loop body. (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fall back to vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF);
    }
    return None;
  }

  // Now try the tail folding

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
  assert(!MaxVF.isScalable() &&
         "Scalable vectors do not yet support tail folding");
  assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
         "MaxVF must be a power of 2");
  unsigned MaxVFtimesIC =
      UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we chose.
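  // Worked example (numbers are illustrative only): with MaxVF = 8 and a user
  // interleave count of 2, MaxVFtimesIC is 16. For a loop whose exit count
  // (backedge-taken count plus one) is, say, 64, the remainder 64 urem 16 is
  // zero, so no tail remains and MaxVF can be returned without folding the
  // tail.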
5598 ScalarEvolution *SE = PSE.getSE(); 5599 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5600 const SCEV *ExitCount = SE->getAddExpr( 5601 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5602 const SCEV *Rem = SE->getURemExpr( 5603 SE->applyLoopGuards(ExitCount, TheLoop), 5604 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5605 if (Rem->isZero()) { 5606 // Accept MaxVF if we do not have a tail. 5607 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5608 return MaxVF; 5609 } 5610 5611 // If we don't know the precise trip count, or if the trip count that we 5612 // found modulo the vectorization factor is not zero, try to fold the tail 5613 // by masking. 5614 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5615 if (Legal->prepareToFoldTailByMasking()) { 5616 FoldTailByMasking = true; 5617 return MaxVF; 5618 } 5619 5620 // If there was a tail-folding hint/switch, but we can't fold the tail by 5621 // masking, fallback to a vectorization with a scalar epilogue. 5622 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5623 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5624 "scalar epilogue instead.\n"); 5625 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5626 return MaxVF; 5627 } 5628 5629 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5630 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5631 return None; 5632 } 5633 5634 if (TC == 0) { 5635 reportVectorizationFailure( 5636 "Unable to calculate the loop count due to complex control flow", 5637 "unable to calculate the loop count due to complex control flow", 5638 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5639 return None; 5640 } 5641 5642 reportVectorizationFailure( 5643 "Cannot optimize for size and vectorize at the same time.", 5644 "cannot optimize for size and vectorize at the same time. " 5645 "Enable vectorization of this loop with '#pragma clang loop " 5646 "vectorize(enable)' when compiling with -Os/-Oz", 5647 "NoTailLoopWithOptForSize", ORE, TheLoop); 5648 return None; 5649 } 5650 5651 ElementCount 5652 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5653 ElementCount UserVF) { 5654 bool IgnoreScalableUserVF = UserVF.isScalable() && 5655 !TTI.supportsScalableVectors() && 5656 !ForceTargetSupportsScalableVectors; 5657 if (IgnoreScalableUserVF) { 5658 LLVM_DEBUG( 5659 dbgs() << "LV: Ignoring VF=" << UserVF 5660 << " because target does not support scalable vectors.\n"); 5661 ORE->emit([&]() { 5662 return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", 5663 TheLoop->getStartLoc(), 5664 TheLoop->getHeader()) 5665 << "Ignoring VF=" << ore::NV("UserVF", UserVF) 5666 << " because target does not support scalable vectors."; 5667 }); 5668 } 5669 5670 // Beyond this point two scenarios are handled. If UserVF isn't specified 5671 // then a suitable VF is chosen. If UserVF is specified and there are 5672 // dependencies, check if it's legal. However, if a UserVF is specified and 5673 // there are no dependencies, then there's nothing to do. 5674 if (UserVF.isNonZero() && !IgnoreScalableUserVF) { 5675 if (!canVectorizeReductions(UserVF)) { 5676 reportVectorizationFailure( 5677 "LV: Scalable vectorization not supported for the reduction " 5678 "operations found in this loop. 
Using fixed-width " 5679 "vectorization instead.", 5680 "Scalable vectorization not supported for the reduction operations " 5681 "found in this loop. Using fixed-width vectorization instead.", 5682 "ScalableVFUnfeasible", ORE, TheLoop); 5683 return computeFeasibleMaxVF( 5684 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); 5685 } 5686 5687 if (Legal->isSafeForAnyVectorWidth()) 5688 return UserVF; 5689 } 5690 5691 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5692 unsigned SmallestType, WidestType; 5693 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5694 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5695 5696 // Get the maximum safe dependence distance in bits computed by LAA. 5697 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5698 // the memory accesses that is most restrictive (involved in the smallest 5699 // dependence distance). 5700 unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); 5701 5702 // If the user vectorization factor is legally unsafe, clamp it to a safe 5703 // value. Otherwise, return as is. 5704 if (UserVF.isNonZero() && !IgnoreScalableUserVF) { 5705 unsigned MaxSafeElements = 5706 PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); 5707 ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); 5708 5709 if (UserVF.isScalable()) { 5710 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5711 5712 // Scale VF by vscale before checking if it's safe. 5713 MaxSafeVF = ElementCount::getScalable( 5714 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5715 5716 if (MaxSafeVF.isZero()) { 5717 // The dependence distance is too small to use scalable vectors, 5718 // fallback on fixed. 5719 LLVM_DEBUG( 5720 dbgs() 5721 << "LV: Max legal vector width too small, scalable vectorization " 5722 "unfeasible. Using fixed-width vectorization instead.\n"); 5723 ORE->emit([&]() { 5724 return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", 5725 TheLoop->getStartLoc(), 5726 TheLoop->getHeader()) 5727 << "Max legal vector width too small, scalable vectorization " 5728 << "unfeasible. Using fixed-width vectorization instead."; 5729 }); 5730 return computeFeasibleMaxVF( 5731 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); 5732 } 5733 } 5734 5735 LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); 5736 5737 if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) 5738 return UserVF; 5739 5740 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5741 << " is unsafe, clamping to max safe VF=" << MaxSafeVF 5742 << ".\n"); 5743 ORE->emit([&]() { 5744 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5745 TheLoop->getStartLoc(), 5746 TheLoop->getHeader()) 5747 << "User-specified vectorization factor " 5748 << ore::NV("UserVectorizationFactor", UserVF) 5749 << " is unsafe, clamping to maximum safe vectorization factor " 5750 << ore::NV("VectorizationFactor", MaxSafeVF); 5751 }); 5752 return MaxSafeVF; 5753 } 5754 5755 WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); 5756 5757 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5758 // Note that both WidestRegister and WidestType may not be a powers of 2. 
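  // For illustration (assumed, target-dependent numbers): with a 256-bit
  // widest register and a 32-bit widest scalar type, PowerOf2Floor(256 / 32)
  // gives a MaxVectorSize of 8 lanes. If the maximum safe dependence width
  // were only 96 bits, WidestRegister would already have been clamped to 96
  // above and the result would be PowerOf2Floor(96 / 32) = 2 lanes.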
5759 auto MaxVectorSize = 5760 ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType)); 5761 5762 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5763 << " / " << WidestType << " bits.\n"); 5764 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5765 << WidestRegister << " bits.\n"); 5766 5767 assert(MaxVectorSize.getFixedValue() <= WidestRegister && 5768 "Did not expect to pack so many elements" 5769 " into one vector!"); 5770 if (MaxVectorSize.getFixedValue() == 0) { 5771 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5772 return ElementCount::getFixed(1); 5773 } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() && 5774 isPowerOf2_32(ConstTripCount)) { 5775 // We need to clamp the VF to be the ConstTripCount. There is no point in 5776 // choosing a higher viable VF as done in the loop below. 5777 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5778 << ConstTripCount << "\n"); 5779 return ElementCount::getFixed(ConstTripCount); 5780 } 5781 5782 ElementCount MaxVF = MaxVectorSize; 5783 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5784 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5785 // Collect all viable vectorization factors larger than the default MaxVF 5786 // (i.e. MaxVectorSize). 5787 SmallVector<ElementCount, 8> VFs; 5788 auto MaxVectorSizeMaxBW = 5789 ElementCount::getFixed(WidestRegister / SmallestType); 5790 for (ElementCount VS = MaxVectorSize * 2; 5791 ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2) 5792 VFs.push_back(VS); 5793 5794 // For each VF calculate its register usage. 5795 auto RUs = calculateRegisterUsage(VFs); 5796 5797 // Select the largest VF which doesn't require more registers than existing 5798 // ones. 5799 for (int i = RUs.size() - 1; i >= 0; --i) { 5800 bool Selected = true; 5801 for (auto &pair : RUs[i].MaxLocalUsers) { 5802 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5803 if (pair.second > TargetNumRegisters) 5804 Selected = false; 5805 } 5806 if (Selected) { 5807 MaxVF = VFs[i]; 5808 break; 5809 } 5810 } 5811 if (ElementCount MinVF = 5812 TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) { 5813 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5814 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5815 << ") with target's minimum: " << MinVF << '\n'); 5816 MaxVF = MinVF; 5817 } 5818 } 5819 } 5820 return MaxVF; 5821 } 5822 5823 VectorizationFactor 5824 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 5825 // FIXME: This can be fixed for scalable vectors later, because at this stage 5826 // the LoopVectorizer will only consider vectorizing a loop with scalable 5827 // vectors when the loop has a hint to enable vectorization for a given VF. 5828 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 5829 5830 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5831 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5832 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5833 5834 auto Width = ElementCount::getFixed(1); 5835 const float ScalarCost = *ExpectedCost.getValue(); 5836 float Cost = ScalarCost; 5837 5838 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5839 if (ForceVectorization && MaxVF.isVector()) { 5840 // Ignore scalar width, because the user explicitly wants vectorization. 
5841 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5842 // evaluation. 5843 Cost = std::numeric_limits<float>::max(); 5844 } 5845 5846 for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF); 5847 i *= 2) { 5848 // Notice that the vector loop needs to be executed less times, so 5849 // we need to divide the cost of the vector loops by the width of 5850 // the vector elements. 5851 VectorizationCostTy C = expectedCost(i); 5852 assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); 5853 float VectorCost = *C.first.getValue() / (float)i.getFixedValue(); 5854 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5855 << " costs: " << (int)VectorCost << ".\n"); 5856 if (!C.second && !ForceVectorization) { 5857 LLVM_DEBUG( 5858 dbgs() << "LV: Not considering vector loop of width " << i 5859 << " because it will not generate any vector instructions.\n"); 5860 continue; 5861 } 5862 5863 // If profitable add it to ProfitableVF list. 5864 if (VectorCost < ScalarCost) { 5865 ProfitableVFs.push_back(VectorizationFactor( 5866 {i, (unsigned)VectorCost})); 5867 } 5868 5869 if (VectorCost < Cost) { 5870 Cost = VectorCost; 5871 Width = i; 5872 } 5873 } 5874 5875 if (!EnableCondStoresVectorization && NumPredStores) { 5876 reportVectorizationFailure("There are conditional stores.", 5877 "store that is conditionally executed prevents vectorization", 5878 "ConditionalStore", ORE, TheLoop); 5879 Width = ElementCount::getFixed(1); 5880 Cost = ScalarCost; 5881 } 5882 5883 LLVM_DEBUG(if (ForceVectorization && !Width.isScalar() && Cost >= ScalarCost) dbgs() 5884 << "LV: Vectorization seems to be not beneficial, " 5885 << "but was forced by a user.\n"); 5886 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5887 VectorizationFactor Factor = {Width, 5888 (unsigned)(Width.getKnownMinValue() * Cost)}; 5889 return Factor; 5890 } 5891 5892 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5893 const Loop &L, ElementCount VF) const { 5894 // Cross iteration phis such as reductions need special handling and are 5895 // currently unsupported. 5896 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5897 return Legal->isFirstOrderRecurrence(&Phi) || 5898 Legal->isReductionVariable(&Phi); 5899 })) 5900 return false; 5901 5902 // Phis with uses outside of the loop require special handling and are 5903 // currently unsupported. 5904 for (auto &Entry : Legal->getInductionVars()) { 5905 // Look for uses of the value of the induction at the last iteration. 5906 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5907 for (User *U : PostInc->users()) 5908 if (!L.contains(cast<Instruction>(U))) 5909 return false; 5910 // Look for uses of penultimate value of the induction. 5911 for (User *U : Entry.first->users()) 5912 if (!L.contains(cast<Instruction>(U))) 5913 return false; 5914 } 5915 5916 // Induction variables that are widened require special handling that is 5917 // currently not supported. 
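  // For example, an induction whose value would be materialized as a vector
  // per lane (i.e. one that is neither scalar after vectorization nor
  // profitable to scalarize) is such a widened induction; the check below
  // conservatively rejects those loops.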
5918 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5919 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5920 this->isProfitableToScalarize(Entry.first, VF)); 5921 })) 5922 return false; 5923 5924 return true; 5925 } 5926 5927 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5928 const ElementCount VF) const { 5929 // FIXME: We need a much better cost-model to take different parameters such 5930 // as register pressure, code size increase and cost of extra branches into 5931 // account. For now we apply a very crude heuristic and only consider loops 5932 // with vectorization factors larger than a certain value. 5933 // We also consider epilogue vectorization unprofitable for targets that don't 5934 // consider interleaving beneficial (eg. MVE). 5935 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5936 return false; 5937 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5938 return true; 5939 return false; 5940 } 5941 5942 VectorizationFactor 5943 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5944 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5945 VectorizationFactor Result = VectorizationFactor::Disabled(); 5946 if (!EnableEpilogueVectorization) { 5947 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5948 return Result; 5949 } 5950 5951 if (!isScalarEpilogueAllowed()) { 5952 LLVM_DEBUG( 5953 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5954 "allowed.\n";); 5955 return Result; 5956 } 5957 5958 // FIXME: This can be fixed for scalable vectors later, because at this stage 5959 // the LoopVectorizer will only consider vectorizing a loop with scalable 5960 // vectors when the loop has a hint to enable vectorization for a given VF. 5961 if (MainLoopVF.isScalable()) { 5962 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 5963 "yet supported.\n"); 5964 return Result; 5965 } 5966 5967 // Not really a cost consideration, but check for unsupported cases here to 5968 // simplify the logic. 
5969 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5970 LLVM_DEBUG( 5971 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5972 "not a supported candidate.\n";); 5973 return Result; 5974 } 5975 5976 if (EpilogueVectorizationForceVF > 1) { 5977 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5978 if (LVP.hasPlanWithVFs( 5979 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 5980 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 5981 else { 5982 LLVM_DEBUG( 5983 dbgs() 5984 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5985 return Result; 5986 } 5987 } 5988 5989 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5990 TheLoop->getHeader()->getParent()->hasMinSize()) { 5991 LLVM_DEBUG( 5992 dbgs() 5993 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5994 return Result; 5995 } 5996 5997 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 5998 return Result; 5999 6000 for (auto &NextVF : ProfitableVFs) 6001 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6002 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && 6003 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6004 Result = NextVF; 6005 6006 if (Result != VectorizationFactor::Disabled()) 6007 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6008 << Result.Width.getFixedValue() << "\n";); 6009 return Result; 6010 } 6011 6012 std::pair<unsigned, unsigned> 6013 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6014 unsigned MinWidth = -1U; 6015 unsigned MaxWidth = 8; 6016 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6017 6018 // For each block. 6019 for (BasicBlock *BB : TheLoop->blocks()) { 6020 // For each instruction in the loop. 6021 for (Instruction &I : BB->instructionsWithoutDebug()) { 6022 Type *T = I.getType(); 6023 6024 // Skip ignored values. 6025 if (ValuesToIgnore.count(&I)) 6026 continue; 6027 6028 // Only examine Loads, Stores and PHINodes. 6029 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6030 continue; 6031 6032 // Examine PHI nodes that are reduction variables. Update the type to 6033 // account for the recurrence type. 6034 if (auto *PN = dyn_cast<PHINode>(&I)) { 6035 if (!Legal->isReductionVariable(PN)) 6036 continue; 6037 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 6038 if (PreferInLoopReductions || 6039 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6040 RdxDesc.getRecurrenceType(), 6041 TargetTransformInfo::ReductionFlags())) 6042 continue; 6043 T = RdxDesc.getRecurrenceType(); 6044 } 6045 6046 // Examine the stored values. 6047 if (auto *ST = dyn_cast<StoreInst>(&I)) 6048 T = ST->getValueOperand()->getType(); 6049 6050 // Ignore loaded pointer types and stored pointer types that are not 6051 // vectorizable. 6052 // 6053 // FIXME: The check here attempts to predict whether a load or store will 6054 // be vectorized. We only know this for certain after a VF has 6055 // been selected. Here, we assume that if an access can be 6056 // vectorized, it will be. We should also look at extending this 6057 // optimization to non-pointer types. 
6058 // 6059 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6060 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6061 continue; 6062 6063 MinWidth = std::min(MinWidth, 6064 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6065 MaxWidth = std::max(MaxWidth, 6066 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6067 } 6068 } 6069 6070 return {MinWidth, MaxWidth}; 6071 } 6072 6073 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6074 unsigned LoopCost) { 6075 // -- The interleave heuristics -- 6076 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6077 // There are many micro-architectural considerations that we can't predict 6078 // at this level. For example, frontend pressure (on decode or fetch) due to 6079 // code size, or the number and capabilities of the execution ports. 6080 // 6081 // We use the following heuristics to select the interleave count: 6082 // 1. If the code has reductions, then we interleave to break the cross 6083 // iteration dependency. 6084 // 2. If the loop is really small, then we interleave to reduce the loop 6085 // overhead. 6086 // 3. We don't interleave if we think that we will spill registers to memory 6087 // due to the increased register pressure. 6088 6089 if (!isScalarEpilogueAllowed()) 6090 return 1; 6091 6092 // We used the distance for the interleave count. 6093 if (Legal->getMaxSafeDepDistBytes() != -1U) 6094 return 1; 6095 6096 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6097 const bool HasReductions = !Legal->getReductionVars().empty(); 6098 // Do not interleave loops with a relatively small known or estimated trip 6099 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6100 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6101 // because with the above conditions interleaving can expose ILP and break 6102 // cross iteration dependences for reductions. 6103 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6104 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6105 return 1; 6106 6107 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6108 // We divide by these constants so assume that we have at least one 6109 // instruction that uses at least one register. 6110 for (auto& pair : R.MaxLocalUsers) { 6111 pair.second = std::max(pair.second, 1U); 6112 } 6113 6114 // We calculate the interleave count using the following formula. 6115 // Subtract the number of loop invariants from the number of available 6116 // registers. These registers are used by all of the interleaved instances. 6117 // Next, divide the remaining registers by the number of registers that is 6118 // required by the loop, in order to estimate how many parallel instances 6119 // fit without causing spills. All of this is rounded down if necessary to be 6120 // a power of two. We want power of two interleave count to simplify any 6121 // addressing operations or alignment considerations. 6122 // We also want power of two interleave counts to ensure that the induction 6123 // variable of the vector loop wraps to zero, when tail is folded by masking; 6124 // this currently happens when OptForSize, in which case IC is set to 1 above. 
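  // Worked example of the formula below (register counts are illustrative
  // only): with 32 registers in a class, 2 of them held by loop-invariant
  // values and a maximum local usage of 5, the estimate is
  //   PowerOf2Floor((32 - 2) / 5) = 4
  // interleaved copies. The EnableIndVarRegisterHeur variant additionally
  // discounts one register and one local user for the induction variable.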
6125 unsigned IC = UINT_MAX; 6126 6127 for (auto& pair : R.MaxLocalUsers) { 6128 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6129 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6130 << " registers of " 6131 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6132 if (VF.isScalar()) { 6133 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6134 TargetNumRegisters = ForceTargetNumScalarRegs; 6135 } else { 6136 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6137 TargetNumRegisters = ForceTargetNumVectorRegs; 6138 } 6139 unsigned MaxLocalUsers = pair.second; 6140 unsigned LoopInvariantRegs = 0; 6141 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6142 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6143 6144 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6145 // Don't count the induction variable as interleaved. 6146 if (EnableIndVarRegisterHeur) { 6147 TmpIC = 6148 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6149 std::max(1U, (MaxLocalUsers - 1))); 6150 } 6151 6152 IC = std::min(IC, TmpIC); 6153 } 6154 6155 // Clamp the interleave ranges to reasonable counts. 6156 unsigned MaxInterleaveCount = 6157 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6158 6159 // Check if the user has overridden the max. 6160 if (VF.isScalar()) { 6161 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6162 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6163 } else { 6164 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6165 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6166 } 6167 6168 // If trip count is known or estimated compile time constant, limit the 6169 // interleave count to be less than the trip count divided by VF, provided it 6170 // is at least 1. 6171 // 6172 // For scalable vectors we can't know if interleaving is beneficial. It may 6173 // not be beneficial for small loops if none of the lanes in the second vector 6174 // iterations is enabled. However, for larger loops, there is likely to be a 6175 // similar benefit as for fixed-width vectors. For now, we choose to leave 6176 // the InterleaveCount as if vscale is '1', although if some information about 6177 // the vector is known (e.g. min vector size), we can make a better decision. 6178 if (BestKnownTC) { 6179 MaxInterleaveCount = 6180 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6181 // Make sure MaxInterleaveCount is greater than 0. 6182 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6183 } 6184 6185 assert(MaxInterleaveCount > 0 && 6186 "Maximum interleave count must be greater than 0"); 6187 6188 // Clamp the calculated IC to be between the 1 and the max interleave count 6189 // that the target and trip count allows. 6190 if (IC > MaxInterleaveCount) 6191 IC = MaxInterleaveCount; 6192 else 6193 // Make sure IC is greater than 0. 6194 IC = std::max(1u, IC); 6195 6196 assert(IC > 0 && "Interleave count must be greater than 0."); 6197 6198 // If we did not calculate the cost for VF (because the user selected the VF) 6199 // then we calculate the cost of VF here. 6200 if (LoopCost == 0) { 6201 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6202 LoopCost = *expectedCost(VF).first.getValue(); 6203 } 6204 6205 assert(LoopCost && "Non-zero loop cost expected"); 6206 6207 // Interleave if we vectorized this loop and there is a reduction that could 6208 // benefit from interleaving. 
6209 if (VF.isVector() && HasReductions) { 6210 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6211 return IC; 6212 } 6213 6214 // Note that if we've already vectorized the loop we will have done the 6215 // runtime check and so interleaving won't require further checks. 6216 bool InterleavingRequiresRuntimePointerCheck = 6217 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6218 6219 // We want to interleave small loops in order to reduce the loop overhead and 6220 // potentially expose ILP opportunities. 6221 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6222 << "LV: IC is " << IC << '\n' 6223 << "LV: VF is " << VF << '\n'); 6224 const bool AggressivelyInterleaveReductions = 6225 TTI.enableAggressiveInterleaving(HasReductions); 6226 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6227 // We assume that the cost overhead is 1 and we use the cost model 6228 // to estimate the cost of the loop and interleave until the cost of the 6229 // loop overhead is about 5% of the cost of the loop. 6230 unsigned SmallIC = 6231 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6232 6233 // Interleave until store/load ports (estimated by max interleave count) are 6234 // saturated. 6235 unsigned NumStores = Legal->getNumStores(); 6236 unsigned NumLoads = Legal->getNumLoads(); 6237 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6238 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6239 6240 // If we have a scalar reduction (vector reductions are already dealt with 6241 // by this point), we can increase the critical path length if the loop 6242 // we're interleaving is inside another loop. Limit, by default to 2, so the 6243 // critical path only gets increased by one reduction operation. 6244 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6245 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6246 SmallIC = std::min(SmallIC, F); 6247 StoresIC = std::min(StoresIC, F); 6248 LoadsIC = std::min(LoadsIC, F); 6249 } 6250 6251 if (EnableLoadStoreRuntimeInterleave && 6252 std::max(StoresIC, LoadsIC) > SmallIC) { 6253 LLVM_DEBUG( 6254 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6255 return std::max(StoresIC, LoadsIC); 6256 } 6257 6258 // If there are scalar reductions and TTI has enabled aggressive 6259 // interleaving for reductions, we will interleave to expose ILP. 6260 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6261 AggressivelyInterleaveReductions) { 6262 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6263 // Interleave no less than SmallIC but not as aggressive as the normal IC 6264 // to satisfy the rare situation when resources are too limited. 6265 return std::max(IC / 2, SmallIC); 6266 } else { 6267 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6268 return SmallIC; 6269 } 6270 } 6271 6272 // Interleave if this is a large loop (small loops are already dealt with by 6273 // this point) that could benefit from interleaving. 6274 if (AggressivelyInterleaveReductions) { 6275 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6276 return IC; 6277 } 6278 6279 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6280 return 1; 6281 } 6282 6283 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6284 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6285 // This function calculates the register usage by measuring the highest number 6286 // of values that are alive at a single location. 
Obviously, this is a very
  // rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are met before
  // their users. We assume that each instruction that has in-loop users
  // starts an interval. We record every time that an in-loop value is used,
  // so we have a list of the first and last occurrences of each instruction.
  // Next, we transpose this data structure into a multi-map that holds the
  // list of intervals that *end* at a specific location. This multi-map
  // allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but
  // are used inside the loop. We need this number separately from the
  // max-interval usage number because when we unroll, loop-invariant values
  // do not take more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Interval.first);

  SmallPtrSet<Instruction *, 8> OpenIntervals;
  SmallVector<RegisterUsage, 8> RUs(VFs.size());
  SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  // A lambda that gets the register usage for the given type and VF.
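  // For example (illustrative, target-dependent): on a target with 128-bit
  // vector registers, an i32 element at VF = 8 forms a <8 x i32> value that
  // would typically be reported as occupying two registers, while token types
  // and other invalid vector element types are counted as zero below.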
6362 const auto &TTICapture = TTI; 6363 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6364 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6365 return 0U; 6366 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6367 }; 6368 6369 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6370 Instruction *I = IdxToInstr[i]; 6371 6372 // Remove all of the instructions that end at this location. 6373 InstrList &List = TransposeEnds[i]; 6374 for (Instruction *ToRemove : List) 6375 OpenIntervals.erase(ToRemove); 6376 6377 // Ignore instructions that are never used within the loop. 6378 if (!Ends.count(I)) 6379 continue; 6380 6381 // Skip ignored values. 6382 if (ValuesToIgnore.count(I)) 6383 continue; 6384 6385 // For each VF find the maximum usage of registers. 6386 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6387 // Count the number of live intervals. 6388 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6389 6390 if (VFs[j].isScalar()) { 6391 for (auto Inst : OpenIntervals) { 6392 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6393 if (RegUsage.find(ClassID) == RegUsage.end()) 6394 RegUsage[ClassID] = 1; 6395 else 6396 RegUsage[ClassID] += 1; 6397 } 6398 } else { 6399 collectUniformsAndScalars(VFs[j]); 6400 for (auto Inst : OpenIntervals) { 6401 // Skip ignored values for VF > 1. 6402 if (VecValuesToIgnore.count(Inst)) 6403 continue; 6404 if (isScalarAfterVectorization(Inst, VFs[j])) { 6405 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6406 if (RegUsage.find(ClassID) == RegUsage.end()) 6407 RegUsage[ClassID] = 1; 6408 else 6409 RegUsage[ClassID] += 1; 6410 } else { 6411 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6412 if (RegUsage.find(ClassID) == RegUsage.end()) 6413 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6414 else 6415 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6416 } 6417 } 6418 } 6419 6420 for (auto& pair : RegUsage) { 6421 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6422 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6423 else 6424 MaxUsages[j][pair.first] = pair.second; 6425 } 6426 } 6427 6428 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6429 << OpenIntervals.size() << '\n'); 6430 6431 // Add the current instruction to the list of open intervals. 6432 OpenIntervals.insert(I); 6433 } 6434 6435 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6436 SmallMapVector<unsigned, unsigned, 4> Invariant; 6437 6438 for (auto Inst : LoopInvariants) { 6439 unsigned Usage = 6440 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6441 unsigned ClassID = 6442 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6443 if (Invariant.find(ClassID) == Invariant.end()) 6444 Invariant[ClassID] = Usage; 6445 else 6446 Invariant[ClassID] += Usage; 6447 } 6448 6449 LLVM_DEBUG({ 6450 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6451 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6452 << " item\n"; 6453 for (const auto &pair : MaxUsages[i]) { 6454 dbgs() << "LV(REG): RegisterClass: " 6455 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6456 << " registers\n"; 6457 } 6458 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6459 << " item\n"; 6460 for (const auto &pair : Invariant) { 6461 dbgs() << "LV(REG): RegisterClass: " 6462 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6463 << " registers\n"; 6464 } 6465 }); 6466 6467 RU.LoopInvariantRegs = Invariant; 6468 RU.MaxLocalUsers = MaxUsages[i]; 6469 RUs[i] = RU; 6470 } 6471 6472 return RUs; 6473 } 6474 6475 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6476 // TODO: Cost model for emulated masked load/store is completely 6477 // broken. This hack guides the cost model to use an artificially 6478 // high enough value to practically disable vectorization with such 6479 // operations, except where previously deployed legality hack allowed 6480 // using very low cost values. This is to avoid regressions coming simply 6481 // from moving "masked load/store" check from legality to cost model. 6482 // Masked Load/Gather emulation was previously never allowed. 6483 // Limited number of Masked Store/Scatter emulation was allowed. 6484 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 6485 return isa<LoadInst>(I) || 6486 (isa<StoreInst>(I) && 6487 NumPredStores > NumberOfStoresToPredicate); 6488 } 6489 6490 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6491 // If we aren't vectorizing the loop, or if we've already collected the 6492 // instructions to scalarize, there's nothing to do. Collection may already 6493 // have occurred if we have a user-selected VF and are now computing the 6494 // expected cost for interleaving. 6495 if (VF.isScalar() || VF.isZero() || 6496 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6497 return; 6498 6499 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6500 // not profitable to scalarize any instructions, the presence of VF in the 6501 // map will indicate that we've analyzed it already. 6502 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6503 6504 // Find all the instructions that are scalar with predication in the loop and 6505 // determine if it would be better to not if-convert the blocks they are in. 6506 // If so, we also record the instructions to scalarize. 6507 for (BasicBlock *BB : TheLoop->blocks()) { 6508 if (!blockNeedsPredication(BB)) 6509 continue; 6510 for (Instruction &I : *BB) 6511 if (isScalarWithPredication(&I)) { 6512 ScalarCostsTy ScalarCosts; 6513 // Do not apply discount logic if hacked cost is needed 6514 // for emulated masked memrefs. 6515 if (!useEmulatedMaskMemRefHack(&I) && 6516 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6517 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6518 // Remember that BB will remain after vectorization. 
6519 PredicatedBBsAfterVectorization.insert(BB); 6520 } 6521 } 6522 } 6523 6524 int LoopVectorizationCostModel::computePredInstDiscount( 6525 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6526 assert(!isUniformAfterVectorization(PredInst, VF) && 6527 "Instruction marked uniform-after-vectorization will be predicated"); 6528 6529 // Initialize the discount to zero, meaning that the scalar version and the 6530 // vector version cost the same. 6531 InstructionCost Discount = 0; 6532 6533 // Holds instructions to analyze. The instructions we visit are mapped in 6534 // ScalarCosts. Those instructions are the ones that would be scalarized if 6535 // we find that the scalar version costs less. 6536 SmallVector<Instruction *, 8> Worklist; 6537 6538 // Returns true if the given instruction can be scalarized. 6539 auto canBeScalarized = [&](Instruction *I) -> bool { 6540 // We only attempt to scalarize instructions forming a single-use chain 6541 // from the original predicated block that would otherwise be vectorized. 6542 // Although not strictly necessary, we give up on instructions we know will 6543 // already be scalar to avoid traversing chains that are unlikely to be 6544 // beneficial. 6545 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6546 isScalarAfterVectorization(I, VF)) 6547 return false; 6548 6549 // If the instruction is scalar with predication, it will be analyzed 6550 // separately. We ignore it within the context of PredInst. 6551 if (isScalarWithPredication(I)) 6552 return false; 6553 6554 // If any of the instruction's operands are uniform after vectorization, 6555 // the instruction cannot be scalarized. This prevents, for example, a 6556 // masked load from being scalarized. 6557 // 6558 // We assume we will only emit a value for lane zero of an instruction 6559 // marked uniform after vectorization, rather than VF identical values. 6560 // Thus, if we scalarize an instruction that uses a uniform, we would 6561 // create uses of values corresponding to the lanes we aren't emitting code 6562 // for. This behavior can be changed by allowing getScalarValue to clone 6563 // the lane zero values for uniforms rather than asserting. 6564 for (Use &U : I->operands()) 6565 if (auto *J = dyn_cast<Instruction>(U.get())) 6566 if (isUniformAfterVectorization(J, VF)) 6567 return false; 6568 6569 // Otherwise, we can scalarize the instruction. 6570 return true; 6571 }; 6572 6573 // Compute the expected cost discount from scalarizing the entire expression 6574 // feeding the predicated instruction. We currently only consider expressions 6575 // that are single-use instruction chains. 6576 Worklist.push_back(PredInst); 6577 while (!Worklist.empty()) { 6578 Instruction *I = Worklist.pop_back_val(); 6579 6580 // If we've already analyzed the instruction, there's nothing to do. 6581 if (ScalarCosts.find(I) != ScalarCosts.end()) 6582 continue; 6583 6584 // Compute the cost of the vector instruction. Note that this cost already 6585 // includes the scalarization overhead of the predicated instruction. 6586 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6587 6588 // Compute the cost of the scalarized instruction. This cost is the cost of 6589 // the instruction as if it wasn't if-converted and instead remained in the 6590 // predicated block. We will scale this cost by block probability after 6591 // computing the scalarization overhead. 
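    // Illustrative example (costs are made up): if the vector form of the
    // chain costs 12 and the scalarized form, after the block-probability
    // scaling below, costs 8, the discount becomes 4; the per-instruction
    // scalar costs are recorded in ScalarCosts, and the caller treats a
    // non-negative total discount as a win for scalarization.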
6592 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6593 InstructionCost ScalarCost = 6594 VF.getKnownMinValue() * 6595 getInstructionCost(I, ElementCount::getFixed(1)).first; 6596 6597 // Compute the scalarization overhead of needed insertelement instructions 6598 // and phi nodes. 6599 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6600 ScalarCost += TTI.getScalarizationOverhead( 6601 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6602 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6603 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6604 ScalarCost += 6605 VF.getKnownMinValue() * 6606 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6607 } 6608 6609 // Compute the scalarization overhead of needed extractelement 6610 // instructions. For each of the instruction's operands, if the operand can 6611 // be scalarized, add it to the worklist; otherwise, account for the 6612 // overhead. 6613 for (Use &U : I->operands()) 6614 if (auto *J = dyn_cast<Instruction>(U.get())) { 6615 assert(VectorType::isValidElementType(J->getType()) && 6616 "Instruction has non-scalar type"); 6617 if (canBeScalarized(J)) 6618 Worklist.push_back(J); 6619 else if (needsExtract(J, VF)) { 6620 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6621 ScalarCost += TTI.getScalarizationOverhead( 6622 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6623 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6624 } 6625 } 6626 6627 // Scale the total scalar cost by block probability. 6628 ScalarCost /= getReciprocalPredBlockProb(); 6629 6630 // Compute the discount. A non-negative discount means the vector version 6631 // of the instruction costs more, and scalarizing would be beneficial. 6632 Discount += VectorCost - ScalarCost; 6633 ScalarCosts[I] = ScalarCost; 6634 } 6635 6636 return *Discount.getValue(); 6637 } 6638 6639 LoopVectorizationCostModel::VectorizationCostTy 6640 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6641 VectorizationCostTy Cost; 6642 6643 // For each block. 6644 for (BasicBlock *BB : TheLoop->blocks()) { 6645 VectorizationCostTy BlockCost; 6646 6647 // For each instruction in the old loop. 6648 for (Instruction &I : BB->instructionsWithoutDebug()) { 6649 // Skip ignored values. 6650 if (ValuesToIgnore.count(&I) || 6651 (VF.isVector() && VecValuesToIgnore.count(&I))) 6652 continue; 6653 6654 VectorizationCostTy C = getInstructionCost(&I, VF); 6655 6656 // Check if we should override the cost. 6657 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6658 C.first = InstructionCost(ForceTargetInstructionCost); 6659 6660 BlockCost.first += C.first; 6661 BlockCost.second |= C.second; 6662 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6663 << " for VF " << VF << " For instruction: " << I 6664 << '\n'); 6665 } 6666 6667 // If we are vectorizing a predicated block, it will have been 6668 // if-converted. This means that the block's instructions (aside from 6669 // stores and instructions that may divide by zero) will now be 6670 // unconditionally executed. For the scalar case, we may not always execute 6671 // the predicated block, if it is an if-else block. Thus, scale the block's 6672 // cost by the probability of executing it. blockNeedsPredication from 6673 // Legal is used so as to not include all blocks in tail folded loops. 
6674 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6675 BlockCost.first /= getReciprocalPredBlockProb(); 6676 6677 Cost.first += BlockCost.first; 6678 Cost.second |= BlockCost.second; 6679 } 6680 6681 return Cost; 6682 } 6683 6684 /// Gets Address Access SCEV after verifying that the access pattern 6685 /// is loop invariant except the induction variable dependence. 6686 /// 6687 /// This SCEV can be sent to the Target in order to estimate the address 6688 /// calculation cost. 6689 static const SCEV *getAddressAccessSCEV( 6690 Value *Ptr, 6691 LoopVectorizationLegality *Legal, 6692 PredicatedScalarEvolution &PSE, 6693 const Loop *TheLoop) { 6694 6695 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6696 if (!Gep) 6697 return nullptr; 6698 6699 // We are looking for a gep with all loop invariant indices except for one 6700 // which should be an induction variable. 6701 auto SE = PSE.getSE(); 6702 unsigned NumOperands = Gep->getNumOperands(); 6703 for (unsigned i = 1; i < NumOperands; ++i) { 6704 Value *Opd = Gep->getOperand(i); 6705 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6706 !Legal->isInductionVariable(Opd)) 6707 return nullptr; 6708 } 6709 6710 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6711 return PSE.getSCEV(Ptr); 6712 } 6713 6714 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6715 return Legal->hasStride(I->getOperand(0)) || 6716 Legal->hasStride(I->getOperand(1)); 6717 } 6718 6719 InstructionCost 6720 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6721 ElementCount VF) { 6722 assert(VF.isVector() && 6723 "Scalarization cost of instruction implies vectorization."); 6724 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6725 Type *ValTy = getMemInstValueType(I); 6726 auto SE = PSE.getSE(); 6727 6728 unsigned AS = getLoadStoreAddressSpace(I); 6729 Value *Ptr = getLoadStorePointerOperand(I); 6730 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6731 6732 // Figure out whether the access is strided and get the stride value 6733 // if it's known in compile time 6734 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6735 6736 // Get the cost of the scalar memory instruction and address computation. 6737 InstructionCost Cost = 6738 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6739 6740 // Don't pass *I here, since it is scalar but will actually be part of a 6741 // vectorized loop where the user of it is a vectorized instruction. 6742 const Align Alignment = getLoadStoreAlignment(I); 6743 Cost += VF.getKnownMinValue() * 6744 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6745 AS, TTI::TCK_RecipThroughput); 6746 6747 // Get the overhead of the extractelement and insertelement instructions 6748 // we might create due to scalarization. 6749 Cost += getScalarizationOverhead(I, VF); 6750 6751 // If we have a predicated load/store, it will need extra i1 extracts and 6752 // conditional branches, but may not be executed for each vector lane. Scale 6753 // the cost by the probability of executing the predicated block. 
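  // In outline, the scalarization cost computed above is roughly
  //   VF * (address computation + scalar memory op) + extract/insert overhead,
  // and for predicated accesses it is then scaled down below by the
  // probability of executing the predicated block; the emulated-mask hack
  // finally overrides it with a prohibitively large value.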
6754 if (isPredicatedInst(I)) { 6755 Cost /= getReciprocalPredBlockProb(); 6756 6757 // Add the cost of an i1 extract and a branch 6758 auto *Vec_i1Ty = 6759 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6760 Cost += TTI.getScalarizationOverhead( 6761 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6762 /*Insert=*/false, /*Extract=*/true); 6763 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6764 6765 if (useEmulatedMaskMemRefHack(I)) 6766 // Artificially setting to a high enough value to practically disable 6767 // vectorization with such operations. 6768 Cost = 3000000; 6769 } 6770 6771 return Cost; 6772 } 6773 6774 InstructionCost 6775 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6776 ElementCount VF) { 6777 Type *ValTy = getMemInstValueType(I); 6778 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6779 Value *Ptr = getLoadStorePointerOperand(I); 6780 unsigned AS = getLoadStoreAddressSpace(I); 6781 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6782 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6783 6784 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6785 "Stride should be 1 or -1 for consecutive memory access"); 6786 const Align Alignment = getLoadStoreAlignment(I); 6787 InstructionCost Cost = 0; 6788 if (Legal->isMaskRequired(I)) 6789 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6790 CostKind); 6791 else 6792 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6793 CostKind, I); 6794 6795 bool Reverse = ConsecutiveStride < 0; 6796 if (Reverse) 6797 Cost += 6798 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6799 return Cost; 6800 } 6801 6802 InstructionCost 6803 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6804 ElementCount VF) { 6805 assert(Legal->isUniformMemOp(*I)); 6806 6807 Type *ValTy = getMemInstValueType(I); 6808 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6809 const Align Alignment = getLoadStoreAlignment(I); 6810 unsigned AS = getLoadStoreAddressSpace(I); 6811 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6812 if (isa<LoadInst>(I)) { 6813 return TTI.getAddressComputationCost(ValTy) + 6814 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6815 CostKind) + 6816 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6817 } 6818 StoreInst *SI = cast<StoreInst>(I); 6819 6820 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6821 return TTI.getAddressComputationCost(ValTy) + 6822 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6823 CostKind) + 6824 (isLoopInvariantStoreValue 6825 ? 
0
6826               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6827                                         VF.getKnownMinValue() - 1));
6828 }
6829
6830 InstructionCost
6831 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6832                                                  ElementCount VF) {
6833   Type *ValTy = getMemInstValueType(I);
6834   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6835   const Align Alignment = getLoadStoreAlignment(I);
6836   const Value *Ptr = getLoadStorePointerOperand(I);
6837
6838   return TTI.getAddressComputationCost(VectorTy) +
6839          TTI.getGatherScatterOpCost(
6840              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6841              TargetTransformInfo::TCK_RecipThroughput, I);
6842 }
6843
6844 InstructionCost
6845 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6846                                                    ElementCount VF) {
6847   // TODO: Once we have support for interleaving with scalable vectors
6848   // we can calculate the cost properly here.
6849   if (VF.isScalable())
6850     return InstructionCost::getInvalid();
6851
6852   Type *ValTy = getMemInstValueType(I);
6853   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6854   unsigned AS = getLoadStoreAddressSpace(I);
6855
6856   auto Group = getInterleavedAccessGroup(I);
6857   assert(Group && "Fail to get an interleaved access group.");
6858
6859   unsigned InterleaveFactor = Group->getFactor();
6860   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6861
6862   // Holds the indices of existing members in an interleaved load group.
6863   // An interleaved store group doesn't need this as it doesn't allow gaps.
6864   SmallVector<unsigned, 4> Indices;
6865   if (isa<LoadInst>(I)) {
6866     for (unsigned i = 0; i < InterleaveFactor; i++)
6867       if (Group->getMember(i))
6868         Indices.push_back(i);
6869   }
6870
6871   // Calculate the cost of the whole interleaved group.
6872   bool UseMaskForGaps =
6873       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6874   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6875       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6876       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6877
6878   if (Group->isReverse()) {
6879     // TODO: Add support for reversed masked interleaved access.
6880     assert(!Legal->isMaskRequired(I) &&
6881            "Reverse masked interleaved access not supported.");
6882     Cost +=
6883         Group->getNumMembers() *
6884         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6885   }
6886   return Cost;
6887 }
6888
6889 InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
6890     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6891   // Early exit for no inloop reductions
6892   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6893     return InstructionCost::getInvalid();
6894   auto *VectorTy = cast<VectorType>(Ty);
6895
6896   // We are looking for a pattern of, and finding the minimal acceptable cost:
6897   // reduce(mul(ext(A), ext(B))) or
6898   // reduce(mul(A, B)) or
6899   // reduce(ext(A)) or
6900   // reduce(A).
6901   // The basic idea is that we walk down the tree to do that, finding the root
6902   // reduction instruction in InLoopReductionImmediateChains. From there we find
6903   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6904   // of the components. If the reduction cost is lower, we return it for the
6905   // reduction instruction and 0 for the other instructions in the pattern. If
6906   // it is not, we return an invalid cost specifying that the original cost
6907   // method should be used.
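  // As an illustrative example (hypothetical IR, not taken from a test), an
  // extending multiply-add reduction accumulating i8 inputs into i32 looks
  // like:
  //   %ea = sext i8 %a to i32
  //   %eb = sext i8 %b to i32
  //   %m  = mul i32 %ea, %eb
  //   %r  = add i32 %m, %acc
  // If the target reports a cheaper cost for the whole reduce(mul(ext, ext))
  // pattern via getExtendedAddReductionCost, the add carries that combined
  // cost and the mul/ext instructions are costed as 0 below.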
6908 Instruction *RetI = I; 6909 if ((RetI->getOpcode() == Instruction::SExt || 6910 RetI->getOpcode() == Instruction::ZExt)) { 6911 if (!RetI->hasOneUser()) 6912 return InstructionCost::getInvalid(); 6913 RetI = RetI->user_back(); 6914 } 6915 if (RetI->getOpcode() == Instruction::Mul && 6916 RetI->user_back()->getOpcode() == Instruction::Add) { 6917 if (!RetI->hasOneUser()) 6918 return InstructionCost::getInvalid(); 6919 RetI = RetI->user_back(); 6920 } 6921 6922 // Test if the found instruction is a reduction, and if not return an invalid 6923 // cost specifying the parent to use the original cost modelling. 6924 if (!InLoopReductionImmediateChains.count(RetI)) 6925 return InstructionCost::getInvalid(); 6926 6927 // Find the reduction this chain is a part of and calculate the basic cost of 6928 // the reduction on its own. 6929 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6930 Instruction *ReductionPhi = LastChain; 6931 while (!isa<PHINode>(ReductionPhi)) 6932 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6933 6934 RecurrenceDescriptor RdxDesc = 6935 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 6936 unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), 6937 VectorTy, false, CostKind); 6938 6939 // Get the operand that was not the reduction chain and match it to one of the 6940 // patterns, returning the better cost if it is found. 6941 Instruction *RedOp = RetI->getOperand(1) == LastChain 6942 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6943 : dyn_cast<Instruction>(RetI->getOperand(1)); 6944 6945 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6946 6947 if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && 6948 !TheLoop->isLoopInvariant(RedOp)) { 6949 bool IsUnsigned = isa<ZExtInst>(RedOp); 6950 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6951 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6952 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6953 CostKind); 6954 6955 unsigned ExtCost = 6956 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6957 TTI::CastContextHint::None, CostKind, RedOp); 6958 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6959 return I == RetI ? *RedCost.getValue() : 0; 6960 } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { 6961 Instruction *Mul = RedOp; 6962 Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); 6963 Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); 6964 if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && 6965 Op0->getOpcode() == Op1->getOpcode() && 6966 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6967 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6968 bool IsUnsigned = isa<ZExtInst>(Op0); 6969 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6970 // reduce(mul(ext, ext)) 6971 unsigned ExtCost = 6972 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 6973 TTI::CastContextHint::None, CostKind, Op0); 6974 InstructionCost MulCost = 6975 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 6976 6977 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6978 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6979 CostKind); 6980 6981 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 6982 return I == RetI ? 
*RedCost.getValue() : 0; 6983 } else { 6984 InstructionCost MulCost = 6985 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 6986 6987 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6988 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 6989 CostKind); 6990 6991 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6992 return I == RetI ? *RedCost.getValue() : 0; 6993 } 6994 } 6995 6996 return I == RetI ? BaseCost : InstructionCost::getInvalid(); 6997 } 6998 6999 InstructionCost 7000 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7001 ElementCount VF) { 7002 // Calculate scalar cost only. Vectorization cost should be ready at this 7003 // moment. 7004 if (VF.isScalar()) { 7005 Type *ValTy = getMemInstValueType(I); 7006 const Align Alignment = getLoadStoreAlignment(I); 7007 unsigned AS = getLoadStoreAddressSpace(I); 7008 7009 return TTI.getAddressComputationCost(ValTy) + 7010 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7011 TTI::TCK_RecipThroughput, I); 7012 } 7013 return getWideningCost(I, VF); 7014 } 7015 7016 LoopVectorizationCostModel::VectorizationCostTy 7017 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7018 ElementCount VF) { 7019 // If we know that this instruction will remain uniform, check the cost of 7020 // the scalar version. 7021 if (isUniformAfterVectorization(I, VF)) 7022 VF = ElementCount::getFixed(1); 7023 7024 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7025 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7026 7027 // Forced scalars do not have any scalarization overhead. 7028 auto ForcedScalar = ForcedScalars.find(VF); 7029 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7030 auto InstSet = ForcedScalar->second; 7031 if (InstSet.count(I)) 7032 return VectorizationCostTy( 7033 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7034 VF.getKnownMinValue()), 7035 false); 7036 } 7037 7038 Type *VectorTy; 7039 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7040 7041 bool TypeNotScalarized = 7042 VF.isVector() && VectorTy->isVectorTy() && 7043 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7044 return VectorizationCostTy(C, TypeNotScalarized); 7045 } 7046 7047 InstructionCost 7048 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7049 ElementCount VF) { 7050 7051 if (VF.isScalable()) 7052 return InstructionCost::getInvalid(); 7053 7054 if (VF.isScalar()) 7055 return 0; 7056 7057 InstructionCost Cost = 0; 7058 Type *RetTy = ToVectorTy(I->getType(), VF); 7059 if (!RetTy->isVoidTy() && 7060 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7061 Cost += TTI.getScalarizationOverhead( 7062 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7063 true, false); 7064 7065 // Some targets keep addresses scalar. 7066 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7067 return Cost; 7068 7069 // Some targets support efficient element stores. 7070 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7071 return Cost; 7072 7073 // Collect operands to consider. 7074 CallInst *CI = dyn_cast<CallInst>(I); 7075 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7076 7077 // Skip operands that do not require extraction/scalarization and do not incur 7078 // any overhead. 
7079 SmallVector<Type *> Tys; 7080 for (auto *V : filterExtractingOperands(Ops, VF)) 7081 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7082 return Cost + TTI.getOperandsScalarizationOverhead( 7083 filterExtractingOperands(Ops, VF), Tys); 7084 } 7085 7086 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7087 if (VF.isScalar()) 7088 return; 7089 NumPredStores = 0; 7090 for (BasicBlock *BB : TheLoop->blocks()) { 7091 // For each instruction in the old loop. 7092 for (Instruction &I : *BB) { 7093 Value *Ptr = getLoadStorePointerOperand(&I); 7094 if (!Ptr) 7095 continue; 7096 7097 // TODO: We should generate better code and update the cost model for 7098 // predicated uniform stores. Today they are treated as any other 7099 // predicated store (see added test cases in 7100 // invariant-store-vectorization.ll). 7101 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7102 NumPredStores++; 7103 7104 if (Legal->isUniformMemOp(I)) { 7105 // TODO: Avoid replicating loads and stores instead of 7106 // relying on instcombine to remove them. 7107 // Load: Scalar load + broadcast 7108 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7109 InstructionCost Cost = getUniformMemOpCost(&I, VF); 7110 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7111 continue; 7112 } 7113 7114 // We assume that widening is the best solution when possible. 7115 if (memoryInstructionCanBeWidened(&I, VF)) { 7116 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7117 int ConsecutiveStride = 7118 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7119 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7120 "Expected consecutive stride."); 7121 InstWidening Decision = 7122 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7123 setWideningDecision(&I, VF, Decision, Cost); 7124 continue; 7125 } 7126 7127 // Choose between Interleaving, Gather/Scatter or Scalarization. 7128 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7129 unsigned NumAccesses = 1; 7130 if (isAccessInterleaved(&I)) { 7131 auto Group = getInterleavedAccessGroup(&I); 7132 assert(Group && "Fail to get an interleaved access group."); 7133 7134 // Make one decision for the whole group. 7135 if (getWideningDecision(&I, VF) != CM_Unknown) 7136 continue; 7137 7138 NumAccesses = Group->getNumMembers(); 7139 if (interleavedAccessCanBeWidened(&I, VF)) 7140 InterleaveCost = getInterleaveGroupCost(&I, VF); 7141 } 7142 7143 InstructionCost GatherScatterCost = 7144 isLegalGatherOrScatter(&I) 7145 ? getGatherScatterCost(&I, VF) * NumAccesses 7146 : InstructionCost::getInvalid(); 7147 7148 InstructionCost ScalarizationCost = 7149 !VF.isScalable() ? getMemInstScalarizationCost(&I, VF) * NumAccesses 7150 : InstructionCost::getInvalid(); 7151 7152 // Choose better solution for the current VF, 7153 // write down this decision and use it during vectorization. 7154 InstructionCost Cost; 7155 InstWidening Decision; 7156 if (InterleaveCost <= GatherScatterCost && 7157 InterleaveCost < ScalarizationCost) { 7158 Decision = CM_Interleave; 7159 Cost = InterleaveCost; 7160 } else if (GatherScatterCost < ScalarizationCost) { 7161 Decision = CM_GatherScatter; 7162 Cost = GatherScatterCost; 7163 } else { 7164 assert(!VF.isScalable() && 7165 "We cannot yet scalarise for scalable vectors"); 7166 Decision = CM_Scalarize; 7167 Cost = ScalarizationCost; 7168 } 7169 // If the instructions belongs to an interleave group, the whole group 7170 // receives the same decision. 
The whole group receives the cost, but 7171 // the cost will actually be assigned to one instruction. 7172 if (auto Group = getInterleavedAccessGroup(&I)) 7173 setWideningDecision(Group, VF, Decision, Cost); 7174 else 7175 setWideningDecision(&I, VF, Decision, Cost); 7176 } 7177 } 7178 7179 // Make sure that any load of address and any other address computation 7180 // remains scalar unless there is gather/scatter support. This avoids 7181 // inevitable extracts into address registers, and also has the benefit of 7182 // activating LSR more, since that pass can't optimize vectorized 7183 // addresses. 7184 if (TTI.prefersVectorizedAddressing()) 7185 return; 7186 7187 // Start with all scalar pointer uses. 7188 SmallPtrSet<Instruction *, 8> AddrDefs; 7189 for (BasicBlock *BB : TheLoop->blocks()) 7190 for (Instruction &I : *BB) { 7191 Instruction *PtrDef = 7192 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7193 if (PtrDef && TheLoop->contains(PtrDef) && 7194 getWideningDecision(&I, VF) != CM_GatherScatter) 7195 AddrDefs.insert(PtrDef); 7196 } 7197 7198 // Add all instructions used to generate the addresses. 7199 SmallVector<Instruction *, 4> Worklist; 7200 append_range(Worklist, AddrDefs); 7201 while (!Worklist.empty()) { 7202 Instruction *I = Worklist.pop_back_val(); 7203 for (auto &Op : I->operands()) 7204 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7205 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7206 AddrDefs.insert(InstOp).second) 7207 Worklist.push_back(InstOp); 7208 } 7209 7210 for (auto *I : AddrDefs) { 7211 if (isa<LoadInst>(I)) { 7212 // Setting the desired widening decision should ideally be handled in 7213 // by cost functions, but since this involves the task of finding out 7214 // if the loaded register is involved in an address computation, it is 7215 // instead changed here when we know this is the case. 7216 InstWidening Decision = getWideningDecision(I, VF); 7217 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7218 // Scalarize a widened load of address. 7219 setWideningDecision( 7220 I, VF, CM_Scalarize, 7221 (VF.getKnownMinValue() * 7222 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7223 else if (auto Group = getInterleavedAccessGroup(I)) { 7224 // Scalarize an interleave group of address loads. 7225 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7226 if (Instruction *Member = Group->getMember(I)) 7227 setWideningDecision( 7228 Member, VF, CM_Scalarize, 7229 (VF.getKnownMinValue() * 7230 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7231 } 7232 } 7233 } else 7234 // Make sure I gets scalarized and a cost estimate without 7235 // scalarization overhead. 7236 ForcedScalars[VF].insert(I); 7237 } 7238 } 7239 7240 InstructionCost 7241 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7242 Type *&VectorTy) { 7243 Type *RetTy = I->getType(); 7244 if (canTruncateToMinimalBitwidth(I, VF)) 7245 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7246 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 7247 auto SE = PSE.getSE(); 7248 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7249 7250 // TODO: We need to estimate the cost of intrinsic calls. 7251 switch (I->getOpcode()) { 7252 case Instruction::GetElementPtr: 7253 // We mark this instruction as zero-cost because the cost of GEPs in 7254 // vectorized code depends on whether the corresponding memory instruction 7255 // is scalarized or not. 
Therefore, we handle GEPs with the memory 7256 // instruction cost. 7257 return 0; 7258 case Instruction::Br: { 7259 // In cases of scalarized and predicated instructions, there will be VF 7260 // predicated blocks in the vectorized loop. Each branch around these 7261 // blocks requires also an extract of its vector compare i1 element. 7262 bool ScalarPredicatedBB = false; 7263 BranchInst *BI = cast<BranchInst>(I); 7264 if (VF.isVector() && BI->isConditional() && 7265 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7266 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7267 ScalarPredicatedBB = true; 7268 7269 if (ScalarPredicatedBB) { 7270 // Return cost for branches around scalarized and predicated blocks. 7271 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7272 auto *Vec_i1Ty = 7273 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7274 return (TTI.getScalarizationOverhead( 7275 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7276 false, true) + 7277 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7278 VF.getKnownMinValue())); 7279 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7280 // The back-edge branch will remain, as will all scalar branches. 7281 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7282 else 7283 // This branch will be eliminated by if-conversion. 7284 return 0; 7285 // Note: We currently assume zero cost for an unconditional branch inside 7286 // a predicated block since it will become a fall-through, although we 7287 // may decide in the future to call TTI for all branches. 7288 } 7289 case Instruction::PHI: { 7290 auto *Phi = cast<PHINode>(I); 7291 7292 // First-order recurrences are replaced by vector shuffles inside the loop. 7293 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7294 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7295 return TTI.getShuffleCost( 7296 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7297 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7298 7299 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7300 // converted into select instructions. We require N - 1 selects per phi 7301 // node, where N is the number of incoming values. 7302 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7303 return (Phi->getNumIncomingValues() - 1) * 7304 TTI.getCmpSelInstrCost( 7305 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7306 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7307 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7308 7309 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7310 } 7311 case Instruction::UDiv: 7312 case Instruction::SDiv: 7313 case Instruction::URem: 7314 case Instruction::SRem: 7315 // If we have a predicated instruction, it may not be executed for each 7316 // vector lane. Get the scalarization cost and scale this amount by the 7317 // probability of executing the predicated block. If the instruction is not 7318 // predicated, we fall through to the next case. 7319 if (VF.isVector() && isScalarWithPredication(I)) { 7320 InstructionCost Cost = 0; 7321 7322 // These instructions have a non-void type, so account for the phi nodes 7323 // that we will create. This cost is likely to be zero. The phi node 7324 // cost, if any, should be scaled by the block probability because it 7325 // models a copy at the end of each predicated block. 
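    // In other words, the estimate computed below is roughly
    //   (VF * (PHI cost + scalar div/rem cost) + scalarization overhead)
    //     / getReciprocalPredBlockProb()
    // i.e. all VF scalarized copies are costed and then scaled down by the
    // assumed probability that the predicated block actually executes.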
7326 Cost += VF.getKnownMinValue() * 7327 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7328 7329 // The cost of the non-predicated instruction. 7330 Cost += VF.getKnownMinValue() * 7331 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7332 7333 // The cost of insertelement and extractelement instructions needed for 7334 // scalarization. 7335 Cost += getScalarizationOverhead(I, VF); 7336 7337 // Scale the cost by the probability of executing the predicated blocks. 7338 // This assumes the predicated block for each vector lane is equally 7339 // likely. 7340 return Cost / getReciprocalPredBlockProb(); 7341 } 7342 LLVM_FALLTHROUGH; 7343 case Instruction::Add: 7344 case Instruction::FAdd: 7345 case Instruction::Sub: 7346 case Instruction::FSub: 7347 case Instruction::Mul: 7348 case Instruction::FMul: 7349 case Instruction::FDiv: 7350 case Instruction::FRem: 7351 case Instruction::Shl: 7352 case Instruction::LShr: 7353 case Instruction::AShr: 7354 case Instruction::And: 7355 case Instruction::Or: 7356 case Instruction::Xor: { 7357 // Since we will replace the stride by 1 the multiplication should go away. 7358 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7359 return 0; 7360 7361 // Detect reduction patterns 7362 InstructionCost RedCost; 7363 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7364 .isValid()) 7365 return RedCost; 7366 7367 // Certain instructions can be cheaper to vectorize if they have a constant 7368 // second vector operand. One example of this are shifts on x86. 7369 Value *Op2 = I->getOperand(1); 7370 TargetTransformInfo::OperandValueProperties Op2VP; 7371 TargetTransformInfo::OperandValueKind Op2VK = 7372 TTI.getOperandInfo(Op2, Op2VP); 7373 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7374 Op2VK = TargetTransformInfo::OK_UniformValue; 7375 7376 SmallVector<const Value *, 4> Operands(I->operand_values()); 7377 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7378 return N * TTI.getArithmeticInstrCost( 7379 I->getOpcode(), VectorTy, CostKind, 7380 TargetTransformInfo::OK_AnyValue, 7381 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7382 } 7383 case Instruction::FNeg: { 7384 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7385 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7386 return N * TTI.getArithmeticInstrCost( 7387 I->getOpcode(), VectorTy, CostKind, 7388 TargetTransformInfo::OK_AnyValue, 7389 TargetTransformInfo::OK_AnyValue, 7390 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7391 I->getOperand(0), I); 7392 } 7393 case Instruction::Select: { 7394 SelectInst *SI = cast<SelectInst>(I); 7395 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7396 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7397 Type *CondTy = SI->getCondition()->getType(); 7398 if (!ScalarCond) 7399 CondTy = VectorType::get(CondTy, VF); 7400 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7401 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7402 } 7403 case Instruction::ICmp: 7404 case Instruction::FCmp: { 7405 Type *ValTy = I->getOperand(0)->getType(); 7406 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7407 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7408 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7409 VectorTy = ToVectorTy(ValTy, VF); 7410 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7411 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7412 } 7413 case Instruction::Store: 7414 case Instruction::Load: { 7415 ElementCount Width = VF; 7416 if (Width.isVector()) { 7417 InstWidening Decision = getWideningDecision(I, Width); 7418 assert(Decision != CM_Unknown && 7419 "CM decision should be taken at this point"); 7420 if (Decision == CM_Scalarize) 7421 Width = ElementCount::getFixed(1); 7422 } 7423 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7424 return getMemoryInstructionCost(I, VF); 7425 } 7426 case Instruction::ZExt: 7427 case Instruction::SExt: 7428 case Instruction::FPToUI: 7429 case Instruction::FPToSI: 7430 case Instruction::FPExt: 7431 case Instruction::PtrToInt: 7432 case Instruction::IntToPtr: 7433 case Instruction::SIToFP: 7434 case Instruction::UIToFP: 7435 case Instruction::Trunc: 7436 case Instruction::FPTrunc: 7437 case Instruction::BitCast: { 7438 // Computes the CastContextHint from a Load/Store instruction. 7439 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7440 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7441 "Expected a load or a store!"); 7442 7443 if (VF.isScalar() || !TheLoop->contains(I)) 7444 return TTI::CastContextHint::Normal; 7445 7446 switch (getWideningDecision(I, VF)) { 7447 case LoopVectorizationCostModel::CM_GatherScatter: 7448 return TTI::CastContextHint::GatherScatter; 7449 case LoopVectorizationCostModel::CM_Interleave: 7450 return TTI::CastContextHint::Interleave; 7451 case LoopVectorizationCostModel::CM_Scalarize: 7452 case LoopVectorizationCostModel::CM_Widen: 7453 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7454 : TTI::CastContextHint::Normal; 7455 case LoopVectorizationCostModel::CM_Widen_Reverse: 7456 return TTI::CastContextHint::Reversed; 7457 case LoopVectorizationCostModel::CM_Unknown: 7458 llvm_unreachable("Instr did not go through cost modelling?"); 7459 } 7460 7461 llvm_unreachable("Unhandled case!"); 7462 }; 7463 7464 unsigned Opcode = I->getOpcode(); 7465 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7466 // For Trunc, the context is the only user, which must be a StoreInst. 
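    // For example, a trunc whose single user is a store that was widened as a
    // reversed access is costed with CastContextHint::Reversed.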
7467 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7468 if (I->hasOneUse()) 7469 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7470 CCH = ComputeCCH(Store); 7471 } 7472 // For Z/Sext, the context is the operand, which must be a LoadInst. 7473 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7474 Opcode == Instruction::FPExt) { 7475 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7476 CCH = ComputeCCH(Load); 7477 } 7478 7479 // We optimize the truncation of induction variables having constant 7480 // integer steps. The cost of these truncations is the same as the scalar 7481 // operation. 7482 if (isOptimizableIVTruncate(I, VF)) { 7483 auto *Trunc = cast<TruncInst>(I); 7484 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7485 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7486 } 7487 7488 // Detect reduction patterns 7489 InstructionCost RedCost; 7490 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7491 .isValid()) 7492 return RedCost; 7493 7494 Type *SrcScalarTy = I->getOperand(0)->getType(); 7495 Type *SrcVecTy = 7496 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7497 if (canTruncateToMinimalBitwidth(I, VF)) { 7498 // This cast is going to be shrunk. This may remove the cast or it might 7499 // turn it into slightly different cast. For example, if MinBW == 16, 7500 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7501 // 7502 // Calculate the modified src and dest types. 7503 Type *MinVecTy = VectorTy; 7504 if (Opcode == Instruction::Trunc) { 7505 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7506 VectorTy = 7507 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7508 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7509 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7510 VectorTy = 7511 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7512 } 7513 } 7514 7515 unsigned N; 7516 if (isScalarAfterVectorization(I, VF)) { 7517 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7518 N = VF.getKnownMinValue(); 7519 } else 7520 N = 1; 7521 return N * 7522 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7523 } 7524 case Instruction::Call: { 7525 bool NeedToScalarize; 7526 CallInst *CI = cast<CallInst>(I); 7527 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7528 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7529 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7530 return std::min(CallCost, IntrinsicCost); 7531 } 7532 return CallCost; 7533 } 7534 case Instruction::ExtractValue: 7535 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7536 default: 7537 // The cost of executing VF copies of the scalar instruction. This opcode 7538 // is unknown. Assume that it is the same as 'mul'. 7539 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7540 Instruction::Mul, VectorTy, CostKind) + 7541 getScalarizationOverhead(I, VF); 7542 } // end of switch. 
7543 } 7544 7545 char LoopVectorize::ID = 0; 7546 7547 static const char lv_name[] = "Loop Vectorization"; 7548 7549 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7550 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7551 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7552 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7553 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7554 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7555 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7556 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7557 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7558 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7559 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7560 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7561 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7562 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7563 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7564 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7565 7566 namespace llvm { 7567 7568 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7569 7570 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7571 bool VectorizeOnlyWhenForced) { 7572 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7573 } 7574 7575 } // end namespace llvm 7576 7577 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7578 // Check if the pointer operand of a load or store instruction is 7579 // consecutive. 7580 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7581 return Legal->isConsecutivePtr(Ptr); 7582 return false; 7583 } 7584 7585 void LoopVectorizationCostModel::collectValuesToIgnore() { 7586 // Ignore ephemeral values. 7587 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7588 7589 // Ignore type-promoting instructions we identified during reduction 7590 // detection. 7591 for (auto &Reduction : Legal->getReductionVars()) { 7592 RecurrenceDescriptor &RedDes = Reduction.second; 7593 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7594 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7595 } 7596 // Ignore type-casting instructions we identified during induction 7597 // detection. 7598 for (auto &Induction : Legal->getInductionVars()) { 7599 InductionDescriptor &IndDes = Induction.second; 7600 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7601 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7602 } 7603 } 7604 7605 void LoopVectorizationCostModel::collectInLoopReductions() { 7606 for (auto &Reduction : Legal->getReductionVars()) { 7607 PHINode *Phi = Reduction.first; 7608 RecurrenceDescriptor &RdxDesc = Reduction.second; 7609 7610 // We don't collect reductions that are type promoted (yet). 7611 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7612 continue; 7613 7614 // If the target would prefer this reduction to happen "in-loop", then we 7615 // want to record it as such. 7616 unsigned Opcode = RdxDesc.getOpcode(); 7617 if (!PreferInLoopReductions && 7618 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7619 TargetTransformInfo::ReductionFlags())) 7620 continue; 7621 7622 // Check that we can correctly put the reductions into the loop, by 7623 // finding the chain of operations that leads from the phi to the loop 7624 // exit value. 
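    // If no such chain can be found, getReductionOpChain returns an empty
    // vector and the reduction is left as an ordinary out-of-loop reduction.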
7625 SmallVector<Instruction *, 4> ReductionOperations = 7626 RdxDesc.getReductionOpChain(Phi, TheLoop); 7627 bool InLoop = !ReductionOperations.empty(); 7628 if (InLoop) { 7629 InLoopReductionChains[Phi] = ReductionOperations; 7630 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7631 Instruction *LastChain = Phi; 7632 for (auto *I : ReductionOperations) { 7633 InLoopReductionImmediateChains[I] = LastChain; 7634 LastChain = I; 7635 } 7636 } 7637 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7638 << " reduction for phi: " << *Phi << "\n"); 7639 } 7640 } 7641 7642 // TODO: we could return a pair of values that specify the max VF and 7643 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7644 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7645 // doesn't have a cost model that can choose which plan to execute if 7646 // more than one is generated. 7647 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7648 LoopVectorizationCostModel &CM) { 7649 unsigned WidestType; 7650 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7651 return WidestVectorRegBits / WidestType; 7652 } 7653 7654 VectorizationFactor 7655 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7656 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7657 ElementCount VF = UserVF; 7658 // Outer loop handling: They may require CFG and instruction level 7659 // transformations before even evaluating whether vectorization is profitable. 7660 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7661 // the vectorization pipeline. 7662 if (!OrigLoop->isInnermost()) { 7663 // If the user doesn't provide a vectorization factor, determine a 7664 // reasonable one. 7665 if (UserVF.isZero()) { 7666 VF = ElementCount::getFixed( 7667 determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); 7668 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7669 7670 // Make sure we have a VF > 1 for stress testing. 7671 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7672 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7673 << "overriding computed VF.\n"); 7674 VF = ElementCount::getFixed(4); 7675 } 7676 } 7677 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7678 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7679 "VF needs to be a power of two"); 7680 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7681 << "VF " << VF << " to build VPlans.\n"); 7682 buildVPlans(VF, VF); 7683 7684 // For VPlan build stress testing, we bail out after VPlan construction. 7685 if (VPlanBuildStressTest) 7686 return VectorizationFactor::Disabled(); 7687 7688 return {VF, 0 /*Cost*/}; 7689 } 7690 7691 LLVM_DEBUG( 7692 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7693 "VPlan-native path.\n"); 7694 return VectorizationFactor::Disabled(); 7695 } 7696 7697 Optional<VectorizationFactor> 7698 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7699 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7700 Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); 7701 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 7702 return None; 7703 7704 // Invalidate interleave groups if all blocks of loop will be predicated. 
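  // Folding the tail by masking means every interleaved access would have to
  // be emitted as a masked interleaved access; if the target cannot do that,
  // the groups (and all widening decisions based on them) are discarded.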
7705 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7706 !useMaskedInterleavedAccesses(*TTI)) { 7707 LLVM_DEBUG( 7708 dbgs() 7709 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7710 "which requires masked-interleaved support.\n"); 7711 if (CM.InterleaveInfo.invalidateGroups()) 7712 // Invalidating interleave groups also requires invalidating all decisions 7713 // based on them, which includes widening decisions and uniform and scalar 7714 // values. 7715 CM.invalidateCostModelingDecisions(); 7716 } 7717 7718 ElementCount MaxVF = MaybeMaxVF.getValue(); 7719 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7720 7721 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); 7722 if (!UserVF.isZero() && 7723 (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { 7724 // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable 7725 // VFs here, this should be reverted to only use legal UserVFs once the 7726 // loop below supports scalable VFs. 7727 ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; 7728 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") 7729 << " VF " << VF << ".\n"); 7730 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7731 "VF needs to be a power of two"); 7732 // Collect the instructions (and their associated costs) that will be more 7733 // profitable to scalarize. 7734 CM.selectUserVectorizationFactor(VF); 7735 CM.collectInLoopReductions(); 7736 buildVPlansWithVPRecipes(VF, VF); 7737 LLVM_DEBUG(printPlans(dbgs())); 7738 return {{VF, 0}}; 7739 } 7740 7741 assert(!MaxVF.isScalable() && 7742 "Scalable vectors not yet supported beyond this point"); 7743 7744 for (ElementCount VF = ElementCount::getFixed(1); 7745 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7746 // Collect Uniform and Scalar instructions after vectorization with VF. 7747 CM.collectUniformsAndScalars(VF); 7748 7749 // Collect the instructions (and their associated costs) that will be more 7750 // profitable to scalarize. 7751 if (VF.isVector()) 7752 CM.collectInstsToScalarize(VF); 7753 } 7754 7755 CM.collectInLoopReductions(); 7756 7757 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7758 LLVM_DEBUG(printPlans(dbgs())); 7759 if (MaxVF.isScalar()) 7760 return VectorizationFactor::Disabled(); 7761 7762 // Select the optimal vectorization factor. 7763 return CM.selectVectorizationFactor(MaxVF); 7764 } 7765 7766 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7767 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7768 << '\n'); 7769 BestVF = VF; 7770 BestUF = UF; 7771 7772 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7773 return !Plan->hasVF(VF); 7774 }); 7775 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7776 } 7777 7778 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7779 DominatorTree *DT) { 7780 // Perform the actual loop transformation. 7781 7782 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 
7783   assert(BestVF.hasValue() && "Vectorization Factor is missing");
7784   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7785
7786   VPTransformState State{
7787       *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()};
7788   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7789   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7790   State.CanonicalIV = ILV.Induction;
7791
7792   ILV.printDebugTracesAtStart();
7793
7794   //===------------------------------------------------===//
7795   //
7796   // Notice: any optimization or new instruction that goes
7797   // into the code below should also be implemented in
7798   // the cost-model.
7799   //
7800   //===------------------------------------------------===//
7801
7802   // 2. Copy and widen instructions from the old loop into the new loop.
7803   VPlans.front()->execute(&State);
7804
7805   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7806   // predication, updating analyses.
7807   ILV.fixVectorizedLoop(State);
7808
7809   ILV.printDebugTracesAtEnd();
7810 }
7811
7812 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7813     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7814
7815   // We create new control-flow for the vectorized loop, so the original exit
7816   // condition will be dead after vectorization if it's only used by the
7817   // terminator.
7818   SmallVector<BasicBlock*> ExitingBlocks;
7819   OrigLoop->getExitingBlocks(ExitingBlocks);
7820   for (auto *BB : ExitingBlocks) {
7821     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7822     if (!Cmp || !Cmp->hasOneUse())
7823       continue;
7824
7825     // TODO: we should introduce a getUniqueExitingBlocks on Loop
7826     if (!DeadInstructions.insert(Cmp).second)
7827       continue;
7828
7829     // The operand of the icmp is often a dead trunc, used by IndUpdate.
7830     // TODO: can recurse through operands in general
7831     for (Value *Op : Cmp->operands()) {
7832       if (isa<TruncInst>(Op) && Op->hasOneUse())
7833         DeadInstructions.insert(cast<Instruction>(Op));
7834     }
7835   }
7836
7837   // We create new "steps" for induction variable updates to which the original
7838   // induction variables map. An original update instruction will be dead if
7839   // all its users except the induction variable are dead.
7840   auto *Latch = OrigLoop->getLoopLatch();
7841   for (auto &Induction : Legal->getInductionVars()) {
7842     PHINode *Ind = Induction.first;
7843     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7844
7845     // If the tail is to be folded by masking, the primary induction variable,
7846     // if it exists, isn't dead: it will be used for masking. Don't kill it.
7847     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7848       continue;
7849
7850     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7851           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7852         }))
7853       DeadInstructions.insert(IndUpdate);
7854
7855     // We also record as "Dead" the type-casting instructions we had identified
7856     // during induction analysis. We don't need any handling for them in the
7857     // vectorized loop because we have proven that, under a proper runtime
7858     // test guarding the vectorized loop, the value of the phi, and the casted
7859     // value of the phi, are the same. The last instruction in this casting chain
7860     // will get its scalar/vector/widened def from the scalar/vector/widened def
7861     // of the respective phi node.
Any other casts in the induction def-use chain 7862 // have no other uses outside the phi update chain, and will be ignored. 7863 InductionDescriptor &IndDes = Induction.second; 7864 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7865 DeadInstructions.insert(Casts.begin(), Casts.end()); 7866 } 7867 } 7868 7869 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7870 7871 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7872 7873 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7874 Instruction::BinaryOps BinOp) { 7875 // When unrolling and the VF is 1, we only need to add a simple scalar. 7876 Type *Ty = Val->getType(); 7877 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7878 7879 if (Ty->isFloatingPointTy()) { 7880 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7881 7882 // Floating-point operations inherit FMF via the builder's flags. 7883 Value *MulOp = Builder.CreateFMul(C, Step); 7884 return Builder.CreateBinOp(BinOp, Val, MulOp); 7885 } 7886 Constant *C = ConstantInt::get(Ty, StartIdx); 7887 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7888 } 7889 7890 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7891 SmallVector<Metadata *, 4> MDs; 7892 // Reserve first location for self reference to the LoopID metadata node. 7893 MDs.push_back(nullptr); 7894 bool IsUnrollMetadata = false; 7895 MDNode *LoopID = L->getLoopID(); 7896 if (LoopID) { 7897 // First find existing loop unrolling disable metadata. 7898 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7899 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7900 if (MD) { 7901 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7902 IsUnrollMetadata = 7903 S && S->getString().startswith("llvm.loop.unroll.disable"); 7904 } 7905 MDs.push_back(LoopID->getOperand(i)); 7906 } 7907 } 7908 7909 if (!IsUnrollMetadata) { 7910 // Add runtime unroll disable metadata. 7911 LLVMContext &Context = L->getHeader()->getContext(); 7912 SmallVector<Metadata *, 1> DisableOperands; 7913 DisableOperands.push_back( 7914 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7915 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7916 MDs.push_back(DisableNode); 7917 MDNode *NewLoopID = MDNode::get(Context, MDs); 7918 // Set operand 0 to refer to the loop id itself. 7919 NewLoopID->replaceOperandWith(0, NewLoopID); 7920 L->setLoopID(NewLoopID); 7921 } 7922 } 7923 7924 //===--------------------------------------------------------------------===// 7925 // EpilogueVectorizerMainLoop 7926 //===--------------------------------------------------------------------===// 7927 7928 /// This function is partially responsible for generating the control flow 7929 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7930 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7931 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7932 Loop *Lp = createVectorLoopSkeleton(""); 7933 7934 // Generate the code to check the minimum iteration count of the vector 7935 // epilogue (see below). 7936 EPI.EpilogueIterationCountCheck = 7937 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 7938 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7939 7940 // Generate the code to check any assumptions that we've made for SCEV 7941 // expressions. 
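  // Both this check and the memory check below branch to the scalar
  // preheader when they fail, bypassing the main and epilogue vector loops.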
7942 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 7943 7944 // Generate the code that checks at runtime if arrays overlap. We put the 7945 // checks into a separate block to make the more common case of few elements 7946 // faster. 7947 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 7948 7949 // Generate the iteration count check for the main loop, *after* the check 7950 // for the epilogue loop, so that the path-length is shorter for the case 7951 // that goes directly through the vector epilogue. The longer-path length for 7952 // the main loop is compensated for, by the gain from vectorizing the larger 7953 // trip count. Note: the branch will get updated later on when we vectorize 7954 // the epilogue. 7955 EPI.MainLoopIterationCountCheck = 7956 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 7957 7958 // Generate the induction variable. 7959 OldInduction = Legal->getPrimaryInduction(); 7960 Type *IdxTy = Legal->getWidestInductionType(); 7961 Value *StartIdx = ConstantInt::get(IdxTy, 0); 7962 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7963 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7964 EPI.VectorTripCount = CountRoundDown; 7965 Induction = 7966 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7967 getDebugLocFromInstOrOperands(OldInduction)); 7968 7969 // Skip induction resume value creation here because they will be created in 7970 // the second pass. If we created them here, they wouldn't be used anyway, 7971 // because the vplan in the second pass still contains the inductions from the 7972 // original loop. 7973 7974 return completeLoopSkeleton(Lp, OrigLoopID); 7975 } 7976 7977 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7978 LLVM_DEBUG({ 7979 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7980 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7981 << ", Main Loop UF:" << EPI.MainLoopUF 7982 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7983 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7984 }); 7985 } 7986 7987 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7988 DEBUG_WITH_TYPE(VerboseDebug, { 7989 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 7990 }); 7991 } 7992 7993 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 7994 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 7995 assert(L && "Expected valid Loop."); 7996 assert(Bypass && "Expected valid bypass basic block."); 7997 unsigned VFactor = 7998 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 7999 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8000 Value *Count = getOrCreateTripCount(L); 8001 // Reuse existing vector loop preheader for TC checks. 8002 // Note that new preheader block is generated for vector loop. 8003 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8004 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8005 8006 // Generate code to check if the loop's trip count is less than VF * UF of the 8007 // main vector loop. 8008 auto P = 8009 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8010 8011 Value *CheckMinIters = Builder.CreateICmp( 8012 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 8013 "min.iters.check"); 8014 8015 if (!ForEpilogue) 8016 TCCheckBlock->setName("vector.main.loop.iter.check"); 8017 8018 // Create new preheader for vector loop. 
8019 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8020 DT, LI, nullptr, "vector.ph"); 8021 8022 if (ForEpilogue) { 8023 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8024 DT->getNode(Bypass)->getIDom()) && 8025 "TC check is expected to dominate Bypass"); 8026 8027 // Update dominator for Bypass & LoopExit. 8028 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8029 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8030 8031 LoopBypassBlocks.push_back(TCCheckBlock); 8032 8033 // Save the trip count so we don't have to regenerate it in the 8034 // vec.epilog.iter.check. This is safe to do because the trip count 8035 // generated here dominates the vector epilog iter check. 8036 EPI.TripCount = Count; 8037 } 8038 8039 ReplaceInstWithInst( 8040 TCCheckBlock->getTerminator(), 8041 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8042 8043 return TCCheckBlock; 8044 } 8045 8046 //===--------------------------------------------------------------------===// 8047 // EpilogueVectorizerEpilogueLoop 8048 //===--------------------------------------------------------------------===// 8049 8050 /// This function is partially responsible for generating the control flow 8051 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8052 BasicBlock * 8053 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8054 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8055 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8056 8057 // Now, compare the remaining count and if there aren't enough iterations to 8058 // execute the vectorized epilogue skip to the scalar part. 8059 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8060 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8061 LoopVectorPreHeader = 8062 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8063 LI, nullptr, "vec.epilog.ph"); 8064 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8065 VecEpilogueIterationCountCheck); 8066 8067 // Adjust the control flow taking the state info from the main loop 8068 // vectorization into account. 8069 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8070 "expected this to be saved from the previous pass."); 8071 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8072 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8073 8074 DT->changeImmediateDominator(LoopVectorPreHeader, 8075 EPI.MainLoopIterationCountCheck); 8076 8077 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8078 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8079 8080 if (EPI.SCEVSafetyCheck) 8081 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8082 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8083 if (EPI.MemSafetyCheck) 8084 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8085 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8086 8087 DT->changeImmediateDominator( 8088 VecEpilogueIterationCountCheck, 8089 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8090 8091 DT->changeImmediateDominator(LoopScalarPreHeader, 8092 EPI.EpilogueIterationCountCheck); 8093 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 8094 8095 // Keep track of bypass blocks, as they feed start values to the induction 8096 // phis in the scalar loop preheader. 
8097   if (EPI.SCEVSafetyCheck)
8098     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8099   if (EPI.MemSafetyCheck)
8100     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8101   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8102
8103   // Generate a resume induction for the vector epilogue and put it in the
8104   // vector epilogue preheader.
8105   Type *IdxTy = Legal->getWidestInductionType();
8106   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8107                                          LoopVectorPreHeader->getFirstNonPHI());
8108   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8109   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8110                            EPI.MainLoopIterationCountCheck);
8111
8112   // Generate the induction variable.
8113   OldInduction = Legal->getPrimaryInduction();
8114   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8115   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8116   Value *StartIdx = EPResumeVal;
8117   Induction =
8118       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8119                               getDebugLocFromInstOrOperands(OldInduction));
8120
8121   // Generate induction resume values. These variables save the new starting
8122   // indexes for the scalar loop. They are used to test if there are any tail
8123   // iterations left once the vector loop has completed.
8124   // Note that when the vectorized epilogue is skipped due to the iteration
8125   // count check, the resume value for the induction variable comes from
8126   // the trip count of the main vector loop, hence passing the AdditionalBypass
8127   // argument.
8128   createInductionResumeValues(Lp, CountRoundDown,
8129                               {VecEpilogueIterationCountCheck,
8130                                EPI.VectorTripCount} /* AdditionalBypass */);
8131
8132   AddRuntimeUnrollDisableMetaData(Lp);
8133   return completeLoopSkeleton(Lp, OrigLoopID);
8134 }
8135
8136 BasicBlock *
8137 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8138     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8139
8140   assert(EPI.TripCount &&
8141          "Expected trip count to have been saved in the first pass.");
8142   assert(
8143       (!isa<Instruction>(EPI.TripCount) ||
8144        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8145       "saved trip count does not dominate insertion point.");
8146   Value *TC = EPI.TripCount;
8147   IRBuilder<> Builder(Insert->getTerminator());
8148   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8149
8150   // Generate code to check if the loop's trip count is less than VF * UF of the
8151   // vector epilogue loop.
8152   auto P =
8153       Cost->requiresScalarEpilogue() ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8154 8155 Value *CheckMinIters = Builder.CreateICmp( 8156 P, Count, 8157 ConstantInt::get(Count->getType(), 8158 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8159 "min.epilog.iters.check"); 8160 8161 ReplaceInstWithInst( 8162 Insert->getTerminator(), 8163 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8164 8165 LoopBypassBlocks.push_back(Insert); 8166 return Insert; 8167 } 8168 8169 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8170 LLVM_DEBUG({ 8171 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8172 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8173 << ", Main Loop UF:" << EPI.MainLoopUF 8174 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8175 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8176 }); 8177 } 8178 8179 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8180 DEBUG_WITH_TYPE(VerboseDebug, { 8181 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8182 }); 8183 } 8184 8185 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8186 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8187 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8188 bool PredicateAtRangeStart = Predicate(Range.Start); 8189 8190 for (ElementCount TmpVF = Range.Start * 2; 8191 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8192 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8193 Range.End = TmpVF; 8194 break; 8195 } 8196 8197 return PredicateAtRangeStart; 8198 } 8199 8200 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8201 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8202 /// of VF's starting at a given VF and extending it as much as possible. Each 8203 /// vectorization decision can potentially shorten this sub-range during 8204 /// buildVPlan(). 8205 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8206 ElementCount MaxVF) { 8207 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8208 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8209 VFRange SubRange = {VF, MaxVFPlusOne}; 8210 VPlans.push_back(buildVPlan(SubRange)); 8211 VF = SubRange.End; 8212 } 8213 } 8214 8215 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8216 VPlanPtr &Plan) { 8217 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8218 8219 // Look for cached value. 8220 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8221 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8222 if (ECEntryIt != EdgeMaskCache.end()) 8223 return ECEntryIt->second; 8224 8225 VPValue *SrcMask = createBlockInMask(Src, Plan); 8226 8227 // The terminator has to be a branch inst! 8228 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8229 assert(BI && "Unexpected terminator found"); 8230 8231 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8232 return EdgeMaskCache[Edge] = SrcMask; 8233 8234 // If source is an exiting block, we know the exit edge is dynamically dead 8235 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8236 // adding uses of an otherwise potentially dead instruction. 
8237 if (OrigLoop->isLoopExiting(Src)) 8238 return EdgeMaskCache[Edge] = SrcMask; 8239 8240 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8241 assert(EdgeMask && "No Edge Mask found for condition"); 8242 8243 if (BI->getSuccessor(0) != Dst) 8244 EdgeMask = Builder.createNot(EdgeMask); 8245 8246 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8247 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8248 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8249 // The select version does not introduce new UB if SrcMask is false and 8250 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8251 VPValue *False = Plan->getOrAddVPValue( 8252 ConstantInt::getFalse(BI->getCondition()->getType())); 8253 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8254 } 8255 8256 return EdgeMaskCache[Edge] = EdgeMask; 8257 } 8258 8259 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8260 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8261 8262 // Look for cached value. 8263 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8264 if (BCEntryIt != BlockMaskCache.end()) 8265 return BCEntryIt->second; 8266 8267 // All-one mask is modelled as no-mask following the convention for masked 8268 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8269 VPValue *BlockMask = nullptr; 8270 8271 if (OrigLoop->getHeader() == BB) { 8272 if (!CM.blockNeedsPredication(BB)) 8273 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8274 8275 // Create the block in mask as the first non-phi instruction in the block. 8276 VPBuilder::InsertPointGuard Guard(Builder); 8277 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8278 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8279 8280 // Introduce the early-exit compare IV <= BTC to form header block mask. 8281 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8282 // Start by constructing the desired canonical IV. 8283 VPValue *IV = nullptr; 8284 if (Legal->getPrimaryInduction()) 8285 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8286 else { 8287 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8288 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8289 IV = IVRecipe->getVPValue(); 8290 } 8291 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8292 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8293 8294 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8295 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8296 // as a second argument, we only pass the IV here and extract the 8297 // tripcount from the transform state where codegen of the VP instructions 8298 // happen. 8299 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8300 } else { 8301 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8302 } 8303 return BlockMaskCache[BB] = BlockMask; 8304 } 8305 8306 // This is the block mask. We OR all incoming edges. 8307 for (auto *Predecessor : predecessors(BB)) { 8308 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8309 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8310 return BlockMaskCache[BB] = EdgeMask; 8311 8312 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8313 BlockMask = EdgeMask; 8314 continue; 8315 } 8316 8317 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8318 } 8319 8320 return BlockMaskCache[BB] = BlockMask; 8321 } 8322 8323 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 8324 VPlanPtr &Plan) { 8325 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8326 "Must be called with either a load or store"); 8327 8328 auto willWiden = [&](ElementCount VF) -> bool { 8329 if (VF.isScalar()) 8330 return false; 8331 LoopVectorizationCostModel::InstWidening Decision = 8332 CM.getWideningDecision(I, VF); 8333 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8334 "CM decision should be taken at this point."); 8335 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8336 return true; 8337 if (CM.isScalarAfterVectorization(I, VF) || 8338 CM.isProfitableToScalarize(I, VF)) 8339 return false; 8340 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8341 }; 8342 8343 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8344 return nullptr; 8345 8346 VPValue *Mask = nullptr; 8347 if (Legal->isMaskRequired(I)) 8348 Mask = createBlockInMask(I->getParent(), Plan); 8349 8350 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 8351 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8352 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 8353 8354 StoreInst *Store = cast<StoreInst>(I); 8355 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 8356 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 8357 } 8358 8359 VPWidenIntOrFpInductionRecipe * 8360 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { 8361 // Check if this is an integer or fp induction. If so, build the recipe that 8362 // produces its scalar and vector values. 8363 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8364 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8365 II.getKind() == InductionDescriptor::IK_FpInduction) { 8366 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8367 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8368 return new VPWidenIntOrFpInductionRecipe( 8369 Phi, Start, Casts.empty() ? nullptr : Casts.front()); 8370 } 8371 8372 return nullptr; 8373 } 8374 8375 VPWidenIntOrFpInductionRecipe * 8376 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, 8377 VPlan &Plan) const { 8378 // Optimize the special case where the source is a constant integer 8379 // induction variable. Notice that we can only optimize the 'trunc' case 8380 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8381 // (c) other casts depend on pointer size. 8382 8383 // Determine whether \p K is a truncation based on an induction variable that 8384 // can be optimized. 
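  // A minimal sketch of the case being optimized (hypothetical IR): for a
  // canonical i64 induction that is only consumed through a truncation,
  //
  //   %iv    = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
  //   %trunc = trunc i64 %iv to i32
  //
  // the truncation can be folded into the induction itself and widened
  // directly as an i32 induction (trunc of start, trunc of step), which is
  // what the VPWidenIntOrFpInductionRecipe created below does.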
8385 auto isOptimizableIVTruncate = 8386 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8387 return [=](ElementCount VF) -> bool { 8388 return CM.isOptimizableIVTruncate(K, VF); 8389 }; 8390 }; 8391 8392 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8393 isOptimizableIVTruncate(I), Range)) { 8394 8395 InductionDescriptor II = 8396 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8397 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8398 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8399 Start, nullptr, I); 8400 } 8401 return nullptr; 8402 } 8403 8404 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 8405 // If all incoming values are equal, the incoming VPValue can be used directly 8406 // instead of creating a new VPBlendRecipe. 8407 Value *FirstIncoming = Phi->getIncomingValue(0); 8408 if (all_of(Phi->incoming_values(), [FirstIncoming](const Value *Inc) { 8409 return FirstIncoming == Inc; 8410 })) { 8411 return Plan->getOrAddVPValue(Phi->getIncomingValue(0)); 8412 } 8413 8414 // We know that all PHIs in non-header blocks are converted into selects, so 8415 // we don't have to worry about the insertion order and we can just use the 8416 // builder. At this point we generate the predication tree. There may be 8417 // duplications since this is a simple recursive scan, but future 8418 // optimizations will clean it up. 8419 SmallVector<VPValue *, 2> Operands; 8420 unsigned NumIncoming = Phi->getNumIncomingValues(); 8421 8422 for (unsigned In = 0; In < NumIncoming; In++) { 8423 VPValue *EdgeMask = 8424 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8425 assert((EdgeMask || NumIncoming == 1) && 8426 "Multiple predecessors with one having a full mask"); 8427 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 8428 if (EdgeMask) 8429 Operands.push_back(EdgeMask); 8430 } 8431 return toVPRecipeResult(new VPBlendRecipe(Phi, Operands)); 8432 } 8433 8434 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 8435 VPlan &Plan) const { 8436 8437 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8438 [this, CI](ElementCount VF) { 8439 return CM.isScalarWithPredication(CI, VF); 8440 }, 8441 Range); 8442 8443 if (IsPredicated) 8444 return nullptr; 8445 8446 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8447 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8448 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8449 ID == Intrinsic::pseudoprobe || 8450 ID == Intrinsic::experimental_noalias_scope_decl)) 8451 return nullptr; 8452 8453 auto willWiden = [&](ElementCount VF) -> bool { 8454 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8455 // The following case may be scalarized depending on the VF. 8456 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8457 // version of the instruction. 8458 // Is it beneficial to perform intrinsic call compared to lib call? 8459 bool NeedToScalarize = false; 8460 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8461 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8462 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8463 assert(IntrinsicCost.isValid() && CallCost.isValid() && 8464 "Cannot have invalid costs while widening"); 8465 return UseVectorIntrinsic || !NeedToScalarize; 8466 }; 8467 8468 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8469 return nullptr; 8470 8471 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 8472 } 8473 8474 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8475 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8476 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8477 // Instruction should be widened, unless it is scalar after vectorization, 8478 // scalarization is profitable or it is predicated. 8479 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8480 return CM.isScalarAfterVectorization(I, VF) || 8481 CM.isProfitableToScalarize(I, VF) || 8482 CM.isScalarWithPredication(I, VF); 8483 }; 8484 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8485 Range); 8486 } 8487 8488 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 8489 auto IsVectorizableOpcode = [](unsigned Opcode) { 8490 switch (Opcode) { 8491 case Instruction::Add: 8492 case Instruction::And: 8493 case Instruction::AShr: 8494 case Instruction::BitCast: 8495 case Instruction::FAdd: 8496 case Instruction::FCmp: 8497 case Instruction::FDiv: 8498 case Instruction::FMul: 8499 case Instruction::FNeg: 8500 case Instruction::FPExt: 8501 case Instruction::FPToSI: 8502 case Instruction::FPToUI: 8503 case Instruction::FPTrunc: 8504 case Instruction::FRem: 8505 case Instruction::FSub: 8506 case Instruction::ICmp: 8507 case Instruction::IntToPtr: 8508 case Instruction::LShr: 8509 case Instruction::Mul: 8510 case Instruction::Or: 8511 case Instruction::PtrToInt: 8512 case Instruction::SDiv: 8513 case Instruction::Select: 8514 case Instruction::SExt: 8515 case Instruction::Shl: 8516 case Instruction::SIToFP: 8517 case Instruction::SRem: 8518 case Instruction::Sub: 8519 case Instruction::Trunc: 8520 case Instruction::UDiv: 8521 case Instruction::UIToFP: 8522 case Instruction::URem: 8523 case Instruction::Xor: 8524 case Instruction::ZExt: 8525 return true; 8526 } 8527 return false; 8528 }; 8529 8530 if (!IsVectorizableOpcode(I->getOpcode())) 8531 return nullptr; 8532 8533 // Success: widen this instruction. 8534 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 8535 } 8536 8537 VPBasicBlock *VPRecipeBuilder::handleReplication( 8538 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8539 VPlanPtr &Plan) { 8540 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8541 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8542 Range); 8543 8544 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8545 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 8546 Range); 8547 8548 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8549 IsUniform, IsPredicated); 8550 setRecipe(I, Recipe); 8551 Plan->addVPValue(I, Recipe); 8552 8553 // Find if I uses a predicated instruction. If so, it will use its scalar 8554 // value. Avoid hoisting the insert-element which packs the scalar value into 8555 // a vector value, as that happens iff all users use the vector value. 
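  // A minimal sketch of the packing being avoided (hypothetical IR, VF = 4,
  // one scalar instance per predicated lane):
  //
  //   %pack.0 = insertelement <4 x i32> poison,  i32 %scalar.0, i32 0
  //   %pack.1 = insertelement <4 x i32> %pack.0, i32 %scalar.1, i32 1
  //   ...
  //
  // If the replicated I below consumes %scalar.N directly, the predicated
  // operand must not be asked to also produce this sequence, hence
  // setAlsoPack(false) on its replicate recipe.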
8556 for (VPValue *Op : Recipe->operands()) { 8557 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8558 if (!PredR) 8559 continue; 8560 auto *RepR = 8561 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8562 assert(RepR->isPredicated() && 8563 "expected Replicate recipe to be predicated"); 8564 RepR->setAlsoPack(false); 8565 } 8566 8567 // Finalize the recipe for Instr, first if it is not predicated. 8568 if (!IsPredicated) { 8569 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8570 VPBB->appendRecipe(Recipe); 8571 return VPBB; 8572 } 8573 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8574 assert(VPBB->getSuccessors().empty() && 8575 "VPBB has successors when handling predicated replication."); 8576 // Record predicated instructions for above packing optimizations. 8577 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8578 VPBlockUtils::insertBlockAfter(Region, VPBB); 8579 auto *RegSucc = new VPBasicBlock(); 8580 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8581 return RegSucc; 8582 } 8583 8584 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8585 VPRecipeBase *PredRecipe, 8586 VPlanPtr &Plan) { 8587 // Instructions marked for predication are replicated and placed under an 8588 // if-then construct to prevent side-effects. 8589 8590 // Generate recipes to compute the block mask for this region. 8591 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8592 8593 // Build the triangular if-then region. 8594 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8595 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8596 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8597 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8598 auto *PHIRecipe = Instr->getType()->isVoidTy() 8599 ? nullptr 8600 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8601 if (PHIRecipe) { 8602 Plan->removeVPValueFor(Instr); 8603 Plan->addVPValue(Instr, PHIRecipe); 8604 } 8605 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8606 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8607 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8608 8609 // Note: first set Entry as region entry and then connect successors starting 8610 // from it in order, to propagate the "parent" of each VPBasicBlock. 8611 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8612 VPBlockUtils::connectBlocks(Pred, Exit); 8613 8614 return Region; 8615 } 8616 8617 VPRecipeOrVPValueTy VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8618 VFRange &Range, 8619 VPlanPtr &Plan) { 8620 // First, check for specific widening recipes that deal with calls, memory 8621 // operations, inductions and Phi nodes. 
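  // As a rough, hypothetical example of the dispatch below for a body of the
  // form 'x = a[i]; y = x + 1; b[i] = y':
  //   - the load and store become VPWidenMemoryInstructionRecipes (possibly
  //     replaced later by a VPInterleaveRecipe),
  //   - the induction phi becomes a VPWidenIntOrFpInductionRecipe,
  //   - the add falls through to tryToWiden and becomes a VPWidenRecipe.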
8622   if (auto *CI = dyn_cast<CallInst>(Instr))
8623     return toVPRecipeResult(tryToWidenCall(CI, Range, *Plan));
8624
8625   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8626     return toVPRecipeResult(tryToWidenMemory(Instr, Range, Plan));
8627
8628   VPRecipeBase *Recipe;
8629   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8630     if (Phi->getParent() != OrigLoop->getHeader())
8631       return tryToBlend(Phi, Plan);
8632     if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan)))
8633       return toVPRecipeResult(Recipe);
8634
8635     if (Legal->isReductionVariable(Phi)) {
8636       RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8637       VPValue *StartV =
8638           Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue());
8639       return toVPRecipeResult(new VPWidenPHIRecipe(Phi, RdxDesc, *StartV));
8640     }
8641
8642     return toVPRecipeResult(new VPWidenPHIRecipe(Phi));
8643   }
8644
8645   if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8646                                     cast<TruncInst>(Instr), Range, *Plan)))
8647     return toVPRecipeResult(Recipe);
8648
8649   if (!shouldWiden(Instr, Range))
8650     return nullptr;
8651
8652   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8653     return toVPRecipeResult(new VPWidenGEPRecipe(
8654         GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop));
8655
8656   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8657     bool InvariantCond =
8658         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8659     return toVPRecipeResult(new VPWidenSelectRecipe(
8660         *SI, Plan->mapToVPValues(SI->operands()), InvariantCond));
8661   }
8662
8663   return toVPRecipeResult(tryToWiden(Instr, *Plan));
8664 }
8665
8666 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8667                                                         ElementCount MaxVF) {
8668   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8669
8670   // Collect instructions from the original loop that will become trivially dead
8671   // in the vectorized loop. We don't need to vectorize these instructions. For
8672   // example, original induction update instructions can become dead because we
8673   // separately emit induction "steps" when generating code for the new loop.
8674   // Similarly, we create a new latch condition when setting up the structure
8675   // of the new loop, so the old one can become dead.
8676   SmallPtrSet<Instruction *, 4> DeadInstructions;
8677   collectTriviallyDeadInstructions(DeadInstructions);
8678
8679   // Add assume instructions we need to drop to DeadInstructions, to prevent
8680   // them from being added to the VPlan.
8681   // TODO: We only need to drop assumes in blocks that get flattened. If the
8682   // control flow is preserved, we should keep them.
8683   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8684   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8685
8686   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8687   // Dead instructions do not need sinking. Remove them from SinkAfter.
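  // A hedged example of a typical SinkAfter entry (first-order recurrences;
  // the exact pairs are computed by LoopVectorizationLegality): for
  //
  //   for (...) { b[i] = prev + 1; prev = a[i]; }
  //
  // the user of the recurrence phi ('prev + 1') is mapped to the instruction
  // producing the next recurrence value (the load of a[i]), so that it can be
  // sunk after it when the recipes are laid out.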
8688   for (Instruction *I : DeadInstructions)
8689     SinkAfter.erase(I);
8690
8691   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8692   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8693     VFRange SubRange = {VF, MaxVFPlusOne};
8694     VPlans.push_back(
8695         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8696     VF = SubRange.End;
8697   }
8698 }
8699
8700 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8701     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8702     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8703
8704   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8705
8706   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8707
8708   // ---------------------------------------------------------------------------
8709   // Pre-construction: record ingredients whose recipes we'll need to further
8710   // process after constructing the initial VPlan.
8711   // ---------------------------------------------------------------------------
8712
8713   // Mark instructions we'll need to sink later and their targets as
8714   // ingredients whose recipe we'll need to record.
8715   for (auto &Entry : SinkAfter) {
8716     RecipeBuilder.recordRecipeOf(Entry.first);
8717     RecipeBuilder.recordRecipeOf(Entry.second);
8718   }
8719   for (auto &Reduction : CM.getInLoopReductionChains()) {
8720     PHINode *Phi = Reduction.first;
8721     RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
8722     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8723
8724     RecipeBuilder.recordRecipeOf(Phi);
8725     for (auto &R : ReductionOperations) {
8726       RecipeBuilder.recordRecipeOf(R);
8727       // For min/max reductions, where we have a pair of icmp/select, we also
8728       // need to record the ICmp recipe, so it can be removed later.
8729       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8730         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8731     }
8732   }
8733
8734   // For each interleave group which is relevant for this (possibly trimmed)
8735   // Range, add it to the set of groups to be later applied to the VPlan and add
8736   // placeholders for its members' Recipes which we'll be replacing with a
8737   // single VPInterleaveRecipe.
8738   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8739     auto applyIG = [IG, this](ElementCount VF) -> bool {
8740       return (VF.isVector() && // Query is illegal for VF == 1
8741               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8742                   LoopVectorizationCostModel::CM_Interleave);
8743     };
8744     if (!getDecisionAndClampRange(applyIG, Range))
8745       continue;
8746     InterleaveGroups.insert(IG);
8747     for (unsigned i = 0; i < IG->getFactor(); i++)
8748       if (Instruction *Member = IG->getMember(i))
8749         RecipeBuilder.recordRecipeOf(Member);
8750   }
8751
8752   // ---------------------------------------------------------------------------
8753   // Build initial VPlan: Scan the body of the loop in a topological order to
8754   // visit each basic block after having visited its predecessor basic blocks.
8755   // ---------------------------------------------------------------------------
8756
8757   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
8758   auto Plan = std::make_unique<VPlan>();
8759   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8760   Plan->setEntry(VPBB);
8761
8762   // Scan the body of the loop in a topological order to visit each basic block
8763   // after having visited its predecessor basic blocks.
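  // For instance (a hypothetical single-diamond body):
  //
  //          header
  //          /    \
  //     if.then    |
  //          \    /
  //          latch
  //
  // is visited as header, if.then, latch; reverse post-order guarantees that
  // latch is only reached after both of its predecessors have had their
  // recipes and (edge) masks created.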
8764   LoopBlocksDFS DFS(OrigLoop);
8765   DFS.perform(LI);
8766
8767   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8768     // Relevant instructions from basic block BB will be grouped into VPRecipe
8769     // ingredients and fill a new VPBasicBlock.
8770     unsigned VPBBsForBB = 0;
8771     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8772     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8773     VPBB = FirstVPBBForBB;
8774     Builder.setInsertPoint(VPBB);
8775
8776     // Introduce each ingredient into VPlan.
8777     // TODO: Model and preserve debug intrinsics in VPlan.
8778     for (Instruction &I : BB->instructionsWithoutDebug()) {
8779       Instruction *Instr = &I;
8780
8781       // First filter out irrelevant instructions, to ensure no recipes are
8782       // built for them.
8783       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8784         continue;
8785
8786       if (auto RecipeOrValue =
8787               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
8788         // If Instr can be simplified to an existing VPValue, use it.
8789         if (RecipeOrValue.is<VPValue *>()) {
8790           Plan->addVPValue(Instr, RecipeOrValue.get<VPValue *>());
8791           continue;
8792         }
8793         // Otherwise, add the new recipe.
8794         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8795         for (auto *Def : Recipe->definedValues()) {
8796           auto *UV = Def->getUnderlyingValue();
8797           Plan->addVPValue(UV, Def);
8798         }
8799
8800         RecipeBuilder.setRecipe(Instr, Recipe);
8801         VPBB->appendRecipe(Recipe);
8802         continue;
8803       }
8804
8805       // Otherwise, if all widening options failed, the instruction is to be
8806       // replicated. This may create a successor for VPBB.
8807       VPBasicBlock *NextVPBB =
8808           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8809       if (NextVPBB != VPBB) {
8810         VPBB = NextVPBB;
8811         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8812                                     : "");
8813       }
8814     }
8815   }
8816
8817   // Discard the empty dummy pre-entry VPBasicBlock. Note that other
8818   // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
8819   // original basic blocks with no recipes.
8820   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8821   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8822   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8823   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
8824   delete PreEntry;
8825
8826   // ---------------------------------------------------------------------------
8827   // Transform initial VPlan: Apply previously taken decisions, in order, to
8828   // bring the VPlan to its final state.
8829   // ---------------------------------------------------------------------------
8830
8831   // Apply Sink-After legal constraints.
8832   for (auto &Entry : SinkAfter) {
8833     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8834     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8835     // If the target is in a replication region, make sure to move Sink to the
8836     // block after it, not into the replication region itself.
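    // Sketch of the layout being handled (block names follow the
    // createReplicateRegion convention; the successor block is illustrative):
    //
    //   pred.store.entry -> pred.store.if -> pred.store.continue -> next.vpbb
    //   (pred.store.entry also branches straight to pred.store.continue when
    //    the mask is false)
    //
    // If Target sits inside the pred.store region, Sink is moved to the start
    // of the region's single successor ('next.vpbb') rather than into the
    // region, so it keeps executing unconditionally.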
8837 if (auto *Region = 8838 dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { 8839 if (Region->isReplicator()) { 8840 assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); 8841 VPBasicBlock *NextBlock = 8842 cast<VPBasicBlock>(Region->getSuccessors().front()); 8843 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8844 continue; 8845 } 8846 } 8847 Sink->moveAfter(Target); 8848 } 8849 8850 // Interleave memory: for each Interleave Group we marked earlier as relevant 8851 // for this VPlan, replace the Recipes widening its memory instructions with a 8852 // single VPInterleaveRecipe at its insertion point. 8853 for (auto IG : InterleaveGroups) { 8854 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8855 RecipeBuilder.getRecipe(IG->getInsertPos())); 8856 SmallVector<VPValue *, 4> StoredValues; 8857 for (unsigned i = 0; i < IG->getFactor(); ++i) 8858 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 8859 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 8860 8861 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8862 Recipe->getMask()); 8863 VPIG->insertBefore(Recipe); 8864 unsigned J = 0; 8865 for (unsigned i = 0; i < IG->getFactor(); ++i) 8866 if (Instruction *Member = IG->getMember(i)) { 8867 if (!Member->getType()->isVoidTy()) { 8868 VPValue *OriginalV = Plan->getVPValue(Member); 8869 Plan->removeVPValueFor(Member); 8870 Plan->addVPValue(Member, VPIG->getVPValue(J)); 8871 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8872 J++; 8873 } 8874 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8875 } 8876 } 8877 8878 // Adjust the recipes for any inloop reductions. 8879 if (Range.Start.isVector()) 8880 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 8881 8882 // Finally, if tail is folded by masking, introduce selects between the phi 8883 // and the live-out instruction of each reduction, at the end of the latch. 8884 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 8885 Builder.setInsertPoint(VPBB); 8886 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 8887 for (auto &Reduction : Legal->getReductionVars()) { 8888 if (CM.isInLoopReduction(Reduction.first)) 8889 continue; 8890 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 8891 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 8892 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 8893 } 8894 } 8895 8896 std::string PlanName; 8897 raw_string_ostream RSO(PlanName); 8898 ElementCount VF = Range.Start; 8899 Plan->addVF(VF); 8900 RSO << "Initial VPlan for VF={" << VF; 8901 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 8902 Plan->addVF(VF); 8903 RSO << "," << VF; 8904 } 8905 RSO << "},UF>=1"; 8906 RSO.flush(); 8907 Plan->setName(PlanName); 8908 8909 return Plan; 8910 } 8911 8912 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8913 // Outer loop handling: They may require CFG and instruction level 8914 // transformations before even evaluating whether vectorization is profitable. 8915 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8916 // the vectorization pipeline. 
8917   assert(!OrigLoop->isInnermost());
8918   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8919
8920   // Create new empty VPlan
8921   auto Plan = std::make_unique<VPlan>();
8922
8923   // Build hierarchical CFG
8924   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8925   HCFGBuilder.buildHierarchicalCFG();
8926
8927   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
8928        VF *= 2)
8929     Plan->addVF(VF);
8930
8931   if (EnableVPlanPredication) {
8932     VPlanPredicator VPP(*Plan);
8933     VPP.predicate();
8934
8935     // Avoid running the transformation to recipes until masked code
8936     // generation in the VPlan-native path is in place.
8937     return Plan;
8938   }
8939
8940   SmallPtrSet<Instruction *, 1> DeadInstructions;
8941   VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
8942                                              Legal->getInductionVars(),
8943                                              DeadInstructions, *PSE.getSE());
8944   return Plan;
8945 }
8946
8947 // Adjust the recipes for any inloop reductions. The chain of instructions
8948 // leading from the loop exit instr to the phi needs to be converted to
8949 // reductions, with one operand being vector and the other being the scalar
8950 // reduction chain.
8951 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
8952     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
8953   for (auto &Reduction : CM.getInLoopReductionChains()) {
8954     PHINode *Phi = Reduction.first;
8955     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8956     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8957
8958     // ReductionOperations are ordered top-down from the phi's use to the
8959     // LoopExitValue. We keep track of the previous item (the Chain) to tell
8960     // which of the two operands will remain scalar and which will be reduced.
8961     // For minmax the chain will be the select instructions.
8962     Instruction *Chain = Phi;
8963     for (Instruction *R : ReductionOperations) {
8964       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
8965       RecurKind Kind = RdxDesc.getRecurrenceKind();
8966
8967       VPValue *ChainOp = Plan->getVPValue(Chain);
8968       unsigned FirstOpId;
8969       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8970         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
8971                "Expected to replace a VPWidenSelectSC");
8972         FirstOpId = 1;
8973       } else {
8974         assert(isa<VPWidenRecipe>(WidenRecipe) &&
8975                "Expected to replace a VPWidenSC");
8976         FirstOpId = 0;
8977       }
8978       unsigned VecOpId =
8979           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
8980       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
8981
8982       auto *CondOp = CM.foldTailByMasking()
8983                          ?
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 8984 : nullptr; 8985 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 8986 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 8987 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 8988 Plan->removeVPValueFor(R); 8989 Plan->addVPValue(R, RedRecipe); 8990 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 8991 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 8992 WidenRecipe->eraseFromParent(); 8993 8994 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 8995 VPRecipeBase *CompareRecipe = 8996 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 8997 assert(isa<VPWidenRecipe>(CompareRecipe) && 8998 "Expected to replace a VPWidenSC"); 8999 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9000 "Expected no remaining users"); 9001 CompareRecipe->eraseFromParent(); 9002 } 9003 Chain = R; 9004 } 9005 } 9006 } 9007 9008 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9009 VPSlotTracker &SlotTracker) const { 9010 O << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9011 IG->getInsertPos()->printAsOperand(O, false); 9012 O << ", "; 9013 getAddr()->printAsOperand(O, SlotTracker); 9014 VPValue *Mask = getMask(); 9015 if (Mask) { 9016 O << ", "; 9017 Mask->printAsOperand(O, SlotTracker); 9018 } 9019 for (unsigned i = 0; i < IG->getFactor(); ++i) 9020 if (Instruction *I = IG->getMember(i)) 9021 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 9022 } 9023 9024 void VPWidenCallRecipe::execute(VPTransformState &State) { 9025 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9026 *this, State); 9027 } 9028 9029 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9030 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9031 this, *this, InvariantCond, State); 9032 } 9033 9034 void VPWidenRecipe::execute(VPTransformState &State) { 9035 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9036 } 9037 9038 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9039 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9040 *this, State.UF, State.VF, IsPtrLoopInvariant, 9041 IsIndexLoopInvariant, State); 9042 } 9043 9044 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9045 assert(!State.Instance && "Int or FP induction being replicated."); 9046 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9047 getTruncInst(), getVPValue(0), 9048 getCastValue(), State); 9049 } 9050 9051 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9052 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc, 9053 getStartValue(), this, State); 9054 } 9055 9056 void VPBlendRecipe::execute(VPTransformState &State) { 9057 State.ILV->setDebugLocFromInst(State.Builder, Phi); 9058 // We know that all PHIs in non-header blocks are converted into 9059 // selects, so we don't have to worry about the insertion order and we 9060 // can just use the builder. 9061 // At this point we generate the predication tree. There may be 9062 // duplications since this is a simple recursive scan, but future 9063 // optimizations will clean it up. 
9064 9065 unsigned NumIncoming = getNumIncomingValues(); 9066 9067 // Generate a sequence of selects of the form: 9068 // SELECT(Mask3, In3, 9069 // SELECT(Mask2, In2, 9070 // SELECT(Mask1, In1, 9071 // In0))) 9072 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9073 // are essentially undef are taken from In0. 9074 InnerLoopVectorizer::VectorParts Entry(State.UF); 9075 for (unsigned In = 0; In < NumIncoming; ++In) { 9076 for (unsigned Part = 0; Part < State.UF; ++Part) { 9077 // We might have single edge PHIs (blocks) - use an identity 9078 // 'select' for the first PHI operand. 9079 Value *In0 = State.get(getIncomingValue(In), Part); 9080 if (In == 0) 9081 Entry[Part] = In0; // Initialize with the first incoming value. 9082 else { 9083 // Select between the current value and the previous incoming edge 9084 // based on the incoming mask. 9085 Value *Cond = State.get(getMask(In), Part); 9086 Entry[Part] = 9087 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9088 } 9089 } 9090 } 9091 for (unsigned Part = 0; Part < State.UF; ++Part) 9092 State.set(this, Entry[Part], Part); 9093 } 9094 9095 void VPInterleaveRecipe::execute(VPTransformState &State) { 9096 assert(!State.Instance && "Interleave group being replicated."); 9097 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9098 getStoredValues(), getMask()); 9099 } 9100 9101 void VPReductionRecipe::execute(VPTransformState &State) { 9102 assert(!State.Instance && "Reduction being replicated."); 9103 for (unsigned Part = 0; Part < State.UF; ++Part) { 9104 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9105 Value *NewVecOp = State.get(getVecOp(), Part); 9106 if (VPValue *Cond = getCondOp()) { 9107 Value *NewCond = State.get(Cond, Part); 9108 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9109 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 9110 Kind, VecTy->getElementType()); 9111 Constant *IdenVec = 9112 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 9113 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9114 NewVecOp = Select; 9115 } 9116 Value *NewRed = 9117 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9118 Value *PrevInChain = State.get(getChainOp(), Part); 9119 Value *NextInChain; 9120 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9121 NextInChain = 9122 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9123 NewRed, PrevInChain); 9124 } else { 9125 NextInChain = State.Builder.CreateBinOp( 9126 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 9127 PrevInChain); 9128 } 9129 State.set(this, NextInChain, Part); 9130 } 9131 } 9132 9133 void VPReplicateRecipe::execute(VPTransformState &State) { 9134 if (State.Instance) { // Generate a single instance. 9135 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9136 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9137 *State.Instance, IsPredicated, State); 9138 // Insert scalar instance packing it into a vector. 9139 if (AlsoPack && State.VF.isVector()) { 9140 // If we're constructing lane 0, initialize to start from poison. 
9141     if (State.Instance->Lane.isFirstLane()) {
9142       assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9143       Value *Poison = PoisonValue::get(
9144           VectorType::get(getUnderlyingValue()->getType(), State.VF));
9145       State.set(this, Poison, State.Instance->Part);
9146     }
9147     State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9148     }
9149     return;
9150   }
9151
9152   // Generate scalar instances for all VF lanes of all UF parts, unless the
9153   // instruction is uniform, in which case generate only the first lane for
9154   // each of the UF parts.
9155   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9156   assert((!State.VF.isScalable() || IsUniform) &&
9157          "Can't scalarize a scalable vector");
9158   for (unsigned Part = 0; Part < State.UF; ++Part)
9159     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9160       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9161                                       VPIteration(Part, Lane), IsPredicated,
9162                                       State);
9163 }
9164
9165 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9166   assert(State.Instance && "Branch on Mask works only on single instance.");
9167
9168   unsigned Part = State.Instance->Part;
9169   unsigned Lane = State.Instance->Lane.getKnownLane();
9170
9171   Value *ConditionBit = nullptr;
9172   VPValue *BlockInMask = getMask();
9173   if (BlockInMask) {
9174     ConditionBit = State.get(BlockInMask, Part);
9175     if (ConditionBit->getType()->isVectorTy())
9176       ConditionBit = State.Builder.CreateExtractElement(
9177           ConditionBit, State.Builder.getInt32(Lane));
9178   } else // Block in mask is all-one.
9179     ConditionBit = State.Builder.getTrue();
9180
9181   // Replace the temporary unreachable terminator with a new conditional branch,
9182   // whose two destinations will be set later when they are created.
9183   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9184   assert(isa<UnreachableInst>(CurrentTerminator) &&
9185          "Expected to replace unreachable terminator with conditional branch.");
9186   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9187   CondBr->setSuccessor(0, nullptr);
9188   ReplaceInstWithInst(CurrentTerminator, CondBr);
9189 }
9190
9191 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9192   assert(State.Instance && "Predicated instruction PHI works per instance.");
9193   Instruction *ScalarPredInst =
9194       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9195   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9196   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9197   assert(PredicatingBB && "Predicated block has no single predecessor.");
9198   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9199          "operand must be VPReplicateRecipe");
9200
9201   // By current pack/unpack logic we need to generate only a single phi node: if
9202   // a vector value for the predicated instruction exists at this point it means
9203   // the instruction has vector users only, and a phi for the vector value is
9204   // needed. In this case the recipe of the predicated instruction is marked to
9205   // also do that packing, thereby "hoisting" the insert-element sequence.
9206   // Otherwise, a phi node for the scalar value is needed.
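  // Illustrative shape of the two phis (hypothetical IR, VF = 4):
  //
  //   ; vector users only - phi over the packed vector value:
  //   %vphi = phi <4 x i32> [ %pack.prev, %predicating.bb ],
  //                         [ %pack.new,  %predicated.bb ]
  //
  //   ; otherwise - phi over the scalar instance, poison when the lane was
  //   ; masked off:
  //   %sphi = phi i32 [ poison, %predicating.bb ], [ %scalar, %predicated.bb ]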
9207 unsigned Part = State.Instance->Part; 9208 if (State.hasVectorValue(getOperand(0), Part)) { 9209 Value *VectorValue = State.get(getOperand(0), Part); 9210 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9211 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9212 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9213 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9214 if (State.hasVectorValue(this, Part)) 9215 State.reset(this, VPhi, Part); 9216 else 9217 State.set(this, VPhi, Part); 9218 // NOTE: Currently we need to update the value of the operand, so the next 9219 // predicated iteration inserts its generated value in the correct vector. 9220 State.reset(getOperand(0), VPhi, Part); 9221 } else { 9222 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9223 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9224 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9225 PredicatingBB); 9226 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9227 if (State.hasScalarValue(this, *State.Instance)) 9228 State.reset(this, Phi, *State.Instance); 9229 else 9230 State.set(this, Phi, *State.Instance); 9231 // NOTE: Currently we need to update the value of the operand, so the next 9232 // predicated iteration inserts its generated value in the correct vector. 9233 State.reset(getOperand(0), Phi, *State.Instance); 9234 } 9235 } 9236 9237 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9238 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9239 State.ILV->vectorizeMemoryInstruction(&Ingredient, State, 9240 StoredValue ? nullptr : getVPValue(), 9241 getAddr(), StoredValue, getMask()); 9242 } 9243 9244 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9245 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9246 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9247 // for predication. 9248 static ScalarEpilogueLowering getScalarEpilogueLowering( 9249 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9250 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9251 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9252 LoopVectorizationLegality &LVL) { 9253 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9254 // don't look at hints or options, and don't request a scalar epilogue. 9255 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9256 // LoopAccessInfo (due to code dependency and not being able to reliably get 9257 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9258 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9259 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9260 // back to the old way and vectorize with versioning when forced. See D81345.) 
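  // For example (hypothetical source, to illustrate the interplay):
  //
  //   #pragma clang loop vectorize(enable)
  //   for (...) { ... }
  //
  // in a PGSO-cold function is still allowed to vectorize with runtime
  // versioning, because the force flag bypasses the size check below; in a
  // function attributed optsize the scalar epilogue is never allowed, pragma
  // or not.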
9261 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9262 PGSOQueryType::IRPass) && 9263 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9264 return CM_ScalarEpilogueNotAllowedOptSize; 9265 9266 // 2) If set, obey the directives 9267 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9268 switch (PreferPredicateOverEpilogue) { 9269 case PreferPredicateTy::ScalarEpilogue: 9270 return CM_ScalarEpilogueAllowed; 9271 case PreferPredicateTy::PredicateElseScalarEpilogue: 9272 return CM_ScalarEpilogueNotNeededUsePredicate; 9273 case PreferPredicateTy::PredicateOrDontVectorize: 9274 return CM_ScalarEpilogueNotAllowedUsePredicate; 9275 }; 9276 } 9277 9278 // 3) If set, obey the hints 9279 switch (Hints.getPredicate()) { 9280 case LoopVectorizeHints::FK_Enabled: 9281 return CM_ScalarEpilogueNotNeededUsePredicate; 9282 case LoopVectorizeHints::FK_Disabled: 9283 return CM_ScalarEpilogueAllowed; 9284 }; 9285 9286 // 4) if the TTI hook indicates this is profitable, request predication. 9287 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9288 LVL.getLAI())) 9289 return CM_ScalarEpilogueNotNeededUsePredicate; 9290 9291 return CM_ScalarEpilogueAllowed; 9292 } 9293 9294 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9295 // If Values have been set for this Def return the one relevant for \p Part. 9296 if (hasVectorValue(Def, Part)) 9297 return Data.PerPartOutput[Def][Part]; 9298 9299 if (!hasScalarValue(Def, {Part, 0})) { 9300 Value *IRV = Def->getLiveInIRValue(); 9301 Value *B = ILV->getBroadcastInstrs(IRV); 9302 set(Def, B, Part); 9303 return B; 9304 } 9305 9306 Value *ScalarValue = get(Def, {Part, 0}); 9307 // If we aren't vectorizing, we can just copy the scalar map values over 9308 // to the vector map. 9309 if (VF.isScalar()) { 9310 set(Def, ScalarValue, Part); 9311 return ScalarValue; 9312 } 9313 9314 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 9315 bool IsUniform = RepR && RepR->isUniform(); 9316 9317 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9318 // Check if there is a scalar value for the selected lane. 9319 if (!hasScalarValue(Def, {Part, LastLane})) { 9320 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 9321 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 9322 "unexpected recipe found to be invariant"); 9323 IsUniform = true; 9324 LastLane = 0; 9325 } 9326 9327 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 9328 9329 // Set the insert point after the last scalarized instruction. This 9330 // ensures the insertelement sequence will directly follow the scalar 9331 // definitions. 9332 auto OldIP = Builder.saveIP(); 9333 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 9334 Builder.SetInsertPoint(&*NewIP); 9335 9336 // However, if we are vectorizing, we need to construct the vector values. 9337 // If the value is known to be uniform after vectorization, we can just 9338 // broadcast the scalar value corresponding to lane zero for each unroll 9339 // iteration. Otherwise, we construct the vector values using 9340 // insertelement instructions. Since the resulting vectors are stored in 9341 // State, we will only generate the insertelements once. 9342 Value *VectorValue = nullptr; 9343 if (IsUniform) { 9344 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 9345 set(Def, VectorValue, Part); 9346 } else { 9347 // Initialize packing with insertelements to start from undef. 
9348 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 9349 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 9350 set(Def, Undef, Part); 9351 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 9352 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 9353 VectorValue = get(Def, Part); 9354 } 9355 Builder.restoreIP(OldIP); 9356 return VectorValue; 9357 } 9358 9359 // Process the loop in the VPlan-native vectorization path. This path builds 9360 // VPlan upfront in the vectorization pipeline, which allows to apply 9361 // VPlan-to-VPlan transformations from the very beginning without modifying the 9362 // input LLVM IR. 9363 static bool processLoopInVPlanNativePath( 9364 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9365 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9366 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9367 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9368 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 9369 9370 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9371 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9372 return false; 9373 } 9374 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9375 Function *F = L->getHeader()->getParent(); 9376 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9377 9378 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9379 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 9380 9381 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9382 &Hints, IAI); 9383 // Use the planner for outer loop vectorization. 9384 // TODO: CM is not used at this point inside the planner. Turn CM into an 9385 // optional argument if we don't need it in the future. 9386 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 9387 9388 // Get user vectorization factor. 9389 ElementCount UserVF = Hints.getWidth(); 9390 9391 // Plan how to best vectorize, return the best VF and its cost. 9392 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9393 9394 // If we are stress testing VPlan builds, do not attempt to generate vector 9395 // code. Masked vector code generation support will follow soon. 9396 // Also, do not attempt to vectorize if no vector code will be produced. 9397 if (VPlanBuildStressTest || EnableVPlanPredication || 9398 VectorizationFactor::Disabled() == VF) 9399 return false; 9400 9401 LVP.setBestPlan(VF.Width, 1); 9402 9403 { 9404 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 9405 F->getParent()->getDataLayout()); 9406 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 9407 &CM, BFI, PSI, Checks); 9408 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9409 << L->getHeader()->getParent()->getName() << "\"\n"); 9410 LVP.executePlan(LB, DT); 9411 } 9412 9413 // Mark the loop as already vectorized to avoid vectorizing again. 9414 Hints.setAlreadyVectorized(); 9415 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9416 return true; 9417 } 9418 9419 // Emit a remark if there are stores to floats that required a floating point 9420 // extension. If the vectorized loop was generated with floating point there 9421 // will be a performance penalty from the conversion overhead and the change in 9422 // the vector width. 
9423 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 9424 SmallVector<Instruction *, 4> Worklist; 9425 for (BasicBlock *BB : L->getBlocks()) { 9426 for (Instruction &Inst : *BB) { 9427 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 9428 if (S->getValueOperand()->getType()->isFloatTy()) 9429 Worklist.push_back(S); 9430 } 9431 } 9432 } 9433 9434 // Traverse the floating point stores upwards searching, for floating point 9435 // conversions. 9436 SmallPtrSet<const Instruction *, 4> Visited; 9437 SmallPtrSet<const Instruction *, 4> EmittedRemark; 9438 while (!Worklist.empty()) { 9439 auto *I = Worklist.pop_back_val(); 9440 if (!L->contains(I)) 9441 continue; 9442 if (!Visited.insert(I).second) 9443 continue; 9444 9445 // Emit a remark if the floating point store required a floating 9446 // point conversion. 9447 // TODO: More work could be done to identify the root cause such as a 9448 // constant or a function return type and point the user to it. 9449 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 9450 ORE->emit([&]() { 9451 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 9452 I->getDebugLoc(), L->getHeader()) 9453 << "floating point conversion changes vector width. " 9454 << "Mixed floating point precision requires an up/down " 9455 << "cast that will negatively impact performance."; 9456 }); 9457 9458 for (Use &Op : I->operands()) 9459 if (auto *OpI = dyn_cast<Instruction>(Op)) 9460 Worklist.push_back(OpI); 9461 } 9462 } 9463 9464 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9465 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9466 !EnableLoopInterleaving), 9467 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9468 !EnableLoopVectorization) {} 9469 9470 bool LoopVectorizePass::processLoop(Loop *L) { 9471 assert((EnableVPlanNativePath || L->isInnermost()) && 9472 "VPlan-native path is not enabled. Only process inner loops."); 9473 9474 #ifndef NDEBUG 9475 const std::string DebugLocStr = getDebugLocString(L); 9476 #endif /* NDEBUG */ 9477 9478 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 9479 << L->getHeader()->getParent()->getName() << "\" from " 9480 << DebugLocStr << "\n"); 9481 9482 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 9483 9484 LLVM_DEBUG( 9485 dbgs() << "LV: Loop hints:" 9486 << " force=" 9487 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9488 ? "disabled" 9489 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9490 ? "enabled" 9491 : "?")) 9492 << " width=" << Hints.getWidth() 9493 << " unroll=" << Hints.getInterleave() << "\n"); 9494 9495 // Function containing loop 9496 Function *F = L->getHeader()->getParent(); 9497 9498 // Looking at the diagnostic output is the only way to determine if a loop 9499 // was vectorized (other than looking at the IR or machine code), so it 9500 // is important to generate an optimization remark for each loop. Most of 9501 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9502 // generated as OptimizationRemark and OptimizationRemarkMissed are 9503 // less verbose reporting vectorized loops and unvectorized loops that may 9504 // benefit from vectorization, respectively. 9505 9506 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9507 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9508 return false; 9509 } 9510 9511 PredicatedScalarEvolution PSE(*SE, *L); 9512 9513 // Check if it is legal to vectorize the loop. 
9514 LoopVectorizationRequirements Requirements(*ORE); 9515 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 9516 &Requirements, &Hints, DB, AC, BFI, PSI); 9517 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9518 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9519 Hints.emitRemarkWithHints(); 9520 return false; 9521 } 9522 9523 // Check the function attributes and profiles to find out if this function 9524 // should be optimized for size. 9525 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9526 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 9527 9528 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9529 // here. They may require CFG and instruction level transformations before 9530 // even evaluating whether vectorization is profitable. Since we cannot modify 9531 // the incoming IR, we need to build VPlan upfront in the vectorization 9532 // pipeline. 9533 if (!L->isInnermost()) 9534 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9535 ORE, BFI, PSI, Hints); 9536 9537 assert(L->isInnermost() && "Inner loop expected."); 9538 9539 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9540 // count by optimizing for size, to minimize overheads. 9541 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9542 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9543 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9544 << "This loop is worth vectorizing only if no scalar " 9545 << "iteration overheads are incurred."); 9546 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9547 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9548 else { 9549 LLVM_DEBUG(dbgs() << "\n"); 9550 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9551 } 9552 } 9553 9554 // Check the function attributes to see if implicit floats are allowed. 9555 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9556 // an integer loop and the vector instructions selected are purely integer 9557 // vector instructions? 9558 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9559 reportVectorizationFailure( 9560 "Can't vectorize when the NoImplicitFloat attribute is used", 9561 "loop not vectorized due to NoImplicitFloat attribute", 9562 "NoImplicitFloat", ORE, L); 9563 Hints.emitRemarkWithHints(); 9564 return false; 9565 } 9566 9567 // Check if the target supports potentially unsafe FP vectorization. 9568 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9569 // for the target we're vectorizing for, to make sure none of the 9570 // additional fp-math flags can help. 9571 if (Hints.isPotentiallyUnsafe() && 9572 TTI->isFPVectorizationPotentiallyUnsafe()) { 9573 reportVectorizationFailure( 9574 "Potentially unsafe FP op prevents vectorization", 9575 "loop not vectorized due to unsafe FP support.", 9576 "UnsafeFP", ORE, L); 9577 Hints.emitRemarkWithHints(); 9578 return false; 9579 } 9580 9581 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 9582 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 9583 9584 // If an override option has been passed in for interleaved accesses, use it. 9585 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 9586 UseInterleaved = EnableInterleavedMemAccesses; 9587 9588 // Analyze interleaved memory accesses. 
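  // For example (an illustrative access pattern, not a specific test): loads
  // of both fields of an array of pairs,
  //
  //   ... = a[i].x;  ... = a[i].y;
  //
  // form a stride-2 interleave group of factor 2; recording such groups lets
  // the planner later emit one wide load plus shuffles (a VPInterleaveRecipe)
  // instead of two strided or gathered accesses.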
  // Analyze interleaved memory accesses.
  if (UseInterleaved)
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get the user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize and return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being
    // explicitly requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if the user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
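
  // Usage note (for reference): the remarks emitted below are typically
  // surfaced to users via clang's -Rpass=loop-vectorize,
  // -Rpass-missed=loop-vectorize and -Rpass-analysis=loop-vectorize flags;
  // the wording the user sees comes from the messages constructed above.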
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out not
    // to be profitable. Limit the scope of Checks so the cleanup happens
    // immediately after vector code generation is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
    LVP.setBestPlan(VF.Width, IC);

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not legal to vectorize the loop, then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);
      LVP.executePlan(Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *legal* to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar
        // epilogue to be vectorized by executing the plan (potentially with a
        // different factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
                                          EpilogueVF.Width.getKnownMinValue(),
                                          1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
        LVP.executePlan(MainILV, DT);
        ++LoopsVectorized;

        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // The second pass vectorizes the epilogue and adjusts the control
        // flow edges from the first pass.
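        // Rough sketch of the intended result of both passes (illustrative
        // only; the exact CFG is produced by the skeleton-creation code in
        // the vectorizers):
        //   main vector loop (width EPI.MainLoopVF)
        //     -> epilogue vector loop (width EPI.EpilogueVF)
        //       -> scalar remainder loop
        // with trip-count guards between the stages.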
        LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);
        LVP.executePlan(EpilogILV, DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);
        LVP.executePlan(LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling of the scalar loop when
        // there are no runtime checks about strides and memory. A scalar loop
        // that is rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt vectorization if:
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means that running the loop
  // vectorizer will simplify all loops, regardless of whether anything ends
  // up being vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
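
  // For reference: loop-simplify form guarantees that each loop has a
  // dedicated preheader, a single backedge, and dedicated exit blocks, which
  // the legality checks and skeleton creation in processLoop rely on.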
  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Report whether anything in the function changed.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve LoopInfo/DominatorTree analyses with outer
  // loop vectorization. Until this is addressed, mark these analyses as
  // preserved only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}
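
// Usage note (for reference, not part of the pass itself): the pass can be
// exercised in isolation with the new pass manager, e.g.
//   opt -passes=loop-vectorize -S input.ll
// and its decisions can be traced with -debug-only=loop-vectorize in
// assertion-enabled builds.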