//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is an ongoing development effort to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
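//
// As a purely illustrative sketch (the array names, element type and the
// vectorization factor below are hypothetical, not taken from this file), the
// widening described above conceptually turns a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// into a loop whose body operates on VF = 4 elements at a time, roughly:
//
//   %wide.b   = load <4 x i32>, <4 x i32>* %b.addr
//   %wide.c   = load <4 x i32>, <4 x i32>* %c.addr
//   %wide.add = add <4 x i32> %wide.b, %wide.c
//   store <4 x i32> %wide.add, <4 x i32>* %a.addr
//
// with the induction variable advanced by VF (times the unroll factor UF) per
// iteration and a scalar epilogue loop covering any remaining iterations.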
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly.
// If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and
  /// more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
                           VPValue *StartV, VPValue *Def,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p Operands instead of
  /// \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The (unique) ExitBlock of the scalar loop. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step,
                              ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way, and
  /// the form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
1347     CM_Interleave,
1348     CM_GatherScatter,
1349     CM_Scalarize
1350   };
1351
1352   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1353   /// instruction \p I and vector width \p VF.
1354   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1355                            InstructionCost Cost) {
1356     assert(VF.isVector() && "Expected VF >=2");
1357     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1358   }
1359
1360   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1361   /// interleaving group \p Grp and vector width \p VF.
1362   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1363                            ElementCount VF, InstWidening W,
1364                            InstructionCost Cost) {
1365     assert(VF.isVector() && "Expected VF >=2");
1366     // Broadcast this decision to all instructions inside the group.
1367     // But the cost will be assigned to one instruction only.
1368     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1369       if (auto *I = Grp->getMember(i)) {
1370         if (Grp->getInsertPos() == I)
1371           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1372         else
1373           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1374       }
1375     }
1376   }
1377
1378   /// Return the cost model decision for the given instruction \p I and vector
1379   /// width \p VF. Return CM_Unknown if this instruction did not pass
1380   /// through the cost modeling.
1381   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1382     assert(VF.isVector() && "Expected VF to be a vector VF");
1383     // Cost model is not run in the VPlan-native path - return conservative
1384     // result until this changes.
1385     if (EnableVPlanNativePath)
1386       return CM_GatherScatter;
1387
1388     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1389     auto Itr = WideningDecisions.find(InstOnVF);
1390     if (Itr == WideningDecisions.end())
1391       return CM_Unknown;
1392     return Itr->second.first;
1393   }
1394
1395   /// Return the vectorization cost for the given instruction \p I and vector
1396   /// width \p VF.
1397   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1398     assert(VF.isVector() && "Expected VF >=2");
1399     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1400     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1401            "The cost is not calculated");
1402     return WideningDecisions[InstOnVF].second;
1403   }
1404
1405   /// Return true if instruction \p I is an optimizable truncate whose operand
1406   /// is an induction variable. Such a truncate will be removed by adding a new
1407   /// induction variable with the destination type.
1408   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1409     // If the instruction is not a truncate, return false.
1410     auto *Trunc = dyn_cast<TruncInst>(I);
1411     if (!Trunc)
1412       return false;
1413
1414     // Get the source and destination types of the truncate.
1415     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1416     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1417
1418     // If the truncate is free for the given types, return false. Replacing a
1419     // free truncate with an induction variable would add an induction variable
1420     // update instruction to each iteration of the loop. We exclude from this
1421     // check the primary induction variable since it will need an update
1422     // instruction regardless.
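    // For example (illustrative only): given an i64 induction %i and a user
    //   %t = trunc i64 %i to i32
    // the truncate can be removed by introducing a new i32 induction variable
    // with the destination type. The checks below make sure the truncate is
    // not already free (unless %i is the primary induction) and that the
    // truncated value really is an induction phi.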
1423     Value *Op = Trunc->getOperand(0);
1424     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1425       return false;
1426
1427     // If the truncated value is not an induction variable, return false.
1428     return Legal->isInductionPhi(Op);
1429   }
1430
1431   /// Collects the instructions to scalarize for each predicated instruction in
1432   /// the loop.
1433   void collectInstsToScalarize(ElementCount VF);
1434
1435   /// Collect Uniform and Scalar values for the given \p VF.
1436   /// The sets depend on CM decision for Load/Store instructions
1437   /// that may be vectorized as interleave, gather-scatter or scalarized.
1438   void collectUniformsAndScalars(ElementCount VF) {
1439     // Do the analysis once.
1440     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1441       return;
1442     setCostBasedWideningDecision(VF);
1443     collectLoopUniforms(VF);
1444     collectLoopScalars(VF);
1445   }
1446
1447   /// Returns true if the target machine supports masked store operation
1448   /// for the given \p DataType and kind of access to \p Ptr.
1449   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1450     return Legal->isConsecutivePtr(Ptr) &&
1451            TTI.isLegalMaskedStore(DataType, Alignment);
1452   }
1453
1454   /// Returns true if the target machine supports masked load operation
1455   /// for the given \p DataType and kind of access to \p Ptr.
1456   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1457     return Legal->isConsecutivePtr(Ptr) &&
1458            TTI.isLegalMaskedLoad(DataType, Alignment);
1459   }
1460
1461   /// Returns true if the target machine supports masked scatter operation
1462   /// for the given \p DataType.
1463   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1464     return TTI.isLegalMaskedScatter(DataType, Alignment);
1465   }
1466
1467   /// Returns true if the target machine supports masked gather operation
1468   /// for the given \p DataType.
1469   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1470     return TTI.isLegalMaskedGather(DataType, Alignment);
1471   }
1472
1473   /// Returns true if the target machine can represent \p V as a masked gather
1474   /// or scatter operation.
1475   bool isLegalGatherOrScatter(Value *V) {
1476     bool LI = isa<LoadInst>(V);
1477     bool SI = isa<StoreInst>(V);
1478     if (!LI && !SI)
1479       return false;
1480     auto *Ty = getMemInstValueType(V);
1481     Align Align = getLoadStoreAlignment(V);
1482     return (LI && isLegalMaskedGather(Ty, Align)) ||
1483            (SI && isLegalMaskedScatter(Ty, Align));
1484   }
1485
1486   /// Returns true if the target machine supports all of the reduction
1487   /// variables found for the given VF.
1488   bool canVectorizeReductions(ElementCount VF) {
1489     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1490       RecurrenceDescriptor RdxDesc = Reduction.second;
1491       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1492     }));
1493   }
1494
1495   /// Returns true if \p I is an instruction that will be scalarized with
1496   /// predication. Such instructions include conditional stores and
1497   /// instructions that may divide by zero.
1498   /// If a non-zero VF has been calculated, we check if \p I will be scalarized
1499   /// with predication for that VF.
1500   bool isScalarWithPredication(Instruction *I,
1501                                ElementCount VF = ElementCount::getFixed(1));
1502
1503   // Returns true if \p I is an instruction that will be predicated either
1504   // through scalar predication or masked load/store or masked gather/scatter.
1505   // Superset of instructions that return true for isScalarWithPredication.
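  // For example (illustrative): in a loop body containing
  //   if (c[i] != 0)
  //     a[i] = b[i] / c[i];
  // both the conditional store to a[i] and the division are predicated, since
  // they may only execute under the block's mask, while an ordinary add in the
  // same block is not, because it can safely be executed speculatively.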
1506 bool isPredicatedInst(Instruction *I) { 1507 if (!blockNeedsPredication(I->getParent())) 1508 return false; 1509 // Loads and stores that need some form of masked operation are predicated 1510 // instructions. 1511 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1512 return Legal->isMaskRequired(I); 1513 return isScalarWithPredication(I); 1514 } 1515 1516 /// Returns true if \p I is a memory instruction with consecutive memory 1517 /// access that can be widened. 1518 bool 1519 memoryInstructionCanBeWidened(Instruction *I, 1520 ElementCount VF = ElementCount::getFixed(1)); 1521 1522 /// Returns true if \p I is a memory instruction in an interleaved-group 1523 /// of memory accesses that can be vectorized with wide vector loads/stores 1524 /// and shuffles. 1525 bool 1526 interleavedAccessCanBeWidened(Instruction *I, 1527 ElementCount VF = ElementCount::getFixed(1)); 1528 1529 /// Check if \p Instr belongs to any interleaved access group. 1530 bool isAccessInterleaved(Instruction *Instr) { 1531 return InterleaveInfo.isInterleaved(Instr); 1532 } 1533 1534 /// Get the interleaved access group that \p Instr belongs to. 1535 const InterleaveGroup<Instruction> * 1536 getInterleavedAccessGroup(Instruction *Instr) { 1537 return InterleaveInfo.getInterleaveGroup(Instr); 1538 } 1539 1540 /// Returns true if we're required to use a scalar epilogue for at least 1541 /// the final iteration of the original loop. 1542 bool requiresScalarEpilogue() const { 1543 if (!isScalarEpilogueAllowed()) 1544 return false; 1545 // If we might exit from anywhere but the latch, must run the exiting 1546 // iteration in scalar form. 1547 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1548 return true; 1549 return InterleaveInfo.requiresScalarEpilogue(); 1550 } 1551 1552 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1553 /// loop hint annotation. 1554 bool isScalarEpilogueAllowed() const { 1555 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1556 } 1557 1558 /// Returns true if all loop blocks should be masked to fold tail loop. 1559 bool foldTailByMasking() const { return FoldTailByMasking; } 1560 1561 bool blockNeedsPredication(BasicBlock *BB) { 1562 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1563 } 1564 1565 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1566 /// nodes to the chain of instructions representing the reductions. Uses a 1567 /// MapVector to ensure deterministic iteration order. 1568 using ReductionChainMap = 1569 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1570 1571 /// Return the chain of instructions representing an inloop reduction. 1572 const ReductionChainMap &getInLoopReductionChains() const { 1573 return InLoopReductionChains; 1574 } 1575 1576 /// Returns true if the Phi is part of an inloop reduction. 1577 bool isInLoopReduction(PHINode *Phi) const { 1578 return InLoopReductionChains.count(Phi); 1579 } 1580 1581 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1582 /// with factor VF. Return the cost of the instruction, including 1583 /// scalarization overhead if it's needed. 1584 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF); 1585 1586 /// Estimate cost of a call instruction CI if it were vectorized with factor 1587 /// VF. Return the cost of the instruction, including scalarization overhead 1588 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1589 /// scalarized - 1590 /// i.e. 
either vector version isn't available, or is too expensive. 1591 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1592 bool &NeedToScalarize); 1593 1594 /// Invalidates decisions already taken by the cost model. 1595 void invalidateCostModelingDecisions() { 1596 WideningDecisions.clear(); 1597 Uniforms.clear(); 1598 Scalars.clear(); 1599 } 1600 1601 private: 1602 unsigned NumPredStores = 0; 1603 1604 /// \return An upper bound for the vectorization factor, a power-of-2 larger 1605 /// than zero. One is returned if vectorization should best be avoided due 1606 /// to cost. 1607 ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, 1608 ElementCount UserVF); 1609 1610 /// The vectorization cost is a combination of the cost itself and a boolean 1611 /// indicating whether any of the contributing operations will actually 1612 /// operate on 1613 /// vector values after type legalization in the backend. If this latter value 1614 /// is 1615 /// false, then all operations will be scalarized (i.e. no vectorization has 1616 /// actually taken place). 1617 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1618 1619 /// Returns the expected execution cost. The unit of the cost does 1620 /// not matter because we use the 'cost' units to compare different 1621 /// vector widths. The cost that is returned is *not* normalized by 1622 /// the factor width. 1623 VectorizationCostTy expectedCost(ElementCount VF); 1624 1625 /// Returns the execution time cost of an instruction for a given vector 1626 /// width. Vector width of one means scalar. 1627 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1628 1629 /// The cost-computation logic from getInstructionCost which provides 1630 /// the vector type as an output parameter. 1631 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1632 Type *&VectorTy); 1633 1634 /// Return the cost of instructions in an inloop reduction pattern, if I is 1635 /// part of that pattern. 1636 InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF, 1637 Type *VectorTy, 1638 TTI::TargetCostKind CostKind); 1639 1640 /// Calculate vectorization cost of memory instruction \p I. 1641 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1642 1643 /// The cost computation for scalarized memory instruction. 1644 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1645 1646 /// The cost computation for interleaving group of memory instructions. 1647 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1648 1649 /// The cost computation for Gather/Scatter instruction. 1650 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1651 1652 /// The cost computation for widening instruction \p I with consecutive 1653 /// memory access. 1654 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1655 1656 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1657 /// Load: scalar load + broadcast. 1658 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1659 /// element) 1660 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1661 1662 /// Estimate the overhead of scalarizing an instruction. This is a 1663 /// convenience wrapper for the type-based getScalarizationOverhead API. 
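  /// For example (illustrative, fixed VF = 4): scalarizing a load whose
  /// address is computed as a vector requires four extractelement instructions
  /// for the per-lane addresses and four insertelement instructions to rebuild
  /// a vector from the scalar loads; that extract/insert traffic is the
  /// overhead estimated here.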
1664   InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF);
1665
1666   /// Returns whether the instruction is a load or store and will be emitted
1667   /// as a vector operation.
1668   bool isConsecutiveLoadOrStore(Instruction *I);
1669
1670   /// Returns true if an artificially high cost for emulated masked memrefs
1671   /// should be used.
1672   bool useEmulatedMaskMemRefHack(Instruction *I);
1673
1674   /// Map of scalar integer values to the smallest bitwidth they can be legally
1675   /// represented as. The vector equivalents of these values should be truncated
1676   /// to this type.
1677   MapVector<Instruction *, uint64_t> MinBWs;
1678
1679   /// A type representing the costs for instructions if they were to be
1680   /// scalarized rather than vectorized. The entries are Instruction-Cost
1681   /// pairs.
1682   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1683
1684   /// A set containing all BasicBlocks that are known to be present after
1685   /// vectorization as a predicated block.
1686   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1687
1688   /// Records whether it is allowed to have the original scalar loop execute at
1689   /// least once. This may be needed as a fallback loop in case runtime
1690   /// aliasing/dependence checks fail, or to handle the tail/remainder
1691   /// iterations when the trip count is unknown or doesn't divide by the VF,
1692   /// or as a peel-loop to handle gaps in interleave-groups.
1693   /// Under optsize and when the trip count is very small we don't allow any
1694   /// iterations to execute in the scalar loop.
1695   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1696
1697   /// All blocks of the loop are to be masked to fold the tail of scalar iterations.
1698   bool FoldTailByMasking = false;
1699
1700   /// A map holding scalar costs for different vectorization factors. The
1701   /// presence of a cost for an instruction in the mapping indicates that the
1702   /// instruction will be scalarized when vectorizing with the associated
1703   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1704   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1705
1706   /// Holds the instructions known to be uniform after vectorization.
1707   /// The data is collected per VF.
1708   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1709
1710   /// Holds the instructions known to be scalar after vectorization.
1711   /// The data is collected per VF.
1712   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1713
1714   /// Holds the instructions (address computations) that are forced to be
1715   /// scalarized.
1716   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1717
1718   /// PHINodes of the reductions that should be expanded in-loop along with
1719   /// their associated chains of reduction operations, in program order from top
1720   /// (PHI) to bottom.
1721   ReductionChainMap InLoopReductionChains;
1722
1723   /// A map of inloop reduction operations and their immediate chain operand.
1724   /// FIXME: This can be removed once reductions can be costed correctly in
1725   /// vplan. This was added to allow quick lookup of the inloop operations,
1726   /// without having to loop through InLoopReductionChains.
1727   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1728
1729   /// Returns the expected difference in cost from scalarizing the expression
1730   /// feeding a predicated instruction \p PredInst.
The instructions to 1731 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1732 /// non-negative return value implies the expression will be scalarized. 1733 /// Currently, only single-use chains are considered for scalarization. 1734 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1735 ElementCount VF); 1736 1737 /// Collect the instructions that are uniform after vectorization. An 1738 /// instruction is uniform if we represent it with a single scalar value in 1739 /// the vectorized loop corresponding to each vector iteration. Examples of 1740 /// uniform instructions include pointer operands of consecutive or 1741 /// interleaved memory accesses. Note that although uniformity implies an 1742 /// instruction will be scalar, the reverse is not true. In general, a 1743 /// scalarized instruction will be represented by VF scalar values in the 1744 /// vectorized loop, each corresponding to an iteration of the original 1745 /// scalar loop. 1746 void collectLoopUniforms(ElementCount VF); 1747 1748 /// Collect the instructions that are scalar after vectorization. An 1749 /// instruction is scalar if it is known to be uniform or will be scalarized 1750 /// during vectorization. Non-uniform scalarized instructions will be 1751 /// represented by VF values in the vectorized loop, each corresponding to an 1752 /// iteration of the original scalar loop. 1753 void collectLoopScalars(ElementCount VF); 1754 1755 /// Keeps cost model vectorization decision and cost for instructions. 1756 /// Right now it is used for memory instructions only. 1757 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1758 std::pair<InstWidening, InstructionCost>>; 1759 1760 DecisionList WideningDecisions; 1761 1762 /// Returns true if \p V is expected to be vectorized and it needs to be 1763 /// extracted. 1764 bool needsExtract(Value *V, ElementCount VF) const { 1765 Instruction *I = dyn_cast<Instruction>(V); 1766 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1767 TheLoop->isLoopInvariant(I)) 1768 return false; 1769 1770 // Assume we can vectorize V (and hence we need extraction) if the 1771 // scalars are not computed yet. This can happen, because it is called 1772 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1773 // the scalars are collected. That should be a safe assumption in most 1774 // cases, because we check if the operands have vectorizable types 1775 // beforehand in LoopVectorizationLegality. 1776 return Scalars.find(VF) == Scalars.end() || 1777 !isScalarAfterVectorization(I, VF); 1778 }; 1779 1780 /// Returns a range containing only operands needing to be extracted. 1781 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1782 ElementCount VF) { 1783 return SmallVector<Value *, 4>(make_filter_range( 1784 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1785 } 1786 1787 /// Determines if we have the infrastructure to vectorize loop \p L and its 1788 /// epilogue, assuming the main loop is vectorized by \p VF. 1789 bool isCandidateForEpilogueVectorization(const Loop &L, 1790 const ElementCount VF) const; 1791 1792 /// Returns true if epilogue vectorization is considered profitable, and 1793 /// false otherwise. 1794 /// \p VF is the vectorization factor chosen for the original loop. 1795 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1796 1797 public: 1798 /// The loop that we evaluate. 
1799   Loop *TheLoop;
1800
1801   /// Predicated scalar evolution analysis.
1802   PredicatedScalarEvolution &PSE;
1803
1804   /// Loop Info analysis.
1805   LoopInfo *LI;
1806
1807   /// Vectorization legality.
1808   LoopVectorizationLegality *Legal;
1809
1810   /// Vector target information.
1811   const TargetTransformInfo &TTI;
1812
1813   /// Target Library Info.
1814   const TargetLibraryInfo *TLI;
1815
1816   /// Demanded bits analysis.
1817   DemandedBits *DB;
1818
1819   /// Assumption cache.
1820   AssumptionCache *AC;
1821
1822   /// Interface to emit optimization remarks.
1823   OptimizationRemarkEmitter *ORE;
1824
1825   const Function *TheFunction;
1826
1827   /// Loop Vectorize Hint.
1828   const LoopVectorizeHints *Hints;
1829
1830   /// The interleave access information contains groups of interleaved accesses
1831   /// with the same stride that are close to each other.
1832   InterleavedAccessInfo &InterleaveInfo;
1833
1834   /// Values to ignore in the cost model.
1835   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1836
1837   /// Values to ignore in the cost model when VF > 1.
1838   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1839
1840   /// Profitable vector factors.
1841   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1842 };
1843 } // end namespace llvm
1844
1845 /// Helper struct to manage generating runtime checks for vectorization.
1846 ///
1847 /// The runtime checks are created up-front in temporary blocks to allow better
1848 /// estimation of their cost, and are un-linked from the existing IR. After
1849 /// deciding to vectorize, the checks are moved back. If deciding not to
1850 /// vectorize, the temporary blocks are completely removed.
1851 class GeneratedRTChecks {
1852   /// Basic block which contains the generated SCEV checks, if any.
1853   BasicBlock *SCEVCheckBlock = nullptr;
1854
1855   /// The value representing the result of the generated SCEV checks. If it is
1856   /// nullptr, either no SCEV checks have been generated or they have been used.
1857   Value *SCEVCheckCond = nullptr;
1858
1859   /// Basic block which contains the generated memory runtime checks, if any.
1860   BasicBlock *MemCheckBlock = nullptr;
1861
1862   /// The value representing the result of the generated memory runtime checks.
1863   /// If it is nullptr, either no memory runtime checks have been generated or
1864   /// they have been used.
1865   Instruction *MemRuntimeCheckCond = nullptr;
1866
1867   DominatorTree *DT;
1868   LoopInfo *LI;
1869
1870   SCEVExpander SCEVExp;
1871   SCEVExpander MemCheckExp;
1872
1873 public:
1874   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1875                     const DataLayout &DL)
1876       : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1877         MemCheckExp(SE, DL, "scev.check") {}
1878
1879   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1880   /// accurately estimate the cost of the runtime checks. The blocks are
1881   /// un-linked from the IR and are added back during vector code generation.
1882   /// If there is no vector code generation, the check blocks are removed
1883   /// completely.
1884   void Create(Loop *L, const LoopAccessInfo &LAI,
1885               const SCEVUnionPredicate &UnionPred) {
1886
1887     BasicBlock *LoopHeader = L->getHeader();
1888     BasicBlock *Preheader = L->getLoopPreheader();
1889
1890     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1891     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1892     // may be used by SCEVExpander. The blocks will be un-linked from their
1893     // predecessors and removed from LI & DT at the end of the function.
1894 if (!UnionPred.isAlwaysTrue()) { 1895 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1896 nullptr, "vector.scevcheck"); 1897 1898 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1899 &UnionPred, SCEVCheckBlock->getTerminator()); 1900 } 1901 1902 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1903 if (RtPtrChecking.Need) { 1904 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1905 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1906 "vector.memcheck"); 1907 1908 std::tie(std::ignore, MemRuntimeCheckCond) = 1909 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1910 RtPtrChecking.getChecks(), MemCheckExp); 1911 assert(MemRuntimeCheckCond && 1912 "no RT checks generated although RtPtrChecking " 1913 "claimed checks are required"); 1914 } 1915 1916 if (!MemCheckBlock && !SCEVCheckBlock) 1917 return; 1918 1919 // Unhook the temporary block with the checks, update various places 1920 // accordingly. 1921 if (SCEVCheckBlock) 1922 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1923 if (MemCheckBlock) 1924 MemCheckBlock->replaceAllUsesWith(Preheader); 1925 1926 if (SCEVCheckBlock) { 1927 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1928 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1929 Preheader->getTerminator()->eraseFromParent(); 1930 } 1931 if (MemCheckBlock) { 1932 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1933 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1934 Preheader->getTerminator()->eraseFromParent(); 1935 } 1936 1937 DT->changeImmediateDominator(LoopHeader, Preheader); 1938 if (MemCheckBlock) { 1939 DT->eraseNode(MemCheckBlock); 1940 LI->removeBlock(MemCheckBlock); 1941 } 1942 if (SCEVCheckBlock) { 1943 DT->eraseNode(SCEVCheckBlock); 1944 LI->removeBlock(SCEVCheckBlock); 1945 } 1946 } 1947 1948 /// Remove the created SCEV & memory runtime check blocks & instructions, if 1949 /// unused. 1950 ~GeneratedRTChecks() { 1951 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 1952 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 1953 if (!SCEVCheckCond) 1954 SCEVCleaner.markResultUsed(); 1955 1956 if (!MemRuntimeCheckCond) 1957 MemCheckCleaner.markResultUsed(); 1958 1959 if (MemRuntimeCheckCond) { 1960 auto &SE = *MemCheckExp.getSE(); 1961 // Memory runtime check generation creates compares that use expanded 1962 // values. Remove them before running the SCEVExpanderCleaners. 1963 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 1964 if (MemCheckExp.isInsertedInstruction(&I)) 1965 continue; 1966 SE.forgetValue(&I); 1967 SE.eraseValueFromMap(&I); 1968 I.eraseFromParent(); 1969 } 1970 } 1971 MemCheckCleaner.cleanup(); 1972 SCEVCleaner.cleanup(); 1973 1974 if (SCEVCheckCond) 1975 SCEVCheckBlock->eraseFromParent(); 1976 if (MemRuntimeCheckCond) 1977 MemCheckBlock->eraseFromParent(); 1978 } 1979 1980 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 1981 /// adjusts the branches to branch to the vector preheader or \p Bypass, 1982 /// depending on the generated condition. 
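  /// A rough sketch of the resulting control flow (illustrative only):
  ///
  ///   Pred
  ///     |
  ///   SCEVCheckBlock --(checks fail)--> Bypass (scalar loop)
  ///     |
  ///   (checks pass)
  ///     |
  ///   LoopVectorPreHeader --> vector loop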
1983 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 1984 BasicBlock *LoopVectorPreHeader, 1985 BasicBlock *LoopExitBlock) { 1986 if (!SCEVCheckCond) 1987 return nullptr; 1988 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 1989 if (C->isZero()) 1990 return nullptr; 1991 1992 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 1993 1994 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 1995 // Create new preheader for vector loop. 1996 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 1997 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 1998 1999 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2000 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2001 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2002 SCEVCheckBlock); 2003 2004 DT->addNewBlock(SCEVCheckBlock, Pred); 2005 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2006 2007 ReplaceInstWithInst( 2008 SCEVCheckBlock->getTerminator(), 2009 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2010 // Mark the check as used, to prevent it from being removed during cleanup. 2011 SCEVCheckCond = nullptr; 2012 return SCEVCheckBlock; 2013 } 2014 2015 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2016 /// the branches to branch to the vector preheader or \p Bypass, depending on 2017 /// the generated condition. 2018 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2019 BasicBlock *LoopVectorPreHeader) { 2020 // Check if we generated code that checks in runtime if arrays overlap. 2021 if (!MemRuntimeCheckCond) 2022 return nullptr; 2023 2024 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2025 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2026 MemCheckBlock); 2027 2028 DT->addNewBlock(MemCheckBlock, Pred); 2029 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2030 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2031 2032 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2033 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2034 2035 ReplaceInstWithInst( 2036 MemCheckBlock->getTerminator(), 2037 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2038 MemCheckBlock->getTerminator()->setDebugLoc( 2039 Pred->getTerminator()->getDebugLoc()); 2040 2041 // Mark the check as used, to prevent it from being removed during cleanup. 2042 MemRuntimeCheckCond = nullptr; 2043 return MemCheckBlock; 2044 } 2045 }; 2046 2047 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2048 // vectorization. The loop needs to be annotated with #pragma omp simd 2049 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2050 // vector length information is not provided, vectorization is not considered 2051 // explicit. Interleave hints are not allowed either. These limitations will be 2052 // relaxed in the future. 2053 // Please, note that we are currently forced to abuse the pragma 'clang 2054 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2055 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2056 // provides *explicit vectorization hints* (LV can bypass legal checks and 2057 // assume that vectorization is legal). However, both hints are implemented 2058 // using the same metadata (llvm.loop.vectorize, processed by 2059 // LoopVectorizeHints). This will be fixed in the future when the native IR 2060 // representation for pragma 'omp simd' is introduced. 
2061 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2062 OptimizationRemarkEmitter *ORE) { 2063 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2064 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2065 2066 // Only outer loops with an explicit vectorization hint are supported. 2067 // Unannotated outer loops are ignored. 2068 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2069 return false; 2070 2071 Function *Fn = OuterLp->getHeader()->getParent(); 2072 if (!Hints.allowVectorization(Fn, OuterLp, 2073 true /*VectorizeOnlyWhenForced*/)) { 2074 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2075 return false; 2076 } 2077 2078 if (Hints.getInterleave() > 1) { 2079 // TODO: Interleave support is future work. 2080 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2081 "outer loops.\n"); 2082 Hints.emitRemarkWithHints(); 2083 return false; 2084 } 2085 2086 return true; 2087 } 2088 2089 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2090 OptimizationRemarkEmitter *ORE, 2091 SmallVectorImpl<Loop *> &V) { 2092 // Collect inner loops and outer loops without irreducible control flow. For 2093 // now, only collect outer loops that have explicit vectorization hints. If we 2094 // are stress testing the VPlan H-CFG construction, we collect the outermost 2095 // loop of every loop nest. 2096 if (L.isInnermost() || VPlanBuildStressTest || 2097 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2098 LoopBlocksRPO RPOT(&L); 2099 RPOT.perform(LI); 2100 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2101 V.push_back(&L); 2102 // TODO: Collect inner loops inside marked outer loops in case 2103 // vectorization fails for the outer loop. Do not invoke 2104 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2105 // already known to be reducible. We can use an inherited attribute for 2106 // that. 2107 return; 2108 } 2109 } 2110 for (Loop *InnerL : L) 2111 collectSupportedLoops(*InnerL, LI, ORE, V); 2112 } 2113 2114 namespace { 2115 2116 /// The LoopVectorize Pass. 2117 struct LoopVectorize : public FunctionPass { 2118 /// Pass identification, replacement for typeid 2119 static char ID; 2120 2121 LoopVectorizePass Impl; 2122 2123 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2124 bool VectorizeOnlyWhenForced = false) 2125 : FunctionPass(ID), 2126 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2127 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2128 } 2129 2130 bool runOnFunction(Function &F) override { 2131 if (skipFunction(F)) 2132 return false; 2133 2134 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2135 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2136 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2137 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2138 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2139 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2140 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2141 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2142 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2143 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2144 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2145 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2146 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2147 2148 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2149 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2150 2151 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2152 GetLAA, *ORE, PSI).MadeAnyChange; 2153 } 2154 2155 void getAnalysisUsage(AnalysisUsage &AU) const override { 2156 AU.addRequired<AssumptionCacheTracker>(); 2157 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2158 AU.addRequired<DominatorTreeWrapperPass>(); 2159 AU.addRequired<LoopInfoWrapperPass>(); 2160 AU.addRequired<ScalarEvolutionWrapperPass>(); 2161 AU.addRequired<TargetTransformInfoWrapperPass>(); 2162 AU.addRequired<AAResultsWrapperPass>(); 2163 AU.addRequired<LoopAccessLegacyAnalysis>(); 2164 AU.addRequired<DemandedBitsWrapperPass>(); 2165 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2166 AU.addRequired<InjectTLIMappingsLegacy>(); 2167 2168 // We currently do not preserve loopinfo/dominator analyses with outer loop 2169 // vectorization. Until this is addressed, mark these analyses as preserved 2170 // only for non-VPlan-native path. 2171 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2172 if (!EnableVPlanNativePath) { 2173 AU.addPreserved<LoopInfoWrapperPass>(); 2174 AU.addPreserved<DominatorTreeWrapperPass>(); 2175 } 2176 2177 AU.addPreserved<BasicAAWrapperPass>(); 2178 AU.addPreserved<GlobalsAAWrapperPass>(); 2179 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2180 } 2181 }; 2182 2183 } // end anonymous namespace 2184 2185 //===----------------------------------------------------------------------===// 2186 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2187 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2188 //===----------------------------------------------------------------------===// 2189 2190 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2191 // We need to place the broadcast of invariant variables outside the loop, 2192 // but only if it's proven safe to do so. Else, broadcast will be inside 2193 // vector loop body. 2194 Instruction *Instr = dyn_cast<Instruction>(V); 2195 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2196 (!Instr || 2197 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2198 // Place the code for broadcasting invariant variables in the new preheader. 2199 IRBuilder<>::InsertPointGuard Guard(Builder); 2200 if (SafeToHoist) 2201 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2202 2203 // Broadcast the scalar into all locations in the vector. 
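  // For a fixed VF of 4 this expands to IR roughly of the form (illustrative):
  //   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %v, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                                    <4 x i32> poison, zeroinitializer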
2204 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2205 2206 return Shuf; 2207 } 2208 2209 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2210 const InductionDescriptor &II, Value *Step, Value *Start, 2211 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2212 VPTransformState &State) { 2213 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2214 "Expected either an induction phi-node or a truncate of it!"); 2215 2216 // Construct the initial value of the vector IV in the vector loop preheader 2217 auto CurrIP = Builder.saveIP(); 2218 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2219 if (isa<TruncInst>(EntryVal)) { 2220 assert(Start->getType()->isIntegerTy() && 2221 "Truncation requires an integer type"); 2222 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2223 Step = Builder.CreateTrunc(Step, TruncType); 2224 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2225 } 2226 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2227 Value *SteppedStart = 2228 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2229 2230 // We create vector phi nodes for both integer and floating-point induction 2231 // variables. Here, we determine the kind of arithmetic we will perform. 2232 Instruction::BinaryOps AddOp; 2233 Instruction::BinaryOps MulOp; 2234 if (Step->getType()->isIntegerTy()) { 2235 AddOp = Instruction::Add; 2236 MulOp = Instruction::Mul; 2237 } else { 2238 AddOp = II.getInductionOpcode(); 2239 MulOp = Instruction::FMul; 2240 } 2241 2242 // Multiply the vectorization factor by the step using integer or 2243 // floating-point arithmetic as appropriate. 2244 Value *ConstVF = 2245 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 2246 Value *Mul = Builder.CreateBinOp(MulOp, Step, ConstVF); 2247 2248 // Create a vector splat to use in the induction update. 2249 // 2250 // FIXME: If the step is non-constant, we create the vector splat with 2251 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2252 // handle a constant vector splat. 2253 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2254 Value *SplatVF = isa<Constant>(Mul) 2255 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2256 : Builder.CreateVectorSplat(VF, Mul); 2257 Builder.restoreIP(CurrIP); 2258 2259 // We may need to add the step a number of times, depending on the unroll 2260 // factor. The last of those goes into the PHI. 2261 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2262 &*LoopVectorBody->getFirstInsertionPt()); 2263 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2264 Instruction *LastInduction = VecInd; 2265 for (unsigned Part = 0; Part < UF; ++Part) { 2266 State.set(Def, LastInduction, Part); 2267 2268 if (isa<TruncInst>(EntryVal)) 2269 addMetadata(LastInduction, EntryVal); 2270 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2271 State, Part); 2272 2273 LastInduction = cast<Instruction>( 2274 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2275 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2276 } 2277 2278 // Move the last step to the end of the latch block. This ensures consistent 2279 // placement of all induction updates. 
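  // For example (illustrative, VF = 4, UF = 2, integer step 1) the emitted
  // chain is roughly:
  //   %vec.ind      = phi <4 x i32> [ <0,1,2,3>, %preheader ],
  //                                 [ %vec.ind.next, %latch ]   ; part 0
  //   %step.add     = add <4 x i32> %vec.ind,  <4,4,4,4>        ; part 1
  //   %vec.ind.next = add <4 x i32> %step.add, <4,4,4,4>        ; last step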
2280 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2281 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2282 auto *ICmp = cast<Instruction>(Br->getCondition()); 2283 LastInduction->moveBefore(ICmp); 2284 LastInduction->setName("vec.ind.next"); 2285 2286 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2287 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2288 } 2289 2290 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2291 return Cost->isScalarAfterVectorization(I, VF) || 2292 Cost->isProfitableToScalarize(I, VF); 2293 } 2294 2295 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2296 if (shouldScalarizeInstruction(IV)) 2297 return true; 2298 auto isScalarInst = [&](User *U) -> bool { 2299 auto *I = cast<Instruction>(U); 2300 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2301 }; 2302 return llvm::any_of(IV->users(), isScalarInst); 2303 } 2304 2305 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2306 const InductionDescriptor &ID, const Instruction *EntryVal, 2307 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2308 unsigned Part, unsigned Lane) { 2309 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2310 "Expected either an induction phi-node or a truncate of it!"); 2311 2312 // This induction variable is not the phi from the original loop but the 2313 // newly-created IV based on the proof that casted Phi is equal to the 2314 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2315 // re-uses the same InductionDescriptor that original IV uses but we don't 2316 // have to do any recording in this case - that is done when original IV is 2317 // processed. 2318 if (isa<TruncInst>(EntryVal)) 2319 return; 2320 2321 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2322 if (Casts.empty()) 2323 return; 2324 // Only the first Cast instruction in the Casts vector is of interest. 2325 // The rest of the Casts (if exist) have no uses outside the 2326 // induction update chain itself. 2327 if (Lane < UINT_MAX) 2328 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2329 else 2330 State.set(CastDef, VectorLoopVal, Part); 2331 } 2332 2333 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2334 TruncInst *Trunc, VPValue *Def, 2335 VPValue *CastDef, 2336 VPTransformState &State) { 2337 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2338 "Primary induction variable must have an integer type"); 2339 2340 auto II = Legal->getInductionVars().find(IV); 2341 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2342 2343 auto ID = II->second; 2344 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2345 2346 // The value from the original loop to which we are mapping the new induction 2347 // variable. 2348 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2349 2350 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2351 2352 // Generate code for the induction step. 
Note that induction steps are 2353 // required to be loop-invariant 2354 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2355 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2356 "Induction step should be loop invariant"); 2357 if (PSE.getSE()->isSCEVable(IV->getType())) { 2358 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2359 return Exp.expandCodeFor(Step, Step->getType(), 2360 LoopVectorPreHeader->getTerminator()); 2361 } 2362 return cast<SCEVUnknown>(Step)->getValue(); 2363 }; 2364 2365 // The scalar value to broadcast. This is derived from the canonical 2366 // induction variable. If a truncation type is given, truncate the canonical 2367 // induction variable and step. Otherwise, derive these values from the 2368 // induction descriptor. 2369 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2370 Value *ScalarIV = Induction; 2371 if (IV != OldInduction) { 2372 ScalarIV = IV->getType()->isIntegerTy() 2373 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2374 : Builder.CreateCast(Instruction::SIToFP, Induction, 2375 IV->getType()); 2376 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2377 ScalarIV->setName("offset.idx"); 2378 } 2379 if (Trunc) { 2380 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2381 assert(Step->getType()->isIntegerTy() && 2382 "Truncation requires an integer step"); 2383 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2384 Step = Builder.CreateTrunc(Step, TruncType); 2385 } 2386 return ScalarIV; 2387 }; 2388 2389 // Create the vector values from the scalar IV, in the absence of creating a 2390 // vector IV. 2391 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2392 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2393 for (unsigned Part = 0; Part < UF; ++Part) { 2394 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2395 Value *EntryPart = 2396 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2397 ID.getInductionOpcode()); 2398 State.set(Def, EntryPart, Part); 2399 if (Trunc) 2400 addMetadata(EntryPart, Trunc); 2401 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2402 State, Part); 2403 } 2404 }; 2405 2406 // Fast-math-flags propagate from the original induction instruction. 2407 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2408 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2409 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2410 2411 // Now do the actual transformations, and start with creating the step value. 2412 Value *Step = CreateStepValue(ID.getStep()); 2413 if (VF.isZero() || VF.isScalar()) { 2414 Value *ScalarIV = CreateScalarIV(Step); 2415 CreateSplatIV(ScalarIV, Step); 2416 return; 2417 } 2418 2419 // Determine if we want a scalar version of the induction variable. This is 2420 // true if the induction variable itself is not widened, or if it has at 2421 // least one user in the loop that is not widened. 2422 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2423 if (!NeedsScalarIV) { 2424 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2425 State); 2426 return; 2427 } 2428 2429 // Try to create a new independent vector induction variable. If we can't 2430 // create the phi node, we will splat the scalar induction variable in each 2431 // loop iteration. 
2432 if (!shouldScalarizeInstruction(EntryVal)) { 2433 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2434 State); 2435 Value *ScalarIV = CreateScalarIV(Step); 2436 // Create scalar steps that can be used by instructions we will later 2437 // scalarize. Note that the addition of the scalar steps will not increase 2438 // the number of instructions in the loop in the common case prior to 2439 // InstCombine. We will be trading one vector extract for each scalar step. 2440 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2441 return; 2442 } 2443 2444 // All IV users are scalar instructions, so only emit a scalar IV, not a 2445 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2446 // predicate used by the masked loads/stores. 2447 Value *ScalarIV = CreateScalarIV(Step); 2448 if (!Cost->isScalarEpilogueAllowed()) 2449 CreateSplatIV(ScalarIV, Step); 2450 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2451 } 2452 2453 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2454 Instruction::BinaryOps BinOp) { 2455 // Create and check the types. 2456 assert(isa<FixedVectorType>(Val->getType()) && 2457 "Creation of scalable step vector not yet supported"); 2458 auto *ValVTy = cast<VectorType>(Val->getType()); 2459 ElementCount VLen = ValVTy->getElementCount(); 2460 2461 Type *STy = Val->getType()->getScalarType(); 2462 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2463 "Induction Step must be an integer or FP"); 2464 assert(Step->getType() == STy && "Step has wrong type"); 2465 2466 SmallVector<Constant *, 8> Indices; 2467 2468 // Create a vector of consecutive numbers from zero to VF. 2469 VectorType *InitVecValVTy = ValVTy; 2470 Type *InitVecValSTy = STy; 2471 if (STy->isFloatingPointTy()) { 2472 InitVecValSTy = 2473 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2474 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2475 } 2476 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2477 2478 // Add on StartIdx 2479 Value *StartIdxSplat = Builder.CreateVectorSplat( 2480 VLen, ConstantInt::get(InitVecValSTy, StartIdx)); 2481 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2482 2483 if (STy->isIntegerTy()) { 2484 Step = Builder.CreateVectorSplat(VLen, Step); 2485 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2486 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2487 // which can be found from the original scalar operations. 2488 Step = Builder.CreateMul(InitVec, Step); 2489 return Builder.CreateAdd(Val, Step, "induction"); 2490 } 2491 2492 // Floating point induction. 2493 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2494 "Binary Opcode should be specified for FP induction"); 2495 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2496 Step = Builder.CreateVectorSplat(VLen, Step); 2497 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2498 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2499 } 2500 2501 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2502 Instruction *EntryVal, 2503 const InductionDescriptor &ID, 2504 VPValue *Def, VPValue *CastDef, 2505 VPTransformState &State) { 2506 // We shouldn't have to build scalar steps if we aren't vectorizing. 2507 assert(VF.isVector() && "VF should be greater than one"); 2508 // Get the value type and ensure it and the step have the same integer type. 
2509 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2510 assert(ScalarIVTy == Step->getType() && 2511 "Val and Step should have the same type"); 2512 2513 // We build scalar steps for both integer and floating-point induction 2514 // variables. Here, we determine the kind of arithmetic we will perform. 2515 Instruction::BinaryOps AddOp; 2516 Instruction::BinaryOps MulOp; 2517 if (ScalarIVTy->isIntegerTy()) { 2518 AddOp = Instruction::Add; 2519 MulOp = Instruction::Mul; 2520 } else { 2521 AddOp = ID.getInductionOpcode(); 2522 MulOp = Instruction::FMul; 2523 } 2524 2525 // Determine the number of scalars we need to generate for each unroll 2526 // iteration. If EntryVal is uniform, we only need to generate the first 2527 // lane. Otherwise, we generate all VF values. 2528 unsigned Lanes = 2529 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2530 ? 1 2531 : VF.getKnownMinValue(); 2532 assert((!VF.isScalable() || Lanes == 1) && 2533 "Should never scalarize a scalable vector"); 2534 // Compute the scalar steps and save the results in State. 2535 for (unsigned Part = 0; Part < UF; ++Part) { 2536 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2537 auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2538 ScalarIVTy->getScalarSizeInBits()); 2539 Value *StartIdx = 2540 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2541 if (ScalarIVTy->isFloatingPointTy()) 2542 StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); 2543 StartIdx = Builder.CreateBinOp( 2544 AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2545 // The step returned by `createStepForVF` is a runtime-evaluated value 2546 // when VF is scalable. Otherwise, it should be folded into a Constant. 2547 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2548 "Expected StartIdx to be folded to a constant when VF is not " 2549 "scalable"); 2550 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2551 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2552 State.set(Def, Add, VPIteration(Part, Lane)); 2553 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2554 Part, Lane); 2555 } 2556 } 2557 } 2558 2559 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2560 const VPIteration &Instance, 2561 VPTransformState &State) { 2562 Value *ScalarInst = State.get(Def, Instance); 2563 Value *VectorValue = State.get(Def, Instance.Part); 2564 VectorValue = Builder.CreateInsertElement( 2565 VectorValue, ScalarInst, 2566 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2567 State.set(Def, VectorValue, Instance.Part); 2568 } 2569 2570 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2571 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2572 return Builder.CreateVectorReverse(Vec, "reverse"); 2573 } 2574 2575 // Return whether we allow using masked interleave-groups (for dealing with 2576 // strided loads/stores that reside in predicated blocks, or for dealing 2577 // with gaps). 2578 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2579 // If an override option has been passed in for interleaved accesses, use it. 2580 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2581 return EnableMaskedInterleavedMemAccesses; 2582 2583 return TTI.enableMaskedInterleavedAccessVectorization(); 2584 } 2585 2586 // Try to vectorize the interleave group that \p Instr belongs to. 2587 // 2588 // E.g. 
Translate following interleaved load group (factor = 3): 2589 // for (i = 0; i < N; i+=3) { 2590 // R = Pic[i]; // Member of index 0 2591 // G = Pic[i+1]; // Member of index 1 2592 // B = Pic[i+2]; // Member of index 2 2593 // ... // do something to R, G, B 2594 // } 2595 // To: 2596 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2597 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2598 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2599 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2600 // 2601 // Or translate following interleaved store group (factor = 3): 2602 // for (i = 0; i < N; i+=3) { 2603 // ... do something to R, G, B 2604 // Pic[i] = R; // Member of index 0 2605 // Pic[i+1] = G; // Member of index 1 2606 // Pic[i+2] = B; // Member of index 2 2607 // } 2608 // To: 2609 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2610 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2611 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2612 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2613 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2614 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2615 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2616 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2617 VPValue *BlockInMask) { 2618 Instruction *Instr = Group->getInsertPos(); 2619 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2620 2621 // Prepare for the vector type of the interleaved load/store. 2622 Type *ScalarTy = getMemInstValueType(Instr); 2623 unsigned InterleaveFactor = Group->getFactor(); 2624 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2625 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2626 2627 // Prepare for the new pointers. 2628 SmallVector<Value *, 2> AddrParts; 2629 unsigned Index = Group->getIndex(Instr); 2630 2631 // TODO: extend the masked interleaved-group support to reversed access. 2632 assert((!BlockInMask || !Group->isReverse()) && 2633 "Reversed masked interleave-group not supported."); 2634 2635 // If the group is reverse, adjust the index to refer to the last vector lane 2636 // instead of the first. We adjust the index from the first vector lane, 2637 // rather than directly getting the pointer for lane VF - 1, because the 2638 // pointer operand of the interleaved access is supposed to be uniform. For 2639 // uniform instructions, we're only required to generate a value for the 2640 // first vector lane in each unroll iteration. 2641 assert(!VF.isScalable() && 2642 "scalable vector reverse operation is not implemented"); 2643 if (Group->isReverse()) 2644 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2645 2646 for (unsigned Part = 0; Part < UF; Part++) { 2647 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2648 setDebugLocFromInst(Builder, AddrPart); 2649 2650 // Notice current instruction could be any index. Need to adjust the address 2651 // to the member of index 0. 2652 // 2653 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2654 // b = A[i]; // Member of index 0 2655 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2656 // 2657 // E.g. A[i+1] = a; // Member of index 1 2658 // A[i] = b; // Member of index 0 2659 // A[i+2] = c; // Member of index 2 (Current instruction) 2660 // Current pointer is pointed to A[i+2], adjust it to A[i]. 
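    // (Illustrative) With Index == 2 the adjustment below is a GEP with a
    // negative offset, e.g.
    //   %adjusted = getelementptr i32, i32* %AddrPart, i32 -2
    // so the wide memory operation starts at the member of index 0.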
2661 2662 bool InBounds = false; 2663 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2664 InBounds = gep->isInBounds(); 2665 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2666 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2667 2668 // Cast to the vector pointer type. 2669 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2670 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2671 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2672 } 2673 2674 setDebugLocFromInst(Builder, Instr); 2675 Value *PoisonVec = PoisonValue::get(VecTy); 2676 2677 Value *MaskForGaps = nullptr; 2678 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2679 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2680 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2681 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2682 } 2683 2684 // Vectorize the interleaved load group. 2685 if (isa<LoadInst>(Instr)) { 2686 // For each unroll part, create a wide load for the group. 2687 SmallVector<Value *, 2> NewLoads; 2688 for (unsigned Part = 0; Part < UF; Part++) { 2689 Instruction *NewLoad; 2690 if (BlockInMask || MaskForGaps) { 2691 assert(useMaskedInterleavedAccesses(*TTI) && 2692 "masked interleaved groups are not allowed."); 2693 Value *GroupMask = MaskForGaps; 2694 if (BlockInMask) { 2695 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2696 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2697 Value *ShuffledMask = Builder.CreateShuffleVector( 2698 BlockInMaskPart, 2699 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2700 "interleaved.mask"); 2701 GroupMask = MaskForGaps 2702 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2703 MaskForGaps) 2704 : ShuffledMask; 2705 } 2706 NewLoad = 2707 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2708 GroupMask, PoisonVec, "wide.masked.vec"); 2709 } 2710 else 2711 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2712 Group->getAlign(), "wide.vec"); 2713 Group->addMetadata(NewLoad); 2714 NewLoads.push_back(NewLoad); 2715 } 2716 2717 // For each member in the group, shuffle out the appropriate data from the 2718 // wide loads. 2719 unsigned J = 0; 2720 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2721 Instruction *Member = Group->getMember(I); 2722 2723 // Skip the gaps in the group. 2724 if (!Member) 2725 continue; 2726 2727 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2728 auto StrideMask = 2729 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2730 for (unsigned Part = 0; Part < UF; Part++) { 2731 Value *StridedVec = Builder.CreateShuffleVector( 2732 NewLoads[Part], StrideMask, "strided.vec"); 2733 2734 // If this member has different type, cast the result type. 2735 if (Member->getType() != ScalarTy) { 2736 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2737 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2738 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2739 } 2740 2741 if (Group->isReverse()) 2742 StridedVec = reverseVector(StridedVec); 2743 2744 State.set(VPDefs[J], StridedVec, Part); 2745 } 2746 ++J; 2747 } 2748 return; 2749 } 2750 2751 // The sub vector type for current instruction. 
2752 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2753 auto *SubVT = VectorType::get(ScalarTy, VF); 2754 2755 // Vectorize the interleaved store group. 2756 for (unsigned Part = 0; Part < UF; Part++) { 2757 // Collect the stored vector from each member. 2758 SmallVector<Value *, 4> StoredVecs; 2759 for (unsigned i = 0; i < InterleaveFactor; i++) { 2760 // Interleaved store group doesn't allow a gap, so each index has a member 2761 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2762 2763 Value *StoredVec = State.get(StoredValues[i], Part); 2764 2765 if (Group->isReverse()) 2766 StoredVec = reverseVector(StoredVec); 2767 2768 // If this member has different type, cast it to a unified type. 2769 2770 if (StoredVec->getType() != SubVT) 2771 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2772 2773 StoredVecs.push_back(StoredVec); 2774 } 2775 2776 // Concatenate all vectors into a wide vector. 2777 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2778 2779 // Interleave the elements in the wide vector. 2780 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2781 Value *IVec = Builder.CreateShuffleVector( 2782 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2783 "interleaved.vec"); 2784 2785 Instruction *NewStoreInstr; 2786 if (BlockInMask) { 2787 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2788 Value *ShuffledMask = Builder.CreateShuffleVector( 2789 BlockInMaskPart, 2790 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2791 "interleaved.mask"); 2792 NewStoreInstr = Builder.CreateMaskedStore( 2793 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2794 } 2795 else 2796 NewStoreInstr = 2797 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2798 2799 Group->addMetadata(NewStoreInstr); 2800 } 2801 } 2802 2803 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2804 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2805 VPValue *StoredValue, VPValue *BlockInMask) { 2806 // Attempt to issue a wide load. 2807 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2808 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2809 2810 assert((LI || SI) && "Invalid Load/Store instruction"); 2811 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2812 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2813 2814 LoopVectorizationCostModel::InstWidening Decision = 2815 Cost->getWideningDecision(Instr, VF); 2816 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2817 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2818 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2819 "CM decision is not to widen the memory instruction"); 2820 2821 Type *ScalarDataTy = getMemInstValueType(Instr); 2822 2823 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2824 const Align Alignment = getLoadStoreAlignment(Instr); 2825 2826 // Determine if the pointer operand of the access is either consecutive or 2827 // reverse consecutive. 2828 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2829 bool ConsecutiveStride = 2830 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2831 bool CreateGatherScatter = 2832 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2833 2834 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2835 // gather/scatter. Otherwise Decision should have been to Scalarize. 
2836 assert((ConsecutiveStride || CreateGatherScatter) && 2837 "The instruction should be scalarized"); 2838 (void)ConsecutiveStride; 2839 2840 VectorParts BlockInMaskParts(UF); 2841 bool isMaskRequired = BlockInMask; 2842 if (isMaskRequired) 2843 for (unsigned Part = 0; Part < UF; ++Part) 2844 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2845 2846 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2847 // Calculate the pointer for the specific unroll-part. 2848 GetElementPtrInst *PartPtr = nullptr; 2849 2850 bool InBounds = false; 2851 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2852 InBounds = gep->isInBounds(); 2853 if (Reverse) { 2854 // If the address is consecutive but reversed, then the 2855 // wide store needs to start at the last vector element. 2856 // RunTimeVF = VScale * VF.getKnownMinValue() 2857 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2858 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2859 // NumElt = -Part * RunTimeVF 2860 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2861 // LastLane = 1 - RunTimeVF 2862 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2863 PartPtr = 2864 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2865 PartPtr->setIsInBounds(InBounds); 2866 PartPtr = cast<GetElementPtrInst>( 2867 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2868 PartPtr->setIsInBounds(InBounds); 2869 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2870 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2871 } else { 2872 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2873 PartPtr = cast<GetElementPtrInst>( 2874 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2875 PartPtr->setIsInBounds(InBounds); 2876 } 2877 2878 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2879 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2880 }; 2881 2882 // Handle Stores: 2883 if (SI) { 2884 setDebugLocFromInst(Builder, SI); 2885 2886 for (unsigned Part = 0; Part < UF; ++Part) { 2887 Instruction *NewSI = nullptr; 2888 Value *StoredVal = State.get(StoredValue, Part); 2889 if (CreateGatherScatter) { 2890 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2891 Value *VectorGep = State.get(Addr, Part); 2892 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2893 MaskPart); 2894 } else { 2895 if (Reverse) { 2896 // If we store to reverse consecutive memory locations, then we need 2897 // to reverse the order of elements in the stored value. 2898 StoredVal = reverseVector(StoredVal); 2899 // We don't want to update the value in the map as it might be used in 2900 // another expression. So don't call resetVectorValue(StoredVal). 2901 } 2902 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2903 if (isMaskRequired) 2904 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2905 BlockInMaskParts[Part]); 2906 else 2907 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2908 } 2909 addMetadata(NewSI, SI); 2910 } 2911 return; 2912 } 2913 2914 // Handle loads. 2915 assert(LI && "Must have a load instruction"); 2916 setDebugLocFromInst(Builder, LI); 2917 for (unsigned Part = 0; Part < UF; ++Part) { 2918 Value *NewLI; 2919 if (CreateGatherScatter) { 2920 Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; 2921 Value *VectorGep = State.get(Addr, Part); 2922 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2923 nullptr, "wide.masked.gather"); 2924 addMetadata(NewLI, LI); 2925 } else { 2926 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2927 if (isMaskRequired) 2928 NewLI = Builder.CreateMaskedLoad( 2929 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2930 "wide.masked.load"); 2931 else 2932 NewLI = 2933 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2934 2935 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2936 addMetadata(NewLI, LI); 2937 if (Reverse) 2938 NewLI = reverseVector(NewLI); 2939 } 2940 2941 State.set(Def, NewLI, Part); 2942 } 2943 } 2944 2945 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 2946 VPUser &User, 2947 const VPIteration &Instance, 2948 bool IfPredicateInstr, 2949 VPTransformState &State) { 2950 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2951 2952 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2953 // the first lane and part. 2954 if (isa<NoAliasScopeDeclInst>(Instr)) 2955 if (!Instance.isFirstIteration()) 2956 return; 2957 2958 setDebugLocFromInst(Builder, Instr); 2959 2960 // Does this instruction return a value ? 2961 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2962 2963 Instruction *Cloned = Instr->clone(); 2964 if (!IsVoidRetTy) 2965 Cloned->setName(Instr->getName() + ".cloned"); 2966 2967 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 2968 Builder.GetInsertPoint()); 2969 // Replace the operands of the cloned instructions with their scalar 2970 // equivalents in the new loop. 2971 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2972 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 2973 auto InputInstance = Instance; 2974 if (!Operand || !OrigLoop->contains(Operand) || 2975 (Cost->isUniformAfterVectorization(Operand, State.VF))) 2976 InputInstance.Lane = VPLane::getFirstLane(); 2977 auto *NewOp = State.get(User.getOperand(op), InputInstance); 2978 Cloned->setOperand(op, NewOp); 2979 } 2980 addNewMetadata(Cloned, Instr); 2981 2982 // Place the cloned scalar in the new loop. 2983 Builder.Insert(Cloned); 2984 2985 State.set(Def, Cloned, Instance); 2986 2987 // If we just cloned a new assumption, add it the assumption cache. 2988 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2989 if (II->getIntrinsicID() == Intrinsic::assume) 2990 AC->registerAssumption(II); 2991 2992 // End if-block. 2993 if (IfPredicateInstr) 2994 PredicatedInstructions.push_back(Cloned); 2995 } 2996 2997 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2998 Value *End, Value *Step, 2999 Instruction *DL) { 3000 BasicBlock *Header = L->getHeader(); 3001 BasicBlock *Latch = L->getLoopLatch(); 3002 // As we're just creating this loop, it's possible no latch exists 3003 // yet. If so, use the header as this will be a single block loop. 3004 if (!Latch) 3005 Latch = Header; 3006 3007 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 3008 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3009 setDebugLocFromInst(Builder, OldInst); 3010 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 3011 3012 Builder.SetInsertPoint(Latch->getTerminator()); 3013 setDebugLocFromInst(Builder, OldInst); 3014 3015 // Create i+1 and fill the PHINode. 
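  // The full skeleton produced by this helper is roughly (64-bit index type,
  // block names chosen for exposition):
  //   %index = phi i64 [ %start, %preheader ], [ %index.next, %latch ]
  //   ...
  //   %index.next = add i64 %index, %step
  //   %cmp = icmp eq i64 %index.next, %end
  //   br i1 %cmp, label %exit, label %header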
3016 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 3017 Induction->addIncoming(Start, L->getLoopPreheader()); 3018 Induction->addIncoming(Next, Latch); 3019 // Create the compare. 3020 Value *ICmp = Builder.CreateICmpEQ(Next, End); 3021 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3022 3023 // Now we have two terminators. Remove the old one from the block. 3024 Latch->getTerminator()->eraseFromParent(); 3025 3026 return Induction; 3027 } 3028 3029 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3030 if (TripCount) 3031 return TripCount; 3032 3033 assert(L && "Create Trip Count for null loop."); 3034 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3035 // Find the loop boundaries. 3036 ScalarEvolution *SE = PSE.getSE(); 3037 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3038 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3039 "Invalid loop count"); 3040 3041 Type *IdxTy = Legal->getWidestInductionType(); 3042 assert(IdxTy && "No type for induction"); 3043 3044 // The exit count might have the type of i64 while the phi is i32. This can 3045 // happen if we have an induction variable that is sign extended before the 3046 // compare. The only way that we get a backedge taken count is that the 3047 // induction variable was signed and as such will not overflow. In such a case 3048 // truncation is legal. 3049 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3050 IdxTy->getPrimitiveSizeInBits()) 3051 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3052 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3053 3054 // Get the total trip count from the count by adding 1. 3055 const SCEV *ExitCount = SE->getAddExpr( 3056 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3057 3058 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3059 3060 // Expand the trip count and place the new instructions in the preheader. 3061 // Notice that the pre-header does not change, only the loop body. 3062 SCEVExpander Exp(*SE, DL, "induction"); 3063 3064 // Count holds the overall loop count (N). 3065 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3066 L->getLoopPreheader()->getTerminator()); 3067 3068 if (TripCount->getType()->isPointerTy()) 3069 TripCount = 3070 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3071 L->getLoopPreheader()->getTerminator()); 3072 3073 return TripCount; 3074 } 3075 3076 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3077 if (VectorTripCount) 3078 return VectorTripCount; 3079 3080 Value *TC = getOrCreateTripCount(L); 3081 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3082 3083 Type *Ty = TC->getType(); 3084 // This is where we can make the step a runtime constant. 3085 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3086 3087 // If the tail is to be folded by masking, round the number of iterations N 3088 // up to a multiple of Step instead of rounding down. This is done by first 3089 // adding Step-1 and then rounding down. Note that it's ok if this addition 3090 // overflows: the vector induction variable will eventually wrap to zero given 3091 // that it starts at zero and its Step is a power of two; the loop will then 3092 // exit, with the last early-exit vector comparison also producing all-true. 
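  // Worked example: with a trip count of 10 and VF * UF = 4, folding the tail
  // rounds the count up to 10 + 3 = 13, the remainder computed below is
  // 13 % 4 = 1, and the vector trip count becomes 13 - 1 = 12. The vector loop
  // then covers all 10 iterations in 12 / 4 = 3 vector iterations, with the
  // final one partially masked.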
3093 if (Cost->foldTailByMasking()) { 3094 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3095 "VF*UF must be a power of 2 when folding tail by masking"); 3096 assert(!VF.isScalable() && 3097 "Tail folding not yet supported for scalable vectors"); 3098 TC = Builder.CreateAdd( 3099 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3100 } 3101 3102 // Now we need to generate the expression for the part of the loop that the 3103 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3104 // iterations are not required for correctness, or N - Step, otherwise. Step 3105 // is equal to the vectorization factor (number of SIMD elements) times the 3106 // unroll factor (number of SIMD instructions). 3107 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3108 3109 // There are two cases where we need to ensure (at least) the last iteration 3110 // runs in the scalar remainder loop. Thus, if the step evenly divides 3111 // the trip count, we set the remainder to be equal to the step. If the step 3112 // does not evenly divide the trip count, no adjustment is necessary since 3113 // there will already be scalar iterations. Note that the minimum iterations 3114 // check ensures that N >= Step. The cases are: 3115 // 1) If there is a non-reversed interleaved group that may speculatively 3116 // access memory out-of-bounds. 3117 // 2) If any instruction may follow a conditionally taken exit. That is, if 3118 // the loop contains multiple exiting blocks, or a single exiting block 3119 // which is not the latch. 3120 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3121 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3122 R = Builder.CreateSelect(IsZero, Step, R); 3123 } 3124 3125 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3126 3127 return VectorTripCount; 3128 } 3129 3130 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3131 const DataLayout &DL) { 3132 // Verify that V is a vector type with same number of elements as DstVTy. 3133 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3134 unsigned VF = DstFVTy->getNumElements(); 3135 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3136 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3137 Type *SrcElemTy = SrcVecTy->getElementType(); 3138 Type *DstElemTy = DstFVTy->getElementType(); 3139 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3140 "Vector elements must have same size"); 3141 3142 // Do a direct cast if element types are castable. 3143 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3144 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3145 } 3146 // V cannot be directly casted to desired vector type. 3147 // May happen when V is a floating point vector but DstVTy is a vector of 3148 // pointers or vice-versa. Handle this using a two-step bitcast using an 3149 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 
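  // E.g. (assuming 64-bit pointers in the given DataLayout):
  //   <4 x i8*>  ->  <4 x i64>  ->  <4 x double>
  // where both steps are element-size-preserving bit/pointer casts.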
3150 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3151 "Only one type should be a pointer type"); 3152 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3153 "Only one type should be a floating point type"); 3154 Type *IntTy = 3155 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3156 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3157 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3158 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3159 } 3160 3161 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3162 BasicBlock *Bypass) { 3163 Value *Count = getOrCreateTripCount(L); 3164 // Reuse existing vector loop preheader for TC checks. 3165 // Note that new preheader block is generated for vector loop. 3166 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3167 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3168 3169 // Generate code to check if the loop's trip count is less than VF * UF, or 3170 // equal to it in case a scalar epilogue is required; this implies that the 3171 // vector trip count is zero. This check also covers the case where adding one 3172 // to the backedge-taken count overflowed leading to an incorrect trip count 3173 // of zero. In this case we will also jump to the scalar loop. 3174 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3175 : ICmpInst::ICMP_ULT; 3176 3177 // If tail is to be folded, vector loop takes care of all iterations. 3178 Value *CheckMinIters = Builder.getFalse(); 3179 if (!Cost->foldTailByMasking()) { 3180 Value *Step = 3181 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3182 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3183 } 3184 // Create new preheader for vector loop. 3185 LoopVectorPreHeader = 3186 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3187 "vector.ph"); 3188 3189 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3190 DT->getNode(Bypass)->getIDom()) && 3191 "TC check is expected to dominate Bypass"); 3192 3193 // Update dominator for Bypass & LoopExit. 3194 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3195 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3196 3197 ReplaceInstWithInst( 3198 TCCheckBlock->getTerminator(), 3199 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3200 LoopBypassBlocks.push_back(TCCheckBlock); 3201 } 3202 3203 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3204 3205 BasicBlock *const SCEVCheckBlock = 3206 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3207 if (!SCEVCheckBlock) 3208 return nullptr; 3209 3210 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3211 (OptForSizeBasedOnProfile && 3212 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3213 "Cannot SCEV check stride or overflow when optimizing for size"); 3214 3215 3216 // Update dominator only if this is first RT check. 3217 if (LoopBypassBlocks.empty()) { 3218 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3219 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3220 } 3221 3222 LoopBypassBlocks.push_back(SCEVCheckBlock); 3223 AddedSafetyChecks = true; 3224 return SCEVCheckBlock; 3225 } 3226 3227 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3228 BasicBlock *Bypass) { 3229 // VPlan-native path does not do any analysis for runtime checks currently. 
3230 if (EnableVPlanNativePath) 3231 return nullptr; 3232 3233 BasicBlock *const MemCheckBlock = 3234 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3235 3236 // Check if we generated code that checks in runtime if arrays overlap. We put 3237 // the checks into a separate block to make the more common case of few 3238 // elements faster. 3239 if (!MemCheckBlock) 3240 return nullptr; 3241 3242 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3243 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3244 "Cannot emit memory checks when optimizing for size, unless forced " 3245 "to vectorize."); 3246 ORE->emit([&]() { 3247 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3248 L->getStartLoc(), L->getHeader()) 3249 << "Code-size may be reduced by not forcing " 3250 "vectorization, or by source-code modifications " 3251 "eliminating the need for runtime checks " 3252 "(e.g., adding 'restrict')."; 3253 }); 3254 } 3255 3256 LoopBypassBlocks.push_back(MemCheckBlock); 3257 3258 AddedSafetyChecks = true; 3259 3260 // We currently don't use LoopVersioning for the actual loop cloning but we 3261 // still use it to add the noalias metadata. 3262 LVer = std::make_unique<LoopVersioning>( 3263 *Legal->getLAI(), 3264 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3265 DT, PSE.getSE()); 3266 LVer->prepareNoAliasMetadata(); 3267 return MemCheckBlock; 3268 } 3269 3270 Value *InnerLoopVectorizer::emitTransformedIndex( 3271 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3272 const InductionDescriptor &ID) const { 3273 3274 SCEVExpander Exp(*SE, DL, "induction"); 3275 auto Step = ID.getStep(); 3276 auto StartValue = ID.getStartValue(); 3277 assert(Index->getType() == Step->getType() && 3278 "Index type does not match StepValue type"); 3279 3280 // Note: the IR at this point is broken. We cannot use SE to create any new 3281 // SCEV and then expand it, hoping that SCEV's simplification will give us 3282 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3283 // lead to various SCEV crashes. So all we can do is to use builder and rely 3284 // on InstCombine for future simplifications. Here we handle some trivial 3285 // cases only. 3286 auto CreateAdd = [&B](Value *X, Value *Y) { 3287 assert(X->getType() == Y->getType() && "Types don't match!"); 3288 if (auto *CX = dyn_cast<ConstantInt>(X)) 3289 if (CX->isZero()) 3290 return Y; 3291 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3292 if (CY->isZero()) 3293 return X; 3294 return B.CreateAdd(X, Y); 3295 }; 3296 3297 auto CreateMul = [&B](Value *X, Value *Y) { 3298 assert(X->getType() == Y->getType() && "Types don't match!"); 3299 if (auto *CX = dyn_cast<ConstantInt>(X)) 3300 if (CX->isOne()) 3301 return Y; 3302 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3303 if (CY->isOne()) 3304 return X; 3305 return B.CreateMul(X, Y); 3306 }; 3307 3308 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3309 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3310 // the DomTree is not kept up-to-date for additional blocks generated in the 3311 // vector loop. By using the header as insertion point, we guarantee that the 3312 // expanded instructions dominate all their uses. 
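  // Conceptually, the transformed index computed below is:
  //   integer induction: StartValue + Index * Step
  //   pointer induction: getelementptr StartValue, Index * Step
  //   FP induction:      StartValue fadd/fsub Index * Step
  // with the trivial Index * 1 and X + 0 cases folded by the CreateAdd and
  // CreateMul helpers above.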
3313 auto GetInsertPoint = [this, &B]() { 3314 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3315 if (InsertBB != LoopVectorBody && 3316 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3317 return LoopVectorBody->getTerminator(); 3318 return &*B.GetInsertPoint(); 3319 }; 3320 3321 switch (ID.getKind()) { 3322 case InductionDescriptor::IK_IntInduction: { 3323 assert(Index->getType() == StartValue->getType() && 3324 "Index type does not match StartValue type"); 3325 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3326 return B.CreateSub(StartValue, Index); 3327 auto *Offset = CreateMul( 3328 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3329 return CreateAdd(StartValue, Offset); 3330 } 3331 case InductionDescriptor::IK_PtrInduction: { 3332 assert(isa<SCEVConstant>(Step) && 3333 "Expected constant step for pointer induction"); 3334 return B.CreateGEP( 3335 StartValue->getType()->getPointerElementType(), StartValue, 3336 CreateMul(Index, 3337 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3338 } 3339 case InductionDescriptor::IK_FpInduction: { 3340 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3341 auto InductionBinOp = ID.getInductionBinOp(); 3342 assert(InductionBinOp && 3343 (InductionBinOp->getOpcode() == Instruction::FAdd || 3344 InductionBinOp->getOpcode() == Instruction::FSub) && 3345 "Original bin op should be defined for FP induction"); 3346 3347 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3348 Value *MulExp = B.CreateFMul(StepValue, Index); 3349 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3350 "induction"); 3351 } 3352 case InductionDescriptor::IK_NoInduction: 3353 return nullptr; 3354 } 3355 llvm_unreachable("invalid enum"); 3356 } 3357 3358 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3359 LoopScalarBody = OrigLoop->getHeader(); 3360 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3361 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3362 assert(LoopExitBlock && "Must have an exit block"); 3363 assert(LoopVectorPreHeader && "Invalid loop structure"); 3364 3365 LoopMiddleBlock = 3366 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3367 LI, nullptr, Twine(Prefix) + "middle.block"); 3368 LoopScalarPreHeader = 3369 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3370 nullptr, Twine(Prefix) + "scalar.ph"); 3371 3372 // Set up branch from middle block to the exit and scalar preheader blocks. 3373 // completeLoopSkeleton will update the condition to use an iteration check, 3374 // if required to decide whether to execute the remainder. 3375 BranchInst *BrInst = 3376 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3377 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3378 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3379 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3380 3381 // We intentionally don't let SplitBlock to update LoopInfo since 3382 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3383 // LoopVectorBody is explicitly added to the correct place few lines later. 3384 LoopVectorBody = 3385 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3386 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3387 3388 // Update dominator for loop exit. 
3389 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3390 3391 // Create and register the new vector loop. 3392 Loop *Lp = LI->AllocateLoop(); 3393 Loop *ParentLoop = OrigLoop->getParentLoop(); 3394 3395 // Insert the new loop into the loop nest and register the new basic blocks 3396 // before calling any utilities such as SCEV that require valid LoopInfo. 3397 if (ParentLoop) { 3398 ParentLoop->addChildLoop(Lp); 3399 } else { 3400 LI->addTopLevelLoop(Lp); 3401 } 3402 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3403 return Lp; 3404 } 3405 3406 void InnerLoopVectorizer::createInductionResumeValues( 3407 Loop *L, Value *VectorTripCount, 3408 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3409 assert(VectorTripCount && L && "Expected valid arguments"); 3410 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3411 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3412 "Inconsistent information about additional bypass."); 3413 // We are going to resume the execution of the scalar loop. 3414 // Go over all of the induction variables that we found and fix the 3415 // PHIs that are left in the scalar version of the loop. 3416 // The starting values of PHI nodes depend on the counter of the last 3417 // iteration in the vectorized loop. 3418 // If we come from a bypass edge then we need to start from the original 3419 // start value. 3420 for (auto &InductionEntry : Legal->getInductionVars()) { 3421 PHINode *OrigPhi = InductionEntry.first; 3422 InductionDescriptor II = InductionEntry.second; 3423 3424 // Create phi nodes to merge from the backedge-taken check block. 3425 PHINode *BCResumeVal = 3426 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3427 LoopScalarPreHeader->getTerminator()); 3428 // Copy original phi DL over to the new one. 3429 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3430 Value *&EndValue = IVEndValues[OrigPhi]; 3431 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3432 if (OrigPhi == OldInduction) { 3433 // We know what the end value is. 3434 EndValue = VectorTripCount; 3435 } else { 3436 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3437 3438 // Fast-math-flags propagate from the original induction instruction. 3439 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3440 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3441 3442 Type *StepType = II.getStep()->getType(); 3443 Instruction::CastOps CastOp = 3444 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3445 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3446 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3447 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3448 EndValue->setName("ind.end"); 3449 3450 // Compute the end value for the additional bypass (if applicable). 3451 if (AdditionalBypass.first) { 3452 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3453 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3454 StepType, true); 3455 CRD = 3456 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3457 EndValueFromAdditionalBypass = 3458 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3459 EndValueFromAdditionalBypass->setName("ind.end"); 3460 } 3461 } 3462 // The new PHI merges the original incoming value, in case of a bypass, 3463 // or the value at the end of the vectorized loop. 
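    // Roughly, the resume phi created here looks like
    //   %bc.resume.val = phi [ %ind.end, %middle.block ],
    //                        [ %start.value, %bypass.block ], ...
    // so the scalar remainder loop resumes exactly where the vectorized
    // iterations stopped, or at the original start value on a bypass path.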
3464 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3465 3466 // Fix the scalar body counter (PHI node). 3467 // The old induction's phi node in the scalar body needs the truncated 3468 // value. 3469 for (BasicBlock *BB : LoopBypassBlocks) 3470 BCResumeVal->addIncoming(II.getStartValue(), BB); 3471 3472 if (AdditionalBypass.first) 3473 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3474 EndValueFromAdditionalBypass); 3475 3476 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3477 } 3478 } 3479 3480 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3481 MDNode *OrigLoopID) { 3482 assert(L && "Expected valid loop."); 3483 3484 // The trip counts should be cached by now. 3485 Value *Count = getOrCreateTripCount(L); 3486 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3487 3488 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3489 3490 // Add a check in the middle block to see if we have completed 3491 // all of the iterations in the first vector loop. 3492 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3493 // If tail is to be folded, we know we don't need to run the remainder. 3494 if (!Cost->foldTailByMasking()) { 3495 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3496 Count, VectorTripCount, "cmp.n", 3497 LoopMiddleBlock->getTerminator()); 3498 3499 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3500 // of the corresponding compare because they may have ended up with 3501 // different line numbers and we want to avoid awkward line stepping while 3502 // debugging. Eg. if the compare has got a line number inside the loop. 3503 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3504 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3505 } 3506 3507 // Get ready to start creating new instructions into the vectorized body. 3508 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3509 "Inconsistent vector loop preheader"); 3510 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3511 3512 Optional<MDNode *> VectorizedLoopID = 3513 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3514 LLVMLoopVectorizeFollowupVectorized}); 3515 if (VectorizedLoopID.hasValue()) { 3516 L->setLoopID(VectorizedLoopID.getValue()); 3517 3518 // Do not setAlreadyVectorized if loop attributes have been defined 3519 // explicitly. 3520 return LoopVectorPreHeader; 3521 } 3522 3523 // Keep all loop hints from the original loop on the vector loop (we'll 3524 // replace the vectorizer-specific hints below). 3525 if (MDNode *LID = OrigLoop->getLoopID()) 3526 L->setLoopID(LID); 3527 3528 LoopVectorizeHints Hints(L, true, *ORE); 3529 Hints.setAlreadyVectorized(); 3530 3531 #ifdef EXPENSIVE_CHECKS 3532 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3533 LI->verify(*DT); 3534 #endif 3535 3536 return LoopVectorPreHeader; 3537 } 3538 3539 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3540 /* 3541 In this function we generate a new loop. The new loop will contain 3542 the vectorized instructions while the old loop will continue to run the 3543 scalar remainder. 3544 3545 [ ] <-- loop iteration number check. 3546 / | 3547 / v 3548 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3549 | / | 3550 | / v 3551 || [ ] <-- vector pre header. 3552 |/ | 3553 | v 3554 | [ ] \ 3555 | [ ]_| <-- vector loop. 3556 | | 3557 | v 3558 | -[ ] <--- middle-block. 
3559 | / | 3560 | / v 3561 -|- >[ ] <--- new preheader. 3562 | | 3563 | v 3564 | [ ] \ 3565 | [ ]_| <-- old scalar loop to handle remainder. 3566 \ | 3567 \ v 3568 >[ ] <-- exit block. 3569 ... 3570 */ 3571 3572 // Get the metadata of the original loop before it gets modified. 3573 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3574 3575 // Create an empty vector loop, and prepare basic blocks for the runtime 3576 // checks. 3577 Loop *Lp = createVectorLoopSkeleton(""); 3578 3579 // Now, compare the new count to zero. If it is zero skip the vector loop and 3580 // jump to the scalar loop. This check also covers the case where the 3581 // backedge-taken count is uint##_max: adding one to it will overflow leading 3582 // to an incorrect trip count of zero. In this (rare) case we will also jump 3583 // to the scalar loop. 3584 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3585 3586 // Generate the code to check any assumptions that we've made for SCEV 3587 // expressions. 3588 emitSCEVChecks(Lp, LoopScalarPreHeader); 3589 3590 // Generate the code that checks in runtime if arrays overlap. We put the 3591 // checks into a separate block to make the more common case of few elements 3592 // faster. 3593 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3594 3595 // Some loops have a single integer induction variable, while other loops 3596 // don't. One example is c++ iterators that often have multiple pointer 3597 // induction variables. In the code below we also support a case where we 3598 // don't have a single induction variable. 3599 // 3600 // We try to obtain an induction variable from the original loop as hard 3601 // as possible. However if we don't find one that: 3602 // - is an integer 3603 // - counts from zero, stepping by one 3604 // - is the size of the widest induction variable type 3605 // then we create a new one. 3606 OldInduction = Legal->getPrimaryInduction(); 3607 Type *IdxTy = Legal->getWidestInductionType(); 3608 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3609 // The loop step is equal to the vectorization factor (num of SIMD elements) 3610 // times the unroll factor (num of SIMD instructions). 3611 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3612 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3613 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3614 Induction = 3615 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3616 getDebugLocFromInstOrOperands(OldInduction)); 3617 3618 // Emit phis for the new starting index of the scalar loop. 3619 createInductionResumeValues(Lp, CountRoundDown); 3620 3621 return completeLoopSkeleton(Lp, OrigLoopID); 3622 } 3623 3624 // Fix up external users of the induction variable. At this point, we are 3625 // in LCSSA form, with all external PHIs that use the IV having one input value, 3626 // coming from the remainder loop. We need those PHIs to also have a correct 3627 // value for the IV when arriving directly from the middle block. 3628 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3629 const InductionDescriptor &II, 3630 Value *CountRoundDown, Value *EndValue, 3631 BasicBlock *MiddleBlock) { 3632 // There are two kinds of external IV usages - those that use the value 3633 // computed in the last iteration (the PHI) and those that use the penultimate 3634 // value (the value that feeds into the phi from the loop latch). 3635 // We allow both, but they, obviously, have different values. 
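  // For example, given
  //   for (i = 0; i < n; ++i) { last = i; }
  //   ... = i;    // uses the post-increment value; fixed up to EndValue
  //   ... = last; // uses the phi's value; fixed up to Start + Step * (CRD - 1)
  // both uses typically reach this point as LCSSA phis in the exit block, and
  // the two phis need different incoming values for the middle block.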
3636 3637 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3638 3639 DenseMap<Value *, Value *> MissingVals; 3640 3641 // An external user of the last iteration's value should see the value that 3642 // the remainder loop uses to initialize its own IV. 3643 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3644 for (User *U : PostInc->users()) { 3645 Instruction *UI = cast<Instruction>(U); 3646 if (!OrigLoop->contains(UI)) { 3647 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3648 MissingVals[UI] = EndValue; 3649 } 3650 } 3651 3652 // An external user of the penultimate value need to see EndValue - Step. 3653 // The simplest way to get this is to recompute it from the constituent SCEVs, 3654 // that is Start + (Step * (CRD - 1)). 3655 for (User *U : OrigPhi->users()) { 3656 auto *UI = cast<Instruction>(U); 3657 if (!OrigLoop->contains(UI)) { 3658 const DataLayout &DL = 3659 OrigLoop->getHeader()->getModule()->getDataLayout(); 3660 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3661 3662 IRBuilder<> B(MiddleBlock->getTerminator()); 3663 3664 // Fast-math-flags propagate from the original induction instruction. 3665 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3666 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3667 3668 Value *CountMinusOne = B.CreateSub( 3669 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3670 Value *CMO = 3671 !II.getStep()->getType()->isIntegerTy() 3672 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3673 II.getStep()->getType()) 3674 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3675 CMO->setName("cast.cmo"); 3676 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3677 Escape->setName("ind.escape"); 3678 MissingVals[UI] = Escape; 3679 } 3680 } 3681 3682 for (auto &I : MissingVals) { 3683 PHINode *PHI = cast<PHINode>(I.first); 3684 // One corner case we have to handle is two IVs "chasing" each-other, 3685 // that is %IV2 = phi [...], [ %IV1, %latch ] 3686 // In this case, if IV1 has an external use, we need to avoid adding both 3687 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3688 // don't already have an incoming value for the middle block. 3689 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3690 PHI->addIncoming(I.second, MiddleBlock); 3691 } 3692 } 3693 3694 namespace { 3695 3696 struct CSEDenseMapInfo { 3697 static bool canHandle(const Instruction *I) { 3698 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3699 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3700 } 3701 3702 static inline Instruction *getEmptyKey() { 3703 return DenseMapInfo<Instruction *>::getEmptyKey(); 3704 } 3705 3706 static inline Instruction *getTombstoneKey() { 3707 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3708 } 3709 3710 static unsigned getHashValue(const Instruction *I) { 3711 assert(canHandle(I) && "Unknown instruction!"); 3712 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3713 I->value_op_end())); 3714 } 3715 3716 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3717 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3718 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3719 return LHS == RHS; 3720 return LHS->isIdenticalTo(RHS); 3721 } 3722 }; 3723 3724 } // end anonymous namespace 3725 3726 ///Perform cse of induction variable instructions. 
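/// E.g. unrolling may leave several identical 'getelementptr' or
/// 'extractelement' instructions in the vector body; all but the first
/// occurrence are replaced and erased below.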
3727 static void cse(BasicBlock *BB) { 3728 // Perform simple cse. 3729 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3730 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3731 Instruction *In = &*I++; 3732 3733 if (!CSEDenseMapInfo::canHandle(In)) 3734 continue; 3735 3736 // Check if we can replace this instruction with any of the 3737 // visited instructions. 3738 if (Instruction *V = CSEMap.lookup(In)) { 3739 In->replaceAllUsesWith(V); 3740 In->eraseFromParent(); 3741 continue; 3742 } 3743 3744 CSEMap[In] = In; 3745 } 3746 } 3747 3748 InstructionCost 3749 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3750 bool &NeedToScalarize) { 3751 Function *F = CI->getCalledFunction(); 3752 Type *ScalarRetTy = CI->getType(); 3753 SmallVector<Type *, 4> Tys, ScalarTys; 3754 for (auto &ArgOp : CI->arg_operands()) 3755 ScalarTys.push_back(ArgOp->getType()); 3756 3757 // Estimate cost of scalarized vector call. The source operands are assumed 3758 // to be vectors, so we need to extract individual elements from there, 3759 // execute VF scalar calls, and then gather the result into the vector return 3760 // value. 3761 InstructionCost ScalarCallCost = 3762 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3763 if (VF.isScalar()) 3764 return ScalarCallCost; 3765 3766 // Compute corresponding vector type for return value and arguments. 3767 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3768 for (Type *ScalarTy : ScalarTys) 3769 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3770 3771 // Compute costs of unpacking argument values for the scalar calls and 3772 // packing the return values to a vector. 3773 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3774 3775 InstructionCost Cost = 3776 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3777 3778 // If we can't emit a vector call for this function, then the currently found 3779 // cost is the cost we need to return. 3780 NeedToScalarize = true; 3781 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3782 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3783 3784 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3785 return Cost; 3786 3787 // If the corresponding vector cost is cheaper, return its cost. 
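  // That is, the choice below is between
  //   scalarized cost: ScalarCallCost * VF + scalarization overhead
  //   vector cost:     the cost of calling a matching vector library function
  // and NeedToScalarize is cleared only if the vector call is cheaper.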
3788 InstructionCost VectorCallCost = 3789 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3790 if (VectorCallCost < Cost) { 3791 NeedToScalarize = false; 3792 Cost = VectorCallCost; 3793 } 3794 return Cost; 3795 } 3796 3797 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3798 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3799 return Elt; 3800 return VectorType::get(Elt, VF); 3801 } 3802 3803 InstructionCost 3804 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3805 ElementCount VF) { 3806 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3807 assert(ID && "Expected intrinsic call!"); 3808 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3809 FastMathFlags FMF; 3810 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3811 FMF = FPMO->getFastMathFlags(); 3812 3813 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); 3814 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3815 SmallVector<Type *> ParamTys; 3816 std::transform(FTy->param_begin(), FTy->param_end(), 3817 std::back_inserter(ParamTys), 3818 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3819 3820 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3821 dyn_cast<IntrinsicInst>(CI)); 3822 return TTI.getIntrinsicInstrCost(CostAttrs, 3823 TargetTransformInfo::TCK_RecipThroughput); 3824 } 3825 3826 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3827 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3828 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3829 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3830 } 3831 3832 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3833 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3834 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3835 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3836 } 3837 3838 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3839 // For every instruction `I` in MinBWs, truncate the operands, create a 3840 // truncated version of `I` and reextend its result. InstCombine runs 3841 // later and will remove any ext/trunc pairs. 3842 SmallPtrSet<Value *, 4> Erased; 3843 for (const auto &KV : Cost->getMinimalBitwidths()) { 3844 // If the value wasn't vectorized, we must maintain the original scalar 3845 // type. The absence of the value from State indicates that it 3846 // wasn't vectorized. 3847 VPValue *Def = State.Plan->getVPValue(KV.first); 3848 if (!State.hasAnyVectorValue(Def)) 3849 continue; 3850 for (unsigned Part = 0; Part < UF; ++Part) { 3851 Value *I = State.get(Def, Part); 3852 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3853 continue; 3854 Type *OriginalTy = I->getType(); 3855 Type *ScalarTruncatedTy = 3856 IntegerType::get(OriginalTy->getContext(), KV.second); 3857 auto *TruncatedTy = FixedVectorType::get( 3858 ScalarTruncatedTy, 3859 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3860 if (TruncatedTy == OriginalTy) 3861 continue; 3862 3863 IRBuilder<> B(cast<Instruction>(I)); 3864 auto ShrinkOperand = [&](Value *V) -> Value * { 3865 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3866 if (ZI->getSrcTy() == TruncatedTy) 3867 return ZI->getOperand(0); 3868 return B.CreateZExtOrTrunc(V, TruncatedTy); 3869 }; 3870 3871 // The actual instruction modification depends on the instruction type, 3872 // unfortunately. 
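      // For example, if MinBWs records that an i32 operation only needs 8 bits,
      // a widened
      //   %a = add <4 x i32> %x, %y
      // becomes a trunc of each operand to <4 x i8>, an i8-wide add, and a zext
      // of the result back to <4 x i32>; InstCombine later removes redundant
      // ext/trunc pairs.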
3873 Value *NewI = nullptr; 3874 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3875 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3876 ShrinkOperand(BO->getOperand(1))); 3877 3878 // Any wrapping introduced by shrinking this operation shouldn't be 3879 // considered undefined behavior. So, we can't unconditionally copy 3880 // arithmetic wrapping flags to NewI. 3881 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3882 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3883 NewI = 3884 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3885 ShrinkOperand(CI->getOperand(1))); 3886 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3887 NewI = B.CreateSelect(SI->getCondition(), 3888 ShrinkOperand(SI->getTrueValue()), 3889 ShrinkOperand(SI->getFalseValue())); 3890 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3891 switch (CI->getOpcode()) { 3892 default: 3893 llvm_unreachable("Unhandled cast!"); 3894 case Instruction::Trunc: 3895 NewI = ShrinkOperand(CI->getOperand(0)); 3896 break; 3897 case Instruction::SExt: 3898 NewI = B.CreateSExtOrTrunc( 3899 CI->getOperand(0), 3900 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3901 break; 3902 case Instruction::ZExt: 3903 NewI = B.CreateZExtOrTrunc( 3904 CI->getOperand(0), 3905 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3906 break; 3907 } 3908 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3909 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3910 ->getNumElements(); 3911 auto *O0 = B.CreateZExtOrTrunc( 3912 SI->getOperand(0), 3913 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3914 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3915 ->getNumElements(); 3916 auto *O1 = B.CreateZExtOrTrunc( 3917 SI->getOperand(1), 3918 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3919 3920 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3921 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3922 // Don't do anything with the operands, just extend the result. 3923 continue; 3924 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3925 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3926 ->getNumElements(); 3927 auto *O0 = B.CreateZExtOrTrunc( 3928 IE->getOperand(0), 3929 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3930 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3931 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3932 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3933 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3934 ->getNumElements(); 3935 auto *O0 = B.CreateZExtOrTrunc( 3936 EE->getOperand(0), 3937 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3938 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3939 } else { 3940 // If we don't know what to do, be conservative and don't do anything. 3941 continue; 3942 } 3943 3944 // Lastly, extend the result. 3945 NewI->takeName(cast<Instruction>(I)); 3946 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3947 I->replaceAllUsesWith(Res); 3948 cast<Instruction>(I)->eraseFromParent(); 3949 Erased.insert(I); 3950 State.reset(Def, Res, Part); 3951 } 3952 } 3953 3954 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3955 for (const auto &KV : Cost->getMinimalBitwidths()) { 3956 // If the value wasn't vectorized, we must maintain the original scalar 3957 // type. The absence of the value from State indicates that it 3958 // wasn't vectorized. 
3959 VPValue *Def = State.Plan->getVPValue(KV.first); 3960 if (!State.hasAnyVectorValue(Def)) 3961 continue; 3962 for (unsigned Part = 0; Part < UF; ++Part) { 3963 Value *I = State.get(Def, Part); 3964 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3965 if (Inst && Inst->use_empty()) { 3966 Value *NewI = Inst->getOperand(0); 3967 Inst->eraseFromParent(); 3968 State.reset(Def, NewI, Part); 3969 } 3970 } 3971 } 3972 } 3973 3974 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 3975 // Insert truncates and extends for any truncated instructions as hints to 3976 // InstCombine. 3977 if (VF.isVector()) 3978 truncateToMinimalBitwidths(State); 3979 3980 // Fix widened non-induction PHIs by setting up the PHI operands. 3981 if (OrigPHIsToFix.size()) { 3982 assert(EnableVPlanNativePath && 3983 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3984 fixNonInductionPHIs(State); 3985 } 3986 3987 // At this point every instruction in the original loop is widened to a 3988 // vector form. Now we need to fix the recurrences in the loop. These PHI 3989 // nodes are currently empty because we did not want to introduce cycles. 3990 // This is the second stage of vectorizing recurrences. 3991 fixCrossIterationPHIs(State); 3992 3993 // Forget the original loop. 3994 PSE.getSE()->forgetLoop(OrigLoop); 3995 3996 // Fix-up external users of the induction variables. 3997 for (auto &Entry : Legal->getInductionVars()) 3998 fixupIVUsers(Entry.first, Entry.second, 3999 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4000 IVEndValues[Entry.first], LoopMiddleBlock); 4001 4002 fixLCSSAPHIs(State); 4003 for (Instruction *PI : PredicatedInstructions) 4004 sinkScalarOperands(&*PI); 4005 4006 // Remove redundant induction instructions. 4007 cse(LoopVectorBody); 4008 4009 // Set/update profile weights for the vector and remainder loops as original 4010 // loop iterations are now distributed among them. Note that the original loop, 4011 // represented by LoopScalarBody, becomes the remainder loop after vectorization. 4012 // 4013 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may 4014 // end up with a slightly inaccurate result, but that should be OK since the 4015 // profile is not inherently precise anyway. Note also that any bypass of the 4016 // vector code caused by legality checks is ignored, optimistically assigning 4017 // all the weight to the vector loop. 4018 // 4019 // For scalable vectorization we can't know at compile time how many iterations 4020 // of the loop are handled in one vector iteration, so instead assume a pessimistic 4021 // vscale of '1'. 4022 setProfileInfoAfterUnrolling( 4023 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 4024 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 4025 } 4026 4027 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 4028 // In order to support recurrences we need to be able to vectorize Phi nodes. 4029 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4030 // stage #2: We now need to fix the recurrences by adding incoming edges to 4031 // the currently empty PHI nodes. At this point every instruction in the 4032 // original loop is widened to a vector form so we can use them to construct 4033 // the incoming edges. 4034 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 4035 // Handle first-order recurrences and reductions that need to be fixed.
4036 if (Legal->isFirstOrderRecurrence(&Phi)) 4037 fixFirstOrderRecurrence(&Phi, State); 4038 else if (Legal->isReductionVariable(&Phi)) 4039 fixReduction(&Phi, State); 4040 } 4041 } 4042 4043 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi, 4044 VPTransformState &State) { 4045 // This is the second phase of vectorizing first-order recurrences. An 4046 // overview of the transformation is described below. Suppose we have the 4047 // following loop. 4048 // 4049 // for (int i = 0; i < n; ++i) 4050 // b[i] = a[i] - a[i - 1]; 4051 // 4052 // There is a first-order recurrence on "a". For this loop, the shorthand 4053 // scalar IR looks like: 4054 // 4055 // scalar.ph: 4056 // s_init = a[-1] 4057 // br scalar.body 4058 // 4059 // scalar.body: 4060 // i = phi [0, scalar.ph], [i+1, scalar.body] 4061 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4062 // s2 = a[i] 4063 // b[i] = s2 - s1 4064 // br cond, scalar.body, ... 4065 // 4066 // In this example, s1 is a recurrence because it's value depends on the 4067 // previous iteration. In the first phase of vectorization, we created a 4068 // temporary value for s1. We now complete the vectorization and produce the 4069 // shorthand vector IR shown below (for VF = 4, UF = 1). 4070 // 4071 // vector.ph: 4072 // v_init = vector(..., ..., ..., a[-1]) 4073 // br vector.body 4074 // 4075 // vector.body 4076 // i = phi [0, vector.ph], [i+4, vector.body] 4077 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4078 // v2 = a[i, i+1, i+2, i+3]; 4079 // v3 = vector(v1(3), v2(0, 1, 2)) 4080 // b[i, i+1, i+2, i+3] = v2 - v3 4081 // br cond, vector.body, middle.block 4082 // 4083 // middle.block: 4084 // x = v2(3) 4085 // br scalar.ph 4086 // 4087 // scalar.ph: 4088 // s_init = phi [x, middle.block], [a[-1], otherwise] 4089 // br scalar.body 4090 // 4091 // After execution completes the vector loop, we extract the next value of 4092 // the recurrence (x) to use as the initial value in the scalar loop. 4093 4094 // Get the original loop preheader and single loop latch. 4095 auto *Preheader = OrigLoop->getLoopPreheader(); 4096 auto *Latch = OrigLoop->getLoopLatch(); 4097 4098 // Get the initial and previous values of the scalar recurrence. 4099 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4100 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4101 4102 // Create a vector from the initial value. 4103 auto *VectorInit = ScalarInit; 4104 if (VF.isVector()) { 4105 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4106 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4107 VectorInit = Builder.CreateInsertElement( 4108 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 4109 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 4110 } 4111 4112 VPValue *PhiDef = State.Plan->getVPValue(Phi); 4113 VPValue *PreviousDef = State.Plan->getVPValue(Previous); 4114 // We constructed a temporary phi node in the first phase of vectorization. 4115 // This phi node will eventually be deleted. 4116 Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0))); 4117 4118 // Create a phi node for the new recurrence. The current value will either be 4119 // the initial value inserted into a vector or loop-varying vector value. 4120 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4121 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4122 4123 // Get the vectorized previous value of the last part UF - 1. 
It appears last 4124 // among all unrolled iterations, due to the order of their construction. 4125 Value *PreviousLastPart = State.get(PreviousDef, UF - 1); 4126 4127 // Find and set the insertion point after the previous value if it is an 4128 // instruction. 4129 BasicBlock::iterator InsertPt; 4130 // Note that the previous value may have been constant-folded so it is not 4131 // guaranteed to be an instruction in the vector loop. 4132 // FIXME: Loop invariant values do not form recurrences. We should deal with 4133 // them earlier. 4134 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4135 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4136 else { 4137 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4138 if (isa<PHINode>(PreviousLastPart)) 4139 // If the previous value is a phi node, we should insert after all the phi 4140 // nodes in the block containing the PHI to avoid breaking basic block 4141 // verification. Note that the basic block may be different to 4142 // LoopVectorBody, in case we predicate the loop. 4143 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4144 else 4145 InsertPt = ++PreviousInst->getIterator(); 4146 } 4147 Builder.SetInsertPoint(&*InsertPt); 4148 4149 // We will construct a vector for the recurrence by combining the values for 4150 // the current and previous iterations. This is the required shuffle mask. 4151 assert(!VF.isScalable()); 4152 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 4153 ShuffleMask[0] = VF.getKnownMinValue() - 1; 4154 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 4155 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 4156 4157 // The vector from which to take the initial value for the current iteration 4158 // (actual or unrolled). Initially, this is the vector phi node. 4159 Value *Incoming = VecPhi; 4160 4161 // Shuffle the current and previous vector and update the vector parts. 4162 for (unsigned Part = 0; Part < UF; ++Part) { 4163 Value *PreviousPart = State.get(PreviousDef, Part); 4164 Value *PhiPart = State.get(PhiDef, Part); 4165 auto *Shuffle = 4166 VF.isVector() 4167 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 4168 : Incoming; 4169 PhiPart->replaceAllUsesWith(Shuffle); 4170 cast<Instruction>(PhiPart)->eraseFromParent(); 4171 State.reset(PhiDef, Shuffle, Part); 4172 Incoming = PreviousPart; 4173 } 4174 4175 // Fix the latch value of the new recurrence in the vector loop. 4176 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4177 4178 // Extract the last vector element in the middle block. This will be the 4179 // initial value for the recurrence when jumping to the scalar loop. 4180 auto *ExtractForScalar = Incoming; 4181 if (VF.isVector()) { 4182 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4183 ExtractForScalar = Builder.CreateExtractElement( 4184 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 4185 "vector.recur.extract"); 4186 } 4187 // Extract the second last element in the middle block if the 4188 // Phi is used outside the loop. We need to extract the phi itself 4189 // and not the last element (the phi update in the current iteration). This 4190 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4191 // when the scalar loop is not run at all. 
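// For example, with VF = 4 and UF = 1, lane 3 of `Incoming` holds the
// recurrence update from the last scalar iteration covered by the vector
// loop, while lane 2 holds the value the phi itself had in that iteration;
// the latter is what a user of the phi outside the loop expects.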
4192 Value *ExtractForPhiUsedOutsideLoop = nullptr;
4193 if (VF.isVector())
4194 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4195 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4196 "vector.recur.extract.for.phi");
4197 // When the loop is unrolled without vectorizing, initialize
4198 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
4199 // value of `Incoming`. This is analogous to the vectorized case above:
4200 // extracting the second last element when VF > 1.
4201 else if (UF > 1)
4202 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4203
4204 // Fix the initial value of the original recurrence in the scalar loop.
4205 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4206 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4207 for (auto *BB : predecessors(LoopScalarPreHeader)) {
4208 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4209 Start->addIncoming(Incoming, BB);
4210 }
4211
4212 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4213 Phi->setName("scalar.recur");
4214
4215 // Finally, fix users of the recurrence outside the loop. The users will need
4216 // either the last value of the scalar recurrence or the last value of the
4217 // vector recurrence we extracted in the middle block. Since the loop is in
4218 // LCSSA form, we just need to find all the phi nodes for the original scalar
4219 // recurrence in the exit block, and then add an edge for the middle block.
4220 // Note that LCSSA does not imply single entry when the original scalar loop
4221 // had multiple exiting edges (as we always run the last iteration in the
4222 // scalar epilogue); in that case, the exiting path through middle will be
4223 // dynamically dead and the value picked for the phi doesn't matter.
4224 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4225 if (any_of(LCSSAPhi.incoming_values(),
4226 [Phi](Value *V) { return V == Phi; }))
4227 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4228 }
4229
4230 void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) {
4231 // Get its reduction variable descriptor.
4232 assert(Legal->isReductionVariable(Phi) &&
4233 "Unable to find the reduction variable");
4234 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4235
4236 RecurKind RK = RdxDesc.getRecurrenceKind();
4237 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4238 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4239 setDebugLocFromInst(Builder, ReductionStartValue);
4240 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4241
4242 VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
4243 // This is the vector-clone of the value that leaves the loop.
4244 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4245
4246 // Wrap flags are in general invalid after vectorization, clear them.
4247 clearReductionWrapFlags(RdxDesc, State);
4248
4249 // Fix the vector-loop phi.
4250
4251 // Reductions do not have to start at zero. They can start with
4252 // any loop invariant values.
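// For example (VF = 4, UF = 2), an integer add reduction starting at 42 has
// its part-0 phi seeded with <42, 0, 0, 0> and its part-1 phi with the
// identity <0, 0, 0, 0>; the parts are combined and reduced to a single
// scalar after the loop.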
4253 BasicBlock *Latch = OrigLoop->getLoopLatch(); 4254 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 4255 4256 for (unsigned Part = 0; Part < UF; ++Part) { 4257 Value *VecRdxPhi = State.get(State.Plan->getVPValue(Phi), Part); 4258 Value *Val = State.get(State.Plan->getVPValue(LoopVal), Part); 4259 cast<PHINode>(VecRdxPhi) 4260 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4261 } 4262 4263 // Before each round, move the insertion point right between 4264 // the PHIs and the values we are going to write. 4265 // This allows us to write both PHINodes and the extractelement 4266 // instructions. 4267 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4268 4269 setDebugLocFromInst(Builder, LoopExitInst); 4270 4271 // If tail is folded by masking, the vector value to leave the loop should be 4272 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4273 // instead of the former. For an inloop reduction the reduction will already 4274 // be predicated, and does not need to be handled here. 4275 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4276 for (unsigned Part = 0; Part < UF; ++Part) { 4277 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4278 Value *Sel = nullptr; 4279 for (User *U : VecLoopExitInst->users()) { 4280 if (isa<SelectInst>(U)) { 4281 assert(!Sel && "Reduction exit feeding two selects"); 4282 Sel = U; 4283 } else 4284 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4285 } 4286 assert(Sel && "Reduction exit feeds no select"); 4287 State.reset(LoopExitInstDef, Sel, Part); 4288 4289 // If the target can create a predicated operator for the reduction at no 4290 // extra cost in the loop (for example a predicated vadd), it can be 4291 // cheaper for the select to remain in the loop than be sunk out of it, 4292 // and so use the select value for the phi instead of the old 4293 // LoopExitValue. 4294 if (PreferPredicatedReductionSelect || 4295 TTI->preferPredicatedReductionSelect( 4296 RdxDesc.getOpcode(), Phi->getType(), 4297 TargetTransformInfo::ReductionFlags())) { 4298 auto *VecRdxPhi = 4299 cast<PHINode>(State.get(State.Plan->getVPValue(Phi), Part)); 4300 VecRdxPhi->setIncomingValueForBlock( 4301 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4302 } 4303 } 4304 } 4305 4306 // If the vector reduction can be performed in a smaller type, we truncate 4307 // then extend the loop exit value to enable InstCombine to evaluate the 4308 // entire expression in the smaller type. 4309 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4310 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4311 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4312 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4313 Builder.SetInsertPoint( 4314 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4315 VectorParts RdxParts(UF); 4316 for (unsigned Part = 0; Part < UF; ++Part) { 4317 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4318 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4319 Value *Extnd = RdxDesc.isSigned() ? 
Builder.CreateSExt(Trunc, VecTy) 4320 : Builder.CreateZExt(Trunc, VecTy); 4321 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4322 UI != RdxParts[Part]->user_end();) 4323 if (*UI != Trunc) { 4324 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4325 RdxParts[Part] = Extnd; 4326 } else { 4327 ++UI; 4328 } 4329 } 4330 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4331 for (unsigned Part = 0; Part < UF; ++Part) { 4332 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4333 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4334 } 4335 } 4336 4337 // Reduce all of the unrolled parts into a single vector. 4338 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4339 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4340 4341 // The middle block terminator has already been assigned a DebugLoc here (the 4342 // OrigLoop's single latch terminator). We want the whole middle block to 4343 // appear to execute on this line because: (a) it is all compiler generated, 4344 // (b) these instructions are always executed after evaluating the latch 4345 // conditional branch, and (c) other passes may add new predecessors which 4346 // terminate on this line. This is the easiest way to ensure we don't 4347 // accidentally cause an extra step back into the loop while debugging. 4348 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4349 { 4350 // Floating-point operations should have some FMF to enable the reduction. 4351 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4352 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4353 for (unsigned Part = 1; Part < UF; ++Part) { 4354 Value *RdxPart = State.get(LoopExitInstDef, Part); 4355 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4356 ReducedPartRdx = Builder.CreateBinOp( 4357 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4358 } else { 4359 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4360 } 4361 } 4362 } 4363 4364 // Create the reduction after the loop. Note that inloop reductions create the 4365 // target reduction in the loop using a Reduction recipe. 4366 if (VF.isVector() && !IsInLoopReductionPhi) { 4367 ReducedPartRdx = 4368 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); 4369 // If the reduction can be performed in a smaller type, we need to extend 4370 // the reduction to the wider type before we branch to the original loop. 4371 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4372 ReducedPartRdx = 4373 RdxDesc.isSigned() 4374 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4375 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4376 } 4377 4378 // Create a phi node that merges control-flow from the backedge-taken check 4379 // block and the middle block. 4380 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4381 LoopScalarPreHeader->getTerminator()); 4382 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4383 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4384 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4385 4386 // Now, we need to fix the users of the reduction variable 4387 // inside and outside of the scalar remainder loop. 4388 4389 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4390 // in the exit blocks. See comment on analogous loop in 4391 // fixFirstOrderRecurrence for a more complete explaination of the logic. 
4392 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4393 if (any_of(LCSSAPhi.incoming_values(), 4394 [LoopExitInst](Value *V) { return V == LoopExitInst; })) 4395 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4396 4397 // Fix the scalar loop reduction variable with the incoming reduction sum 4398 // from the vector body and from the backedge value. 4399 int IncomingEdgeBlockIdx = 4400 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4401 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4402 // Pick the other block. 4403 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4404 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4405 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4406 } 4407 4408 void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 4409 VPTransformState &State) { 4410 RecurKind RK = RdxDesc.getRecurrenceKind(); 4411 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4412 return; 4413 4414 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4415 assert(LoopExitInstr && "null loop exit instruction"); 4416 SmallVector<Instruction *, 8> Worklist; 4417 SmallPtrSet<Instruction *, 8> Visited; 4418 Worklist.push_back(LoopExitInstr); 4419 Visited.insert(LoopExitInstr); 4420 4421 while (!Worklist.empty()) { 4422 Instruction *Cur = Worklist.pop_back_val(); 4423 if (isa<OverflowingBinaryOperator>(Cur)) 4424 for (unsigned Part = 0; Part < UF; ++Part) { 4425 Value *V = State.get(State.Plan->getVPValue(Cur), Part); 4426 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4427 } 4428 4429 for (User *U : Cur->users()) { 4430 Instruction *UI = cast<Instruction>(U); 4431 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4432 Visited.insert(UI).second) 4433 Worklist.push_back(UI); 4434 } 4435 } 4436 } 4437 4438 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4439 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4440 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4441 // Some phis were already hand updated by the reduction and recurrence 4442 // code above, leave them alone. 4443 continue; 4444 4445 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4446 // Non-instruction incoming values will have only one value. 4447 4448 VPLane Lane = VPLane::getFirstLane(); 4449 if (isa<Instruction>(IncomingValue) && 4450 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4451 VF)) 4452 Lane = VPLane::getLastLaneForVF(VF); 4453 4454 // Can be a loop invariant incoming value or the last scalar value to be 4455 // extracted from the vectorized loop. 4456 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4457 Value *lastIncomingValue = 4458 OrigLoop->isLoopInvariant(IncomingValue) 4459 ? IncomingValue 4460 : State.get(State.Plan->getVPValue(IncomingValue), 4461 VPIteration(UF - 1, Lane)); 4462 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4463 } 4464 } 4465 4466 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4467 // The basic block and loop containing the predicated instruction. 4468 auto *PredBB = PredInst->getParent(); 4469 auto *VectorLoop = LI->getLoopFor(PredBB); 4470 4471 // Initialize a worklist with the operands of the predicated instruction. 4472 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4473 4474 // Holds instructions that we need to analyze again. An instruction may be 4475 // reanalyzed if we don't yet know if we can sink it or not. 
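// (Sinking its remaining users on a later pass may make it legal to sink
// this instruction as well, which is why we revisit it.)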
4476 SmallVector<Instruction *, 8> InstsToReanalyze; 4477 4478 // Returns true if a given use occurs in the predicated block. Phi nodes use 4479 // their operands in their corresponding predecessor blocks. 4480 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4481 auto *I = cast<Instruction>(U.getUser()); 4482 BasicBlock *BB = I->getParent(); 4483 if (auto *Phi = dyn_cast<PHINode>(I)) 4484 BB = Phi->getIncomingBlock( 4485 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4486 return BB == PredBB; 4487 }; 4488 4489 // Iteratively sink the scalarized operands of the predicated instruction 4490 // into the block we created for it. When an instruction is sunk, it's 4491 // operands are then added to the worklist. The algorithm ends after one pass 4492 // through the worklist doesn't sink a single instruction. 4493 bool Changed; 4494 do { 4495 // Add the instructions that need to be reanalyzed to the worklist, and 4496 // reset the changed indicator. 4497 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4498 InstsToReanalyze.clear(); 4499 Changed = false; 4500 4501 while (!Worklist.empty()) { 4502 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4503 4504 // We can't sink an instruction if it is a phi node, is already in the 4505 // predicated block, is not in the loop, or may have side effects. 4506 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4507 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4508 continue; 4509 4510 // It's legal to sink the instruction if all its uses occur in the 4511 // predicated block. Otherwise, there's nothing to do yet, and we may 4512 // need to reanalyze the instruction. 4513 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4514 InstsToReanalyze.push_back(I); 4515 continue; 4516 } 4517 4518 // Move the instruction to the beginning of the predicated block, and add 4519 // it's operands to the worklist. 4520 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4521 Worklist.insert(I->op_begin(), I->op_end()); 4522 4523 // The sinking may have enabled other instructions to be sunk, so we will 4524 // need to iterate. 4525 Changed = true; 4526 } 4527 } while (Changed); 4528 } 4529 4530 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4531 for (PHINode *OrigPhi : OrigPHIsToFix) { 4532 VPWidenPHIRecipe *VPPhi = 4533 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4534 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4535 // Make sure the builder has a valid insert point. 4536 Builder.SetInsertPoint(NewPhi); 4537 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4538 VPValue *Inc = VPPhi->getIncomingValue(i); 4539 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4540 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4541 } 4542 } 4543 } 4544 4545 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4546 VPUser &Operands, unsigned UF, 4547 ElementCount VF, bool IsPtrLoopInvariant, 4548 SmallBitVector &IsIndexLoopInvariant, 4549 VPTransformState &State) { 4550 // Construct a vector GEP by widening the operands of the scalar GEP as 4551 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4552 // results in a vector of pointers when at least one operand of the GEP 4553 // is vector-typed. Thus, to keep the representation compact, we only use 4554 // vector-typed operands for loop-varying values. 
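// For example, a scalar GEP such as
//   getelementptr inbounds float, float* %base, i64 %iv
// with a loop-invariant %base and a loop-varying, widened %iv becomes, for
// VF = 4,
//   getelementptr inbounds float, float* %base, <4 x i64> %vec.iv
// which yields a <4 x float*> vector of pointers (%base, %iv and %vec.iv are
// illustrative names).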
4555 4556 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4557 // If we are vectorizing, but the GEP has only loop-invariant operands, 4558 // the GEP we build (by only using vector-typed operands for 4559 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4560 // produce a vector of pointers, we need to either arbitrarily pick an 4561 // operand to broadcast, or broadcast a clone of the original GEP. 4562 // Here, we broadcast a clone of the original. 4563 // 4564 // TODO: If at some point we decide to scalarize instructions having 4565 // loop-invariant operands, this special case will no longer be 4566 // required. We would add the scalarization decision to 4567 // collectLoopScalars() and teach getVectorValue() to broadcast 4568 // the lane-zero scalar value. 4569 auto *Clone = Builder.Insert(GEP->clone()); 4570 for (unsigned Part = 0; Part < UF; ++Part) { 4571 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4572 State.set(VPDef, EntryPart, Part); 4573 addMetadata(EntryPart, GEP); 4574 } 4575 } else { 4576 // If the GEP has at least one loop-varying operand, we are sure to 4577 // produce a vector of pointers. But if we are only unrolling, we want 4578 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4579 // produce with the code below will be scalar (if VF == 1) or vector 4580 // (otherwise). Note that for the unroll-only case, we still maintain 4581 // values in the vector mapping with initVector, as we do for other 4582 // instructions. 4583 for (unsigned Part = 0; Part < UF; ++Part) { 4584 // The pointer operand of the new GEP. If it's loop-invariant, we 4585 // won't broadcast it. 4586 auto *Ptr = IsPtrLoopInvariant 4587 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4588 : State.get(Operands.getOperand(0), Part); 4589 4590 // Collect all the indices for the new GEP. If any index is 4591 // loop-invariant, we won't broadcast it. 4592 SmallVector<Value *, 4> Indices; 4593 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4594 VPValue *Operand = Operands.getOperand(I); 4595 if (IsIndexLoopInvariant[I - 1]) 4596 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4597 else 4598 Indices.push_back(State.get(Operand, Part)); 4599 } 4600 4601 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4602 // but it should be a vector, otherwise. 4603 auto *NewGEP = 4604 GEP->isInBounds() 4605 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4606 Indices) 4607 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4608 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4609 "NewGEP is not a pointer vector"); 4610 State.set(VPDef, NewGEP, Part); 4611 addMetadata(NewGEP, GEP); 4612 } 4613 } 4614 } 4615 4616 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4617 RecurrenceDescriptor *RdxDesc, 4618 VPValue *StartVPV, VPValue *Def, 4619 VPTransformState &State) { 4620 PHINode *P = cast<PHINode>(PN); 4621 if (EnableVPlanNativePath) { 4622 // Currently we enter here in the VPlan-native path for non-induction 4623 // PHIs where all control flow is uniform. We simply widen these PHIs. 4624 // Create a vector phi with no operands - the vector phi operands will be 4625 // set at the end of vector code generation. 4626 Type *VecTy = (State.VF.isScalar()) 4627 ? 
PN->getType()
4628 : VectorType::get(PN->getType(), State.VF);
4629 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4630 State.set(Def, VecPhi, 0);
4631 OrigPHIsToFix.push_back(P);
4632
4633 return;
4634 }
4635
4636 assert(PN->getParent() == OrigLoop->getHeader() &&
4637 "Non-header phis should have been handled elsewhere");
4638
4639 Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr;
4640 // In order to support recurrences we need to be able to vectorize Phi nodes.
4641 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4642 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4643 // this value when we vectorize all of the instructions that use the PHI.
4644 if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
4645 Value *Iden = nullptr;
4646 bool ScalarPHI =
4647 (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4648 Type *VecTy =
4649 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
4650
4651 if (RdxDesc) {
4652 assert(Legal->isReductionVariable(P) && StartV &&
4653 "RdxDesc should only be set for reduction variables; in that case "
4654 "a StartV is also required");
4655 RecurKind RK = RdxDesc->getRecurrenceKind();
4656 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
4657 // MinMax reductions have the start value as their identity.
4658 if (ScalarPHI) {
4659 Iden = StartV;
4660 } else {
4661 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4662 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4663 StartV = Iden =
4664 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
4665 }
4666 } else {
4667 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
4668 RK, VecTy->getScalarType());
4669 Iden = IdenC;
4670
4671 if (!ScalarPHI) {
4672 Iden = ConstantVector::getSplat(State.VF, IdenC);
4673 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4674 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4675 Constant *Zero = Builder.getInt32(0);
4676 StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
4677 }
4678 }
4679 }
4680
4681 for (unsigned Part = 0; Part < State.UF; ++Part) {
4682 // This is phase one of vectorizing PHIs.
4683 Value *EntryPart = PHINode::Create(
4684 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4685 State.set(Def, EntryPart, Part);
4686 if (StartV) {
4687 // Make sure to add the reduction start value only to the
4688 // first unroll part.
4689 Value *StartVal = (Part == 0) ? StartV : Iden;
4690 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
4691 }
4692 }
4693 return;
4694 }
4695
4696 assert(!Legal->isReductionVariable(P) &&
4697 "reductions should be handled above");
4698
4699 setDebugLocFromInst(Builder, P);
4700
4701 // This PHINode must be an induction variable.
4702 // Make sure that we know about it.
4703 assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4704
4705 InductionDescriptor II = Legal->getInductionVars().lookup(P);
4706 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4707
4708 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4709 // which can be found from the original scalar operations.
4710 switch (II.getKind()) { 4711 case InductionDescriptor::IK_NoInduction: 4712 llvm_unreachable("Unknown induction"); 4713 case InductionDescriptor::IK_IntInduction: 4714 case InductionDescriptor::IK_FpInduction: 4715 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4716 case InductionDescriptor::IK_PtrInduction: { 4717 // Handle the pointer induction variable case. 4718 assert(P->getType()->isPointerTy() && "Unexpected type."); 4719 4720 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4721 // This is the normalized GEP that starts counting at zero. 4722 Value *PtrInd = 4723 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4724 // Determine the number of scalars we need to generate for each unroll 4725 // iteration. If the instruction is uniform, we only need to generate the 4726 // first lane. Otherwise, we generate all VF values. 4727 unsigned Lanes = Cost->isUniformAfterVectorization(P, State.VF) 4728 ? 1 4729 : State.VF.getKnownMinValue(); 4730 for (unsigned Part = 0; Part < UF; ++Part) { 4731 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4732 Constant *Idx = ConstantInt::get( 4733 PtrInd->getType(), Lane + Part * State.VF.getKnownMinValue()); 4734 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4735 Value *SclrGep = 4736 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4737 SclrGep->setName("next.gep"); 4738 State.set(Def, SclrGep, VPIteration(Part, Lane)); 4739 } 4740 } 4741 return; 4742 } 4743 assert(isa<SCEVConstant>(II.getStep()) && 4744 "Induction step not a SCEV constant!"); 4745 Type *PhiType = II.getStep()->getType(); 4746 4747 // Build a pointer phi 4748 Value *ScalarStartValue = II.getStartValue(); 4749 Type *ScStValueType = ScalarStartValue->getType(); 4750 PHINode *NewPointerPhi = 4751 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4752 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4753 4754 // A pointer induction, performed by using a gep 4755 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4756 Instruction *InductionLoc = LoopLatch->getTerminator(); 4757 const SCEV *ScalarStep = II.getStep(); 4758 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4759 Value *ScalarStepValue = 4760 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4761 Value *InductionGEP = GetElementPtrInst::Create( 4762 ScStValueType->getPointerElementType(), NewPointerPhi, 4763 Builder.CreateMul( 4764 ScalarStepValue, 4765 ConstantInt::get(PhiType, State.VF.getKnownMinValue() * State.UF)), 4766 "ptr.ind", InductionLoc); 4767 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4768 4769 // Create UF many actual address geps that use the pointer 4770 // phi as base and a vectorized version of the step value 4771 // (<step*0, ..., step*N>) as offset. 4772 for (unsigned Part = 0; Part < State.UF; ++Part) { 4773 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4774 Value *StartOffset = 4775 ConstantInt::get(VecPhiType, Part * State.VF.getKnownMinValue()); 4776 // Create a vector of consecutive numbers from zero to VF. 
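// For example, with VF = 4 and UF = 2 this produces the offset vectors
// <0, 1, 2, 3> for part 0 and <4, 5, 6, 7> for part 1, which are then scaled
// by the scalar step below.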
4777 StartOffset = 4778 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4779 4780 Value *GEP = Builder.CreateGEP( 4781 ScStValueType->getPointerElementType(), NewPointerPhi, 4782 Builder.CreateMul(StartOffset, 4783 Builder.CreateVectorSplat( 4784 State.VF.getKnownMinValue(), ScalarStepValue), 4785 "vector.gep")); 4786 State.set(Def, GEP, Part); 4787 } 4788 } 4789 } 4790 } 4791 4792 /// A helper function for checking whether an integer division-related 4793 /// instruction may divide by zero (in which case it must be predicated if 4794 /// executed conditionally in the scalar code). 4795 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4796 /// Non-zero divisors that are non compile-time constants will not be 4797 /// converted into multiplication, so we will still end up scalarizing 4798 /// the division, but can do so w/o predication. 4799 static bool mayDivideByZero(Instruction &I) { 4800 assert((I.getOpcode() == Instruction::UDiv || 4801 I.getOpcode() == Instruction::SDiv || 4802 I.getOpcode() == Instruction::URem || 4803 I.getOpcode() == Instruction::SRem) && 4804 "Unexpected instruction"); 4805 Value *Divisor = I.getOperand(1); 4806 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4807 return !CInt || CInt->isZero(); 4808 } 4809 4810 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4811 VPUser &User, 4812 VPTransformState &State) { 4813 switch (I.getOpcode()) { 4814 case Instruction::Call: 4815 case Instruction::Br: 4816 case Instruction::PHI: 4817 case Instruction::GetElementPtr: 4818 case Instruction::Select: 4819 llvm_unreachable("This instruction is handled by a different recipe."); 4820 case Instruction::UDiv: 4821 case Instruction::SDiv: 4822 case Instruction::SRem: 4823 case Instruction::URem: 4824 case Instruction::Add: 4825 case Instruction::FAdd: 4826 case Instruction::Sub: 4827 case Instruction::FSub: 4828 case Instruction::FNeg: 4829 case Instruction::Mul: 4830 case Instruction::FMul: 4831 case Instruction::FDiv: 4832 case Instruction::FRem: 4833 case Instruction::Shl: 4834 case Instruction::LShr: 4835 case Instruction::AShr: 4836 case Instruction::And: 4837 case Instruction::Or: 4838 case Instruction::Xor: { 4839 // Just widen unops and binops. 4840 setDebugLocFromInst(Builder, &I); 4841 4842 for (unsigned Part = 0; Part < UF; ++Part) { 4843 SmallVector<Value *, 2> Ops; 4844 for (VPValue *VPOp : User.operands()) 4845 Ops.push_back(State.get(VPOp, Part)); 4846 4847 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4848 4849 if (auto *VecOp = dyn_cast<Instruction>(V)) 4850 VecOp->copyIRFlags(&I); 4851 4852 // Use this vector value for all users of the original instruction. 4853 State.set(Def, V, Part); 4854 addMetadata(V, &I); 4855 } 4856 4857 break; 4858 } 4859 case Instruction::ICmp: 4860 case Instruction::FCmp: { 4861 // Widen compares. Generate vector compares. 4862 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4863 auto *Cmp = cast<CmpInst>(&I); 4864 setDebugLocFromInst(Builder, Cmp); 4865 for (unsigned Part = 0; Part < UF; ++Part) { 4866 Value *A = State.get(User.getOperand(0), Part); 4867 Value *B = State.get(User.getOperand(1), Part); 4868 Value *C = nullptr; 4869 if (FCmp) { 4870 // Propagate fast math flags. 
4871 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4872 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4873 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4874 } else { 4875 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4876 } 4877 State.set(Def, C, Part); 4878 addMetadata(C, &I); 4879 } 4880 4881 break; 4882 } 4883 4884 case Instruction::ZExt: 4885 case Instruction::SExt: 4886 case Instruction::FPToUI: 4887 case Instruction::FPToSI: 4888 case Instruction::FPExt: 4889 case Instruction::PtrToInt: 4890 case Instruction::IntToPtr: 4891 case Instruction::SIToFP: 4892 case Instruction::UIToFP: 4893 case Instruction::Trunc: 4894 case Instruction::FPTrunc: 4895 case Instruction::BitCast: { 4896 auto *CI = cast<CastInst>(&I); 4897 setDebugLocFromInst(Builder, CI); 4898 4899 /// Vectorize casts. 4900 Type *DestTy = 4901 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4902 4903 for (unsigned Part = 0; Part < UF; ++Part) { 4904 Value *A = State.get(User.getOperand(0), Part); 4905 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4906 State.set(Def, Cast, Part); 4907 addMetadata(Cast, &I); 4908 } 4909 break; 4910 } 4911 default: 4912 // This instruction is not vectorized by simple widening. 4913 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4914 llvm_unreachable("Unhandled instruction!"); 4915 } // end of switch. 4916 } 4917 4918 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4919 VPUser &ArgOperands, 4920 VPTransformState &State) { 4921 assert(!isa<DbgInfoIntrinsic>(I) && 4922 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4923 setDebugLocFromInst(Builder, &I); 4924 4925 Module *M = I.getParent()->getParent()->getParent(); 4926 auto *CI = cast<CallInst>(&I); 4927 4928 SmallVector<Type *, 4> Tys; 4929 for (Value *ArgOperand : CI->arg_operands()) 4930 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4931 4932 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4933 4934 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4935 // version of the instruction. 4936 // Is it beneficial to perform intrinsic call compared to lib call? 4937 bool NeedToScalarize = false; 4938 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4939 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4940 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4941 assert((UseVectorIntrinsic || !NeedToScalarize) && 4942 "Instruction should be scalarized elsewhere."); 4943 assert(IntrinsicCost.isValid() && CallCost.isValid() && 4944 "Cannot have invalid costs while widening"); 4945 4946 for (unsigned Part = 0; Part < UF; ++Part) { 4947 SmallVector<Value *, 4> Args; 4948 for (auto &I : enumerate(ArgOperands.operands())) { 4949 // Some intrinsics have a scalar argument - don't replace it with a 4950 // vector. 4951 Value *Arg; 4952 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4953 Arg = State.get(I.value(), Part); 4954 else 4955 Arg = State.get(I.value(), VPIteration(0, 0)); 4956 Args.push_back(Arg); 4957 } 4958 4959 Function *VectorF; 4960 if (UseVectorIntrinsic) { 4961 // Use vector version of the intrinsic. 
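// For example, with VF = 4 a call to llvm.fabs.f64 is re-declared and called
// as llvm.fabs.v4f64.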
4962 Type *TysForDecl[] = {CI->getType()}; 4963 if (VF.isVector()) 4964 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4965 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4966 assert(VectorF && "Can't retrieve vector intrinsic."); 4967 } else { 4968 // Use vector version of the function call. 4969 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4970 #ifndef NDEBUG 4971 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4972 "Can't create vector function."); 4973 #endif 4974 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4975 } 4976 SmallVector<OperandBundleDef, 1> OpBundles; 4977 CI->getOperandBundlesAsDefs(OpBundles); 4978 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4979 4980 if (isa<FPMathOperator>(V)) 4981 V->copyFastMathFlags(CI); 4982 4983 State.set(Def, V, Part); 4984 addMetadata(V, &I); 4985 } 4986 } 4987 4988 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 4989 VPUser &Operands, 4990 bool InvariantCond, 4991 VPTransformState &State) { 4992 setDebugLocFromInst(Builder, &I); 4993 4994 // The condition can be loop invariant but still defined inside the 4995 // loop. This means that we can't just use the original 'cond' value. 4996 // We have to take the 'vectorized' value and pick the first lane. 4997 // Instcombine will make this a no-op. 4998 auto *InvarCond = InvariantCond 4999 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5000 : nullptr; 5001 5002 for (unsigned Part = 0; Part < UF; ++Part) { 5003 Value *Cond = 5004 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5005 Value *Op0 = State.get(Operands.getOperand(1), Part); 5006 Value *Op1 = State.get(Operands.getOperand(2), Part); 5007 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5008 State.set(VPDef, Sel, Part); 5009 addMetadata(Sel, &I); 5010 } 5011 } 5012 5013 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5014 // We should not collect Scalars more than once per VF. Right now, this 5015 // function is called from collectUniformsAndScalars(), which already does 5016 // this check. Collecting Scalars for VF=1 does not make any sense. 5017 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5018 "This function should not be visited twice for the same VF"); 5019 5020 SmallSetVector<Instruction *, 8> Worklist; 5021 5022 // These sets are used to seed the analysis with pointers used by memory 5023 // accesses that will remain scalar. 5024 SmallSetVector<Instruction *, 8> ScalarPtrs; 5025 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5026 auto *Latch = TheLoop->getLoopLatch(); 5027 5028 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5029 // The pointer operands of loads and stores will be scalar as long as the 5030 // memory access is not a gather or scatter operation. The value operand of a 5031 // store will remain scalar if the store is scalarized. 
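// For example, a consecutive (unit-stride) access keeps a scalar address that
// is computed once per unroll part, whereas a gather or scatter consumes a
// full vector of pointers.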
5032 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 5033 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 5034 assert(WideningDecision != CM_Unknown && 5035 "Widening decision should be ready at this moment"); 5036 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 5037 if (Ptr == Store->getValueOperand()) 5038 return WideningDecision == CM_Scalarize; 5039 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 5040 "Ptr is neither a value or pointer operand"); 5041 return WideningDecision != CM_GatherScatter; 5042 }; 5043 5044 // A helper that returns true if the given value is a bitcast or 5045 // getelementptr instruction contained in the loop. 5046 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 5047 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 5048 isa<GetElementPtrInst>(V)) && 5049 !TheLoop->isLoopInvariant(V); 5050 }; 5051 5052 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 5053 if (!isa<PHINode>(Ptr) || 5054 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 5055 return false; 5056 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 5057 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 5058 return false; 5059 return isScalarUse(MemAccess, Ptr); 5060 }; 5061 5062 // A helper that evaluates a memory access's use of a pointer. If the 5063 // pointer is actually the pointer induction of a loop, it is being 5064 // inserted into Worklist. If the use will be a scalar use, and the 5065 // pointer is only used by memory accesses, we place the pointer in 5066 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5067 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5068 if (isScalarPtrInduction(MemAccess, Ptr)) { 5069 Worklist.insert(cast<Instruction>(Ptr)); 5070 Instruction *Update = cast<Instruction>( 5071 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5072 Worklist.insert(Update); 5073 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5074 << "\n"); 5075 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 5076 << "\n"); 5077 return; 5078 } 5079 // We only care about bitcast and getelementptr instructions contained in 5080 // the loop. 5081 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5082 return; 5083 5084 // If the pointer has already been identified as scalar (e.g., if it was 5085 // also identified as uniform), there's nothing to do. 5086 auto *I = cast<Instruction>(Ptr); 5087 if (Worklist.count(I)) 5088 return; 5089 5090 // If the use of the pointer will be a scalar use, and all users of the 5091 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5092 // place the pointer in PossibleNonScalarPtrs. 5093 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5094 return isa<LoadInst>(U) || isa<StoreInst>(U); 5095 })) 5096 ScalarPtrs.insert(I); 5097 else 5098 PossibleNonScalarPtrs.insert(I); 5099 }; 5100 5101 // We seed the scalars analysis with three classes of instructions: (1) 5102 // instructions marked uniform-after-vectorization and (2) bitcast, 5103 // getelementptr and (pointer) phi instructions used by memory accesses 5104 // requiring a scalar use. 5105 // 5106 // (1) Add to the worklist all instructions that have been identified as 5107 // uniform-after-vectorization. 
5108 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5109 5110 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5111 // memory accesses requiring a scalar use. The pointer operands of loads and 5112 // stores will be scalar as long as the memory accesses is not a gather or 5113 // scatter operation. The value operand of a store will remain scalar if the 5114 // store is scalarized. 5115 for (auto *BB : TheLoop->blocks()) 5116 for (auto &I : *BB) { 5117 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5118 evaluatePtrUse(Load, Load->getPointerOperand()); 5119 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5120 evaluatePtrUse(Store, Store->getPointerOperand()); 5121 evaluatePtrUse(Store, Store->getValueOperand()); 5122 } 5123 } 5124 for (auto *I : ScalarPtrs) 5125 if (!PossibleNonScalarPtrs.count(I)) { 5126 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5127 Worklist.insert(I); 5128 } 5129 5130 // Insert the forced scalars. 5131 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5132 // induction variable when the PHI user is scalarized. 5133 auto ForcedScalar = ForcedScalars.find(VF); 5134 if (ForcedScalar != ForcedScalars.end()) 5135 for (auto *I : ForcedScalar->second) 5136 Worklist.insert(I); 5137 5138 // Expand the worklist by looking through any bitcasts and getelementptr 5139 // instructions we've already identified as scalar. This is similar to the 5140 // expansion step in collectLoopUniforms(); however, here we're only 5141 // expanding to include additional bitcasts and getelementptr instructions. 5142 unsigned Idx = 0; 5143 while (Idx != Worklist.size()) { 5144 Instruction *Dst = Worklist[Idx++]; 5145 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5146 continue; 5147 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5148 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5149 auto *J = cast<Instruction>(U); 5150 return !TheLoop->contains(J) || Worklist.count(J) || 5151 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5152 isScalarUse(J, Src)); 5153 })) { 5154 Worklist.insert(Src); 5155 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5156 } 5157 } 5158 5159 // An induction variable will remain scalar if all users of the induction 5160 // variable and induction variable update remain scalar. 5161 for (auto &Induction : Legal->getInductionVars()) { 5162 auto *Ind = Induction.first; 5163 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5164 5165 // If tail-folding is applied, the primary induction variable will be used 5166 // to feed a vector compare. 5167 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5168 continue; 5169 5170 // Determine if all users of the induction variable are scalar after 5171 // vectorization. 5172 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5173 auto *I = cast<Instruction>(U); 5174 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5175 }); 5176 if (!ScalarInd) 5177 continue; 5178 5179 // Determine if all users of the induction variable update instruction are 5180 // scalar after vectorization. 5181 auto ScalarIndUpdate = 5182 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5183 auto *I = cast<Instruction>(U); 5184 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5185 }); 5186 if (!ScalarIndUpdate) 5187 continue; 5188 5189 // The induction variable and its update instruction will remain scalar. 
5190 Worklist.insert(Ind); 5191 Worklist.insert(IndUpdate); 5192 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5193 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5194 << "\n"); 5195 } 5196 5197 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5198 } 5199 5200 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 5201 ElementCount VF) { 5202 if (!blockNeedsPredication(I->getParent())) 5203 return false; 5204 switch(I->getOpcode()) { 5205 default: 5206 break; 5207 case Instruction::Load: 5208 case Instruction::Store: { 5209 if (!Legal->isMaskRequired(I)) 5210 return false; 5211 auto *Ptr = getLoadStorePointerOperand(I); 5212 auto *Ty = getMemInstValueType(I); 5213 // We have already decided how to vectorize this instruction, get that 5214 // result. 5215 if (VF.isVector()) { 5216 InstWidening WideningDecision = getWideningDecision(I, VF); 5217 assert(WideningDecision != CM_Unknown && 5218 "Widening decision should be ready at this moment"); 5219 return WideningDecision == CM_Scalarize; 5220 } 5221 const Align Alignment = getLoadStoreAlignment(I); 5222 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5223 isLegalMaskedGather(Ty, Alignment)) 5224 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5225 isLegalMaskedScatter(Ty, Alignment)); 5226 } 5227 case Instruction::UDiv: 5228 case Instruction::SDiv: 5229 case Instruction::SRem: 5230 case Instruction::URem: 5231 return mayDivideByZero(*I); 5232 } 5233 return false; 5234 } 5235 5236 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5237 Instruction *I, ElementCount VF) { 5238 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5239 assert(getWideningDecision(I, VF) == CM_Unknown && 5240 "Decision should not be set yet."); 5241 auto *Group = getInterleavedAccessGroup(I); 5242 assert(Group && "Must have a group."); 5243 5244 // If the instruction's allocated size doesn't equal it's type size, it 5245 // requires padding and will be scalarized. 5246 auto &DL = I->getModule()->getDataLayout(); 5247 auto *ScalarTy = getMemInstValueType(I); 5248 if (hasIrregularType(ScalarTy, DL)) 5249 return false; 5250 5251 // Check if masking is required. 5252 // A Group may need masking for one of two reasons: it resides in a block that 5253 // needs predication, or it was decided to use masking to deal with gaps. 5254 bool PredicatedAccessRequiresMasking = 5255 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5256 bool AccessWithGapsRequiresMasking = 5257 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5258 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5259 return true; 5260 5261 // If masked interleaving is required, we expect that the user/target had 5262 // enabled it, because otherwise it either wouldn't have been created or 5263 // it should have been invalidated by the CostModel. 5264 assert(useMaskedInterleavedAccesses(TTI) && 5265 "Masked interleave-groups for predicated accesses are not enabled."); 5266 5267 auto *Ty = getMemInstValueType(I); 5268 const Align Alignment = getLoadStoreAlignment(I); 5269 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5270 : TTI.isLegalMaskedStore(Ty, Alignment); 5271 } 5272 5273 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5274 Instruction *I, ElementCount VF) { 5275 // Get and ensure we have a valid memory instruction. 
5276 LoadInst *LI = dyn_cast<LoadInst>(I); 5277 StoreInst *SI = dyn_cast<StoreInst>(I); 5278 assert((LI || SI) && "Invalid memory instruction"); 5279 5280 auto *Ptr = getLoadStorePointerOperand(I); 5281 5282 // In order to be widened, the pointer should be consecutive, first of all. 5283 if (!Legal->isConsecutivePtr(Ptr)) 5284 return false; 5285 5286 // If the instruction is a store located in a predicated block, it will be 5287 // scalarized. 5288 if (isScalarWithPredication(I)) 5289 return false; 5290 5291 // If the instruction's allocated size doesn't equal it's type size, it 5292 // requires padding and will be scalarized. 5293 auto &DL = I->getModule()->getDataLayout(); 5294 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5295 if (hasIrregularType(ScalarTy, DL)) 5296 return false; 5297 5298 return true; 5299 } 5300 5301 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5302 // We should not collect Uniforms more than once per VF. Right now, 5303 // this function is called from collectUniformsAndScalars(), which 5304 // already does this check. Collecting Uniforms for VF=1 does not make any 5305 // sense. 5306 5307 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5308 "This function should not be visited twice for the same VF"); 5309 5310 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5311 // not analyze again. Uniforms.count(VF) will return 1. 5312 Uniforms[VF].clear(); 5313 5314 // We now know that the loop is vectorizable! 5315 // Collect instructions inside the loop that will remain uniform after 5316 // vectorization. 5317 5318 // Global values, params and instructions outside of current loop are out of 5319 // scope. 5320 auto isOutOfScope = [&](Value *V) -> bool { 5321 Instruction *I = dyn_cast<Instruction>(V); 5322 return (!I || !TheLoop->contains(I)); 5323 }; 5324 5325 SetVector<Instruction *> Worklist; 5326 BasicBlock *Latch = TheLoop->getLoopLatch(); 5327 5328 // Instructions that are scalar with predication must not be considered 5329 // uniform after vectorization, because that would create an erroneous 5330 // replicating region where only a single instance out of VF should be formed. 5331 // TODO: optimize such seldom cases if found important, see PR40816. 5332 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5333 if (isOutOfScope(I)) { 5334 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5335 << *I << "\n"); 5336 return; 5337 } 5338 if (isScalarWithPredication(I, VF)) { 5339 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5340 << *I << "\n"); 5341 return; 5342 } 5343 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5344 Worklist.insert(I); 5345 }; 5346 5347 // Start with the conditional branch. If the branch condition is an 5348 // instruction contained in the loop that is only used by the branch, it is 5349 // uniform. 5350 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5351 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5352 addToWorklistIfAllowed(Cmp); 5353 5354 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5355 InstWidening WideningDecision = getWideningDecision(I, VF); 5356 assert(WideningDecision != CM_Unknown && 5357 "Widening decision should be ready at this moment"); 5358 5359 // A uniform memory op is itself uniform. We exclude uniform stores 5360 // here as they demand the last lane, not the first one. 
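// For example, a load from a loop-invariant address produces the same value
// in every lane, so only lane 0 is needed, whereas a store to a loop-invariant
// address must keep the value from the final iteration, i.e. the last lane.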
5361 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5362 assert(WideningDecision == CM_Scalarize); 5363 return true; 5364 } 5365 5366 return (WideningDecision == CM_Widen || 5367 WideningDecision == CM_Widen_Reverse || 5368 WideningDecision == CM_Interleave); 5369 }; 5370 5371 5372 // Returns true if Ptr is the pointer operand of a memory access instruction 5373 // I, and I is known to not require scalarization. 5374 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5375 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5376 }; 5377 5378 // Holds a list of values which are known to have at least one uniform use. 5379 // Note that there may be other uses which aren't uniform. A "uniform use" 5380 // here is something which only demands lane 0 of the unrolled iterations; 5381 // it does not imply that all lanes produce the same value (e.g. this is not 5382 // the usual meaning of uniform) 5383 SmallPtrSet<Value *, 8> HasUniformUse; 5384 5385 // Scan the loop for instructions which are either a) known to have only 5386 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5387 for (auto *BB : TheLoop->blocks()) 5388 for (auto &I : *BB) { 5389 // If there's no pointer operand, there's nothing to do. 5390 auto *Ptr = getLoadStorePointerOperand(&I); 5391 if (!Ptr) 5392 continue; 5393 5394 // A uniform memory op is itself uniform. We exclude uniform stores 5395 // here as they demand the last lane, not the first one. 5396 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5397 addToWorklistIfAllowed(&I); 5398 5399 if (isUniformDecision(&I, VF)) { 5400 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5401 HasUniformUse.insert(Ptr); 5402 } 5403 } 5404 5405 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5406 // demanding) users. Since loops are assumed to be in LCSSA form, this 5407 // disallows uses outside the loop as well. 5408 for (auto *V : HasUniformUse) { 5409 if (isOutOfScope(V)) 5410 continue; 5411 auto *I = cast<Instruction>(V); 5412 auto UsersAreMemAccesses = 5413 llvm::all_of(I->users(), [&](User *U) -> bool { 5414 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5415 }); 5416 if (UsersAreMemAccesses) 5417 addToWorklistIfAllowed(I); 5418 } 5419 5420 // Expand Worklist in topological order: whenever a new instruction 5421 // is added , its users should be already inside Worklist. It ensures 5422 // a uniform instruction will only be used by uniform instructions. 5423 unsigned idx = 0; 5424 while (idx != Worklist.size()) { 5425 Instruction *I = Worklist[idx++]; 5426 5427 for (auto OV : I->operand_values()) { 5428 // isOutOfScope operands cannot be uniform instructions. 5429 if (isOutOfScope(OV)) 5430 continue; 5431 // First order recurrence Phi's should typically be considered 5432 // non-uniform. 5433 auto *OP = dyn_cast<PHINode>(OV); 5434 if (OP && Legal->isFirstOrderRecurrence(OP)) 5435 continue; 5436 // If all the users of the operand are uniform, then add the 5437 // operand into the uniform worklist. 5438 auto *OI = cast<Instruction>(OV); 5439 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5440 auto *J = cast<Instruction>(U); 5441 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5442 })) 5443 addToWorklistIfAllowed(OI); 5444 } 5445 } 5446 5447 // For an instruction to be added into Worklist above, all its users inside 5448 // the loop should also be in Worklist. 
However, this condition cannot be 5449 // true for phi nodes that form a cyclic dependence. We must process phi 5450 // nodes separately. An induction variable will remain uniform if all users 5451 // of the induction variable and induction variable update remain uniform. 5452 // The code below handles both pointer and non-pointer induction variables. 5453 for (auto &Induction : Legal->getInductionVars()) { 5454 auto *Ind = Induction.first; 5455 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5456 5457 // Determine if all users of the induction variable are uniform after 5458 // vectorization. 5459 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5460 auto *I = cast<Instruction>(U); 5461 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5462 isVectorizedMemAccessUse(I, Ind); 5463 }); 5464 if (!UniformInd) 5465 continue; 5466 5467 // Determine if all users of the induction variable update instruction are 5468 // uniform after vectorization. 5469 auto UniformIndUpdate = 5470 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5471 auto *I = cast<Instruction>(U); 5472 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5473 isVectorizedMemAccessUse(I, IndUpdate); 5474 }); 5475 if (!UniformIndUpdate) 5476 continue; 5477 5478 // The induction variable and its update instruction will remain uniform. 5479 addToWorklistIfAllowed(Ind); 5480 addToWorklistIfAllowed(IndUpdate); 5481 } 5482 5483 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5484 } 5485 5486 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5487 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5488 5489 if (Legal->getRuntimePointerChecking()->Need) { 5490 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5491 "runtime pointer checks needed. Enable vectorization of this " 5492 "loop with '#pragma clang loop vectorize(enable)' when " 5493 "compiling with -Os/-Oz", 5494 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5495 return true; 5496 } 5497 5498 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5499 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5500 "runtime SCEV checks needed. Enable vectorization of this " 5501 "loop with '#pragma clang loop vectorize(enable)' when " 5502 "compiling with -Os/-Oz", 5503 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5504 return true; 5505 } 5506 5507 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5508 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5509 reportVectorizationFailure("Runtime stride check for small trip count", 5510 "runtime stride == 1 checks needed. Enable vectorization of " 5511 "this loop without such check by compiling with -Os/-Oz", 5512 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5513 return true; 5514 } 5515 5516 return false; 5517 } 5518 5519 Optional<ElementCount> 5520 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5521 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5522 // TODO: It may by useful to do since it's still likely to be dynamically 5523 // uniform if the target can skip. 5524 reportVectorizationFailure( 5525 "Not inserting runtime ptr check for divergent target", 5526 "runtime pointer checks needed. 
Not enabled for divergent target",
5527 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5528 return None;
5529 }
5530
5531 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5532 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5533 if (TC == 1) {
5534 reportVectorizationFailure("Single iteration (non) loop",
5535 "loop trip count is one, irrelevant for vectorization",
5536 "SingleIterationLoop", ORE, TheLoop);
5537 return None;
5538 }
5539
5540 switch (ScalarEpilogueStatus) {
5541 case CM_ScalarEpilogueAllowed:
5542 return computeFeasibleMaxVF(TC, UserVF);
5543 case CM_ScalarEpilogueNotAllowedUsePredicate:
5544 LLVM_FALLTHROUGH;
5545 case CM_ScalarEpilogueNotNeededUsePredicate:
5546 LLVM_DEBUG(
5547 dbgs() << "LV: vector predicate hint/switch found.\n"
5548 << "LV: Not allowing scalar epilogue, creating predicated "
5549 << "vector loop.\n");
5550 break;
5551 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5552 // fallthrough as a special case of OptForSize
5553 case CM_ScalarEpilogueNotAllowedOptSize:
5554 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5555 LLVM_DEBUG(
5556 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5557 else
5558 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5559 << "count.\n");
5560
5561 // Bail if runtime checks are required, which are not good when optimising
5562 // for size.
5563 if (runtimeChecksRequired())
5564 return None;
5565
5566 break;
5567 }
5568
5569 // The only loops we can vectorize without a scalar epilogue are loops with
5570 // a bottom-test and a single exiting block. We'd have to handle the fact
5571 // that not every instruction executes on the last iteration. This will
5572 // require a lane mask which varies through the vector loop body. (TODO)
5573 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5574 // If there was a tail-folding hint/switch, but we can't fold the tail by
5575 // masking, fall back to a vectorization with a scalar epilogue.
5576 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5577 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5578 "scalar epilogue instead.\n");
5579 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5580 return computeFeasibleMaxVF(TC, UserVF);
5581 }
5582 return None;
5583 }
5584
5585 // Now try the tail folding.
5586
5587 // Invalidate interleave groups that require an epilogue if we can't mask
5588 // the interleave-group.
5589 if (!useMaskedInterleavedAccesses(TTI)) {
5590 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5591 "No decisions should have been taken at this point");
5592 // Note: There is no need to invalidate any cost modeling decisions here, as
5593 // none were taken so far.
5594 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5595 }
5596
5597 ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5598 assert(!MaxVF.isScalable() &&
5599 "Scalable vectors do not yet support tail folding");
5600 assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5601 "MaxVF must be a power of 2");
5602 unsigned MaxVFtimesIC =
5603 UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5604 // Avoid tail folding if the trip count is known to be a multiple of any VF we
5605 // choose.
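// For example (illustrative numbers only): with a constant trip count of 64,
// MaxVF = 8 and UserIC = 2, MaxVFtimesIC is 16 and the remainder 64 % 16 is
// zero, so no tail remains and MaxVF can be returned without folding the tail.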
5606 ScalarEvolution *SE = PSE.getSE(); 5607 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5608 const SCEV *ExitCount = SE->getAddExpr( 5609 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5610 const SCEV *Rem = SE->getURemExpr( 5611 SE->applyLoopGuards(ExitCount, TheLoop), 5612 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5613 if (Rem->isZero()) { 5614 // Accept MaxVF if we do not have a tail. 5615 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5616 return MaxVF; 5617 } 5618 5619 // If we don't know the precise trip count, or if the trip count that we 5620 // found modulo the vectorization factor is not zero, try to fold the tail 5621 // by masking. 5622 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5623 if (Legal->prepareToFoldTailByMasking()) { 5624 FoldTailByMasking = true; 5625 return MaxVF; 5626 } 5627 5628 // If there was a tail-folding hint/switch, but we can't fold the tail by 5629 // masking, fallback to a vectorization with a scalar epilogue. 5630 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5631 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5632 "scalar epilogue instead.\n"); 5633 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5634 return MaxVF; 5635 } 5636 5637 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5638 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5639 return None; 5640 } 5641 5642 if (TC == 0) { 5643 reportVectorizationFailure( 5644 "Unable to calculate the loop count due to complex control flow", 5645 "unable to calculate the loop count due to complex control flow", 5646 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5647 return None; 5648 } 5649 5650 reportVectorizationFailure( 5651 "Cannot optimize for size and vectorize at the same time.", 5652 "cannot optimize for size and vectorize at the same time. " 5653 "Enable vectorization of this loop with '#pragma clang loop " 5654 "vectorize(enable)' when compiling with -Os/-Oz", 5655 "NoTailLoopWithOptForSize", ORE, TheLoop); 5656 return None; 5657 } 5658 5659 ElementCount 5660 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5661 ElementCount UserVF) { 5662 bool IgnoreScalableUserVF = UserVF.isScalable() && 5663 !TTI.supportsScalableVectors() && 5664 !ForceTargetSupportsScalableVectors; 5665 if (IgnoreScalableUserVF) { 5666 LLVM_DEBUG( 5667 dbgs() << "LV: Ignoring VF=" << UserVF 5668 << " because target does not support scalable vectors.\n"); 5669 ORE->emit([&]() { 5670 return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", 5671 TheLoop->getStartLoc(), 5672 TheLoop->getHeader()) 5673 << "Ignoring VF=" << ore::NV("UserVF", UserVF) 5674 << " because target does not support scalable vectors."; 5675 }); 5676 } 5677 5678 // Beyond this point two scenarios are handled. If UserVF isn't specified 5679 // then a suitable VF is chosen. If UserVF is specified and there are 5680 // dependencies, check if it's legal. However, if a UserVF is specified and 5681 // there are no dependencies, then there's nothing to do. 5682 if (UserVF.isNonZero() && !IgnoreScalableUserVF) { 5683 if (!canVectorizeReductions(UserVF)) { 5684 reportVectorizationFailure( 5685 "LV: Scalable vectorization not supported for the reduction " 5686 "operations found in this loop. 
Using fixed-width " 5687 "vectorization instead.", 5688 "Scalable vectorization not supported for the reduction operations " 5689 "found in this loop. Using fixed-width vectorization instead.", 5690 "ScalableVFUnfeasible", ORE, TheLoop); 5691 return computeFeasibleMaxVF( 5692 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); 5693 } 5694 5695 if (Legal->isSafeForAnyVectorWidth()) 5696 return UserVF; 5697 } 5698 5699 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5700 unsigned SmallestType, WidestType; 5701 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5702 unsigned WidestRegister = 5703 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 5704 .getFixedSize(); 5705 5706 // Get the maximum safe dependence distance in bits computed by LAA. 5707 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5708 // the memory accesses that is most restrictive (involved in the smallest 5709 // dependence distance). 5710 unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); 5711 5712 // If the user vectorization factor is legally unsafe, clamp it to a safe 5713 // value. Otherwise, return as is. 5714 if (UserVF.isNonZero() && !IgnoreScalableUserVF) { 5715 unsigned MaxSafeElements = 5716 PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); 5717 ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); 5718 5719 if (UserVF.isScalable()) { 5720 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5721 5722 // Scale VF by vscale before checking if it's safe. 5723 MaxSafeVF = ElementCount::getScalable( 5724 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5725 5726 if (MaxSafeVF.isZero()) { 5727 // The dependence distance is too small to use scalable vectors, 5728 // fallback on fixed. 5729 LLVM_DEBUG( 5730 dbgs() 5731 << "LV: Max legal vector width too small, scalable vectorization " 5732 "unfeasible. Using fixed-width vectorization instead.\n"); 5733 ORE->emit([&]() { 5734 return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", 5735 TheLoop->getStartLoc(), 5736 TheLoop->getHeader()) 5737 << "Max legal vector width too small, scalable vectorization " 5738 << "unfeasible. Using fixed-width vectorization instead."; 5739 }); 5740 return computeFeasibleMaxVF( 5741 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); 5742 } 5743 } 5744 5745 LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); 5746 5747 if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) 5748 return UserVF; 5749 5750 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5751 << " is unsafe, clamping to max safe VF=" << MaxSafeVF 5752 << ".\n"); 5753 ORE->emit([&]() { 5754 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5755 TheLoop->getStartLoc(), 5756 TheLoop->getHeader()) 5757 << "User-specified vectorization factor " 5758 << ore::NV("UserVectorizationFactor", UserVF) 5759 << " is unsafe, clamping to maximum safe vectorization factor " 5760 << ore::NV("VectorizationFactor", MaxSafeVF); 5761 }); 5762 return MaxSafeVF; 5763 } 5764 5765 WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); 5766 5767 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5768 // Note that both WidestRegister and WidestType may not be a powers of 2. 
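// For instance (a hypothetical target, numbers illustrative): if the safe
// dependence distance clamps WidestRegister to 96 bits and WidestType is
// 32 bits, then 96 / 32 = 3 elements, which PowerOf2Floor below reduces to a
// fixed VF of 2.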
5769 auto MaxVectorSize = 5770 ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType)); 5771 5772 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5773 << " / " << WidestType << " bits.\n"); 5774 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5775 << WidestRegister << " bits.\n"); 5776 5777 assert(MaxVectorSize.getFixedValue() <= WidestRegister && 5778 "Did not expect to pack so many elements" 5779 " into one vector!"); 5780 if (MaxVectorSize.getFixedValue() == 0) { 5781 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5782 return ElementCount::getFixed(1); 5783 } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() && 5784 isPowerOf2_32(ConstTripCount)) { 5785 // We need to clamp the VF to be the ConstTripCount. There is no point in 5786 // choosing a higher viable VF as done in the loop below. 5787 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5788 << ConstTripCount << "\n"); 5789 return ElementCount::getFixed(ConstTripCount); 5790 } 5791 5792 ElementCount MaxVF = MaxVectorSize; 5793 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5794 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5795 // Collect all viable vectorization factors larger than the default MaxVF 5796 // (i.e. MaxVectorSize). 5797 SmallVector<ElementCount, 8> VFs; 5798 auto MaxVectorSizeMaxBW = 5799 ElementCount::getFixed(WidestRegister / SmallestType); 5800 for (ElementCount VS = MaxVectorSize * 2; 5801 ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2) 5802 VFs.push_back(VS); 5803 5804 // For each VF calculate its register usage. 5805 auto RUs = calculateRegisterUsage(VFs); 5806 5807 // Select the largest VF which doesn't require more registers than existing 5808 // ones. 5809 for (int i = RUs.size() - 1; i >= 0; --i) { 5810 bool Selected = true; 5811 for (auto &pair : RUs[i].MaxLocalUsers) { 5812 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5813 if (pair.second > TargetNumRegisters) 5814 Selected = false; 5815 } 5816 if (Selected) { 5817 MaxVF = VFs[i]; 5818 break; 5819 } 5820 } 5821 if (ElementCount MinVF = 5822 TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) { 5823 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5824 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5825 << ") with target's minimum: " << MinVF << '\n'); 5826 MaxVF = MinVF; 5827 } 5828 } 5829 } 5830 return MaxVF; 5831 } 5832 5833 VectorizationFactor 5834 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 5835 // FIXME: This can be fixed for scalable vectors later, because at this stage 5836 // the LoopVectorizer will only consider vectorizing a loop with scalable 5837 // vectors when the loop has a hint to enable vectorization for a given VF. 5838 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 5839 5840 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5841 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5842 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5843 5844 auto Width = ElementCount::getFixed(1); 5845 const float ScalarCost = *ExpectedCost.getValue(); 5846 float Cost = ScalarCost; 5847 5848 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5849 if (ForceVectorization && MaxVF.isVector()) { 5850 // Ignore scalar width, because the user explicitly wants vectorization. 
5851 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5852 // evaluation. 5853 Cost = std::numeric_limits<float>::max(); 5854 } 5855 5856 for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF); 5857 i *= 2) { 5858 // Notice that the vector loop needs to be executed less times, so 5859 // we need to divide the cost of the vector loops by the width of 5860 // the vector elements. 5861 VectorizationCostTy C = expectedCost(i); 5862 assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); 5863 float VectorCost = *C.first.getValue() / (float)i.getFixedValue(); 5864 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5865 << " costs: " << (int)VectorCost << ".\n"); 5866 if (!C.second && !ForceVectorization) { 5867 LLVM_DEBUG( 5868 dbgs() << "LV: Not considering vector loop of width " << i 5869 << " because it will not generate any vector instructions.\n"); 5870 continue; 5871 } 5872 5873 // If profitable add it to ProfitableVF list. 5874 if (VectorCost < ScalarCost) { 5875 ProfitableVFs.push_back(VectorizationFactor( 5876 {i, (unsigned)VectorCost})); 5877 } 5878 5879 if (VectorCost < Cost) { 5880 Cost = VectorCost; 5881 Width = i; 5882 } 5883 } 5884 5885 if (!EnableCondStoresVectorization && NumPredStores) { 5886 reportVectorizationFailure("There are conditional stores.", 5887 "store that is conditionally executed prevents vectorization", 5888 "ConditionalStore", ORE, TheLoop); 5889 Width = ElementCount::getFixed(1); 5890 Cost = ScalarCost; 5891 } 5892 5893 LLVM_DEBUG(if (ForceVectorization && !Width.isScalar() && Cost >= ScalarCost) dbgs() 5894 << "LV: Vectorization seems to be not beneficial, " 5895 << "but was forced by a user.\n"); 5896 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5897 VectorizationFactor Factor = {Width, 5898 (unsigned)(Width.getKnownMinValue() * Cost)}; 5899 return Factor; 5900 } 5901 5902 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5903 const Loop &L, ElementCount VF) const { 5904 // Cross iteration phis such as reductions need special handling and are 5905 // currently unsupported. 5906 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5907 return Legal->isFirstOrderRecurrence(&Phi) || 5908 Legal->isReductionVariable(&Phi); 5909 })) 5910 return false; 5911 5912 // Phis with uses outside of the loop require special handling and are 5913 // currently unsupported. 5914 for (auto &Entry : Legal->getInductionVars()) { 5915 // Look for uses of the value of the induction at the last iteration. 5916 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5917 for (User *U : PostInc->users()) 5918 if (!L.contains(cast<Instruction>(U))) 5919 return false; 5920 // Look for uses of penultimate value of the induction. 5921 for (User *U : Entry.first->users()) 5922 if (!L.contains(cast<Instruction>(U))) 5923 return false; 5924 } 5925 5926 // Induction variables that are widened require special handling that is 5927 // currently not supported. 
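// A minimal example of a loop rejected by the check below (assuming the store
// of the induction variable is vectorized, so 'i' itself must be widened):
//
//   for (int i = 0; i < n; ++i)
//     a[i] = i;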
5928 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5929 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5930 this->isProfitableToScalarize(Entry.first, VF)); 5931 })) 5932 return false; 5933 5934 return true; 5935 } 5936 5937 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5938 const ElementCount VF) const { 5939 // FIXME: We need a much better cost-model to take different parameters such 5940 // as register pressure, code size increase and cost of extra branches into 5941 // account. For now we apply a very crude heuristic and only consider loops 5942 // with vectorization factors larger than a certain value. 5943 // We also consider epilogue vectorization unprofitable for targets that don't 5944 // consider interleaving beneficial (eg. MVE). 5945 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5946 return false; 5947 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5948 return true; 5949 return false; 5950 } 5951 5952 VectorizationFactor 5953 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5954 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5955 VectorizationFactor Result = VectorizationFactor::Disabled(); 5956 if (!EnableEpilogueVectorization) { 5957 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5958 return Result; 5959 } 5960 5961 if (!isScalarEpilogueAllowed()) { 5962 LLVM_DEBUG( 5963 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5964 "allowed.\n";); 5965 return Result; 5966 } 5967 5968 // FIXME: This can be fixed for scalable vectors later, because at this stage 5969 // the LoopVectorizer will only consider vectorizing a loop with scalable 5970 // vectors when the loop has a hint to enable vectorization for a given VF. 5971 if (MainLoopVF.isScalable()) { 5972 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 5973 "yet supported.\n"); 5974 return Result; 5975 } 5976 5977 // Not really a cost consideration, but check for unsupported cases here to 5978 // simplify the logic. 
5979 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5980 LLVM_DEBUG( 5981 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5982 "not a supported candidate.\n";); 5983 return Result; 5984 } 5985 5986 if (EpilogueVectorizationForceVF > 1) { 5987 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5988 if (LVP.hasPlanWithVFs( 5989 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 5990 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 5991 else { 5992 LLVM_DEBUG( 5993 dbgs() 5994 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5995 return Result; 5996 } 5997 } 5998 5999 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6000 TheLoop->getHeader()->getParent()->hasMinSize()) { 6001 LLVM_DEBUG( 6002 dbgs() 6003 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6004 return Result; 6005 } 6006 6007 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6008 return Result; 6009 6010 for (auto &NextVF : ProfitableVFs) 6011 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6012 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && 6013 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6014 Result = NextVF; 6015 6016 if (Result != VectorizationFactor::Disabled()) 6017 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6018 << Result.Width.getFixedValue() << "\n";); 6019 return Result; 6020 } 6021 6022 std::pair<unsigned, unsigned> 6023 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6024 unsigned MinWidth = -1U; 6025 unsigned MaxWidth = 8; 6026 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6027 6028 // For each block. 6029 for (BasicBlock *BB : TheLoop->blocks()) { 6030 // For each instruction in the loop. 6031 for (Instruction &I : BB->instructionsWithoutDebug()) { 6032 Type *T = I.getType(); 6033 6034 // Skip ignored values. 6035 if (ValuesToIgnore.count(&I)) 6036 continue; 6037 6038 // Only examine Loads, Stores and PHINodes. 6039 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6040 continue; 6041 6042 // Examine PHI nodes that are reduction variables. Update the type to 6043 // account for the recurrence type. 6044 if (auto *PN = dyn_cast<PHINode>(&I)) { 6045 if (!Legal->isReductionVariable(PN)) 6046 continue; 6047 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 6048 if (PreferInLoopReductions || 6049 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6050 RdxDesc.getRecurrenceType(), 6051 TargetTransformInfo::ReductionFlags())) 6052 continue; 6053 T = RdxDesc.getRecurrenceType(); 6054 } 6055 6056 // Examine the stored values. 6057 if (auto *ST = dyn_cast<StoreInst>(&I)) 6058 T = ST->getValueOperand()->getType(); 6059 6060 // Ignore loaded pointer types and stored pointer types that are not 6061 // vectorizable. 6062 // 6063 // FIXME: The check here attempts to predict whether a load or store will 6064 // be vectorized. We only know this for certain after a VF has 6065 // been selected. Here, we assume that if an access can be 6066 // vectorized, it will be. We should also look at extending this 6067 // optimization to non-pointer types. 
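// Illustrative example: for pointer-typed elements such as
//
//   for (int i = 0; i < n; ++i)
//     q[i] = p[i];   // p and q have type T**, accesses are consecutive
//
// the T* loads and stores count towards MinWidth/MaxWidth, while a pointer
// access that is neither consecutive, interleaved, nor a legal gather/scatter
// is skipped by the check below (a sketch; whether an access is actually
// vectorized is only known once a VF has been chosen).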
6068 // 6069 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6070 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6071 continue; 6072 6073 MinWidth = std::min(MinWidth, 6074 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6075 MaxWidth = std::max(MaxWidth, 6076 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6077 } 6078 } 6079 6080 return {MinWidth, MaxWidth}; 6081 } 6082 6083 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6084 unsigned LoopCost) { 6085 // -- The interleave heuristics -- 6086 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6087 // There are many micro-architectural considerations that we can't predict 6088 // at this level. For example, frontend pressure (on decode or fetch) due to 6089 // code size, or the number and capabilities of the execution ports. 6090 // 6091 // We use the following heuristics to select the interleave count: 6092 // 1. If the code has reductions, then we interleave to break the cross 6093 // iteration dependency. 6094 // 2. If the loop is really small, then we interleave to reduce the loop 6095 // overhead. 6096 // 3. We don't interleave if we think that we will spill registers to memory 6097 // due to the increased register pressure. 6098 6099 if (!isScalarEpilogueAllowed()) 6100 return 1; 6101 6102 // We used the distance for the interleave count. 6103 if (Legal->getMaxSafeDepDistBytes() != -1U) 6104 return 1; 6105 6106 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6107 const bool HasReductions = !Legal->getReductionVars().empty(); 6108 // Do not interleave loops with a relatively small known or estimated trip 6109 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6110 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6111 // because with the above conditions interleaving can expose ILP and break 6112 // cross iteration dependences for reductions. 6113 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6114 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6115 return 1; 6116 6117 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6118 // We divide by these constants so assume that we have at least one 6119 // instruction that uses at least one register. 6120 for (auto& pair : R.MaxLocalUsers) { 6121 pair.second = std::max(pair.second, 1U); 6122 } 6123 6124 // We calculate the interleave count using the following formula. 6125 // Subtract the number of loop invariants from the number of available 6126 // registers. These registers are used by all of the interleaved instances. 6127 // Next, divide the remaining registers by the number of registers that is 6128 // required by the loop, in order to estimate how many parallel instances 6129 // fit without causing spills. All of this is rounded down if necessary to be 6130 // a power of two. We want power of two interleave count to simplify any 6131 // addressing operations or alignment considerations. 6132 // We also want power of two interleave counts to ensure that the induction 6133 // variable of the vector loop wraps to zero, when tail is folded by masking; 6134 // this currently happens when OptForSize, in which case IC is set to 1 above. 
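// Worked example with hypothetical numbers: given 32 vector registers, 4 of
// them holding loop-invariant values and a peak of 7 simultaneously live
// values, the induction-variable-aware formula below yields
// PowerOf2Floor((32 - 4 - 1) / (7 - 1)) = PowerOf2Floor(4) = 4, i.e. about
// four interleaved copies are expected to fit without spilling.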
6135 unsigned IC = UINT_MAX; 6136 6137 for (auto& pair : R.MaxLocalUsers) { 6138 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6139 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6140 << " registers of " 6141 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6142 if (VF.isScalar()) { 6143 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6144 TargetNumRegisters = ForceTargetNumScalarRegs; 6145 } else { 6146 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6147 TargetNumRegisters = ForceTargetNumVectorRegs; 6148 } 6149 unsigned MaxLocalUsers = pair.second; 6150 unsigned LoopInvariantRegs = 0; 6151 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6152 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6153 6154 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6155 // Don't count the induction variable as interleaved. 6156 if (EnableIndVarRegisterHeur) { 6157 TmpIC = 6158 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6159 std::max(1U, (MaxLocalUsers - 1))); 6160 } 6161 6162 IC = std::min(IC, TmpIC); 6163 } 6164 6165 // Clamp the interleave ranges to reasonable counts. 6166 unsigned MaxInterleaveCount = 6167 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6168 6169 // Check if the user has overridden the max. 6170 if (VF.isScalar()) { 6171 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6172 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6173 } else { 6174 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6175 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6176 } 6177 6178 // If trip count is known or estimated compile time constant, limit the 6179 // interleave count to be less than the trip count divided by VF, provided it 6180 // is at least 1. 6181 // 6182 // For scalable vectors we can't know if interleaving is beneficial. It may 6183 // not be beneficial for small loops if none of the lanes in the second vector 6184 // iterations is enabled. However, for larger loops, there is likely to be a 6185 // similar benefit as for fixed-width vectors. For now, we choose to leave 6186 // the InterleaveCount as if vscale is '1', although if some information about 6187 // the vector is known (e.g. min vector size), we can make a better decision. 6188 if (BestKnownTC) { 6189 MaxInterleaveCount = 6190 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6191 // Make sure MaxInterleaveCount is greater than 0. 6192 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6193 } 6194 6195 assert(MaxInterleaveCount > 0 && 6196 "Maximum interleave count must be greater than 0"); 6197 6198 // Clamp the calculated IC to be between the 1 and the max interleave count 6199 // that the target and trip count allows. 6200 if (IC > MaxInterleaveCount) 6201 IC = MaxInterleaveCount; 6202 else 6203 // Make sure IC is greater than 0. 6204 IC = std::max(1u, IC); 6205 6206 assert(IC > 0 && "Interleave count must be greater than 0."); 6207 6208 // If we did not calculate the cost for VF (because the user selected the VF) 6209 // then we calculate the cost of VF here. 6210 if (LoopCost == 0) { 6211 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6212 LoopCost = *expectedCost(VF).first.getValue(); 6213 } 6214 6215 assert(LoopCost && "Non-zero loop cost expected"); 6216 6217 // Interleave if we vectorized this loop and there is a reduction that could 6218 // benefit from interleaving. 
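// The classic case is a simple reduction, e.g.
//
//   for (int i = 0; i < n; ++i)
//     sum += a[i];
//
// where interleaving keeps several partial sums in flight and thereby breaks
// the single cross-iteration dependence on 'sum'.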
6219 if (VF.isVector() && HasReductions) { 6220 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6221 return IC; 6222 } 6223 6224 // Note that if we've already vectorized the loop we will have done the 6225 // runtime check and so interleaving won't require further checks. 6226 bool InterleavingRequiresRuntimePointerCheck = 6227 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6228 6229 // We want to interleave small loops in order to reduce the loop overhead and 6230 // potentially expose ILP opportunities. 6231 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6232 << "LV: IC is " << IC << '\n' 6233 << "LV: VF is " << VF << '\n'); 6234 const bool AggressivelyInterleaveReductions = 6235 TTI.enableAggressiveInterleaving(HasReductions); 6236 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6237 // We assume that the cost overhead is 1 and we use the cost model 6238 // to estimate the cost of the loop and interleave until the cost of the 6239 // loop overhead is about 5% of the cost of the loop. 6240 unsigned SmallIC = 6241 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6242 6243 // Interleave until store/load ports (estimated by max interleave count) are 6244 // saturated. 6245 unsigned NumStores = Legal->getNumStores(); 6246 unsigned NumLoads = Legal->getNumLoads(); 6247 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6248 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6249 6250 // If we have a scalar reduction (vector reductions are already dealt with 6251 // by this point), we can increase the critical path length if the loop 6252 // we're interleaving is inside another loop. Limit, by default to 2, so the 6253 // critical path only gets increased by one reduction operation. 6254 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6255 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6256 SmallIC = std::min(SmallIC, F); 6257 StoresIC = std::min(StoresIC, F); 6258 LoadsIC = std::min(LoadsIC, F); 6259 } 6260 6261 if (EnableLoadStoreRuntimeInterleave && 6262 std::max(StoresIC, LoadsIC) > SmallIC) { 6263 LLVM_DEBUG( 6264 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6265 return std::max(StoresIC, LoadsIC); 6266 } 6267 6268 // If there are scalar reductions and TTI has enabled aggressive 6269 // interleaving for reductions, we will interleave to expose ILP. 6270 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6271 AggressivelyInterleaveReductions) { 6272 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6273 // Interleave no less than SmallIC but not as aggressive as the normal IC 6274 // to satisfy the rare situation when resources are too limited. 6275 return std::max(IC / 2, SmallIC); 6276 } else { 6277 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6278 return SmallIC; 6279 } 6280 } 6281 6282 // Interleave if this is a large loop (small loops are already dealt with by 6283 // this point) that could benefit from interleaving. 6284 if (AggressivelyInterleaveReductions) { 6285 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6286 return IC; 6287 } 6288 6289 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6290 return 1; 6291 } 6292 6293 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6294 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6295 // This function calculates the register usage by measuring the highest number 6296 // of values that are alive at a single location. 
Obviously, this is a very
6297 // rough estimation. We scan the loop in topological order and
6298 // assign a number to each instruction. We use RPO to ensure that defs are
6299 // met before their users. We assume that each instruction that has in-loop
6300 // users starts an interval. We record every time that an in-loop value is
6301 // used, so we have a list of the first and last occurrences of each
6302 // instruction. Next, we transpose this data structure into a multi map that
6303 // holds the list of intervals that *end* at a specific location. This multi
6304 // map allows us to perform a linear search. We scan the instructions linearly
6305 // and record each time that a new interval starts, by placing it in a set.
6306 // If we find this value in the multi-map then we remove it from the set.
6307 // The max register usage is the maximum size of the set.
6308 // We also search for instructions that are defined outside the loop, but are
6309 // used inside the loop. We need this number separately from the max-interval
6310 // usage number because when we unroll, loop-invariant values do not take
6311 // more registers.
6312 LoopBlocksDFS DFS(TheLoop);
6313 DFS.perform(LI);
6314
6315 RegisterUsage RU;
6316
6317 // Each 'key' in the map opens a new interval. The values
6318 // of the map are the index of the 'last seen' usage of the
6319 // instruction that is the key.
6320 using IntervalMap = DenseMap<Instruction *, unsigned>;
6321
6322 // Maps instruction to its index.
6323 SmallVector<Instruction *, 64> IdxToInstr;
6324 // Marks the end of each interval.
6325 IntervalMap EndPoint;
6326 // Saves the list of instruction indices that are used in the loop.
6327 SmallPtrSet<Instruction *, 8> Ends;
6328 // Saves the list of values that are used in the loop but are
6329 // defined outside the loop, such as arguments and constants.
6330 SmallPtrSet<Value *, 8> LoopInvariants;
6331
6332 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6333 for (Instruction &I : BB->instructionsWithoutDebug()) {
6334 IdxToInstr.push_back(&I);
6335
6336 // Save the end location of each USE.
6337 for (Value *U : I.operands()) {
6338 auto *Instr = dyn_cast<Instruction>(U);
6339
6340 // Ignore non-instruction values such as arguments, constants, etc.
6341 if (!Instr)
6342 continue;
6343
6344 // If this instruction is outside the loop then record it and continue.
6345 if (!TheLoop->contains(Instr)) {
6346 LoopInvariants.insert(Instr);
6347 continue;
6348 }
6349
6350 // Overwrite previous end points.
6351 EndPoint[Instr] = IdxToInstr.size();
6352 Ends.insert(Instr);
6353 }
6354 }
6355 }
6356
6357 // Saves the list of intervals that end with the index in 'key'.
6358 using InstrList = SmallVector<Instruction *, 2>;
6359 DenseMap<unsigned, InstrList> TransposeEnds;
6360
6361 // Transpose the EndPoints to a list of values that end at each index.
6362 for (auto &Interval : EndPoint)
6363 TransposeEnds[Interval.second].push_back(Interval.first);
6364
6365 SmallPtrSet<Instruction *, 8> OpenIntervals;
6366 SmallVector<RegisterUsage, 8> RUs(VFs.size());
6367 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6368
6369 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6370
6371 // A lambda that gets the register usage for the given type and VF.
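// For example, on a hypothetical target with 128-bit vector registers a
// <4 x i32> value would typically report a usage of one register, while an
// <8 x i64> value would be split across several.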
6372 const auto &TTICapture = TTI; 6373 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6374 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6375 return 0U; 6376 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6377 }; 6378 6379 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6380 Instruction *I = IdxToInstr[i]; 6381 6382 // Remove all of the instructions that end at this location. 6383 InstrList &List = TransposeEnds[i]; 6384 for (Instruction *ToRemove : List) 6385 OpenIntervals.erase(ToRemove); 6386 6387 // Ignore instructions that are never used within the loop. 6388 if (!Ends.count(I)) 6389 continue; 6390 6391 // Skip ignored values. 6392 if (ValuesToIgnore.count(I)) 6393 continue; 6394 6395 // For each VF find the maximum usage of registers. 6396 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6397 // Count the number of live intervals. 6398 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6399 6400 if (VFs[j].isScalar()) { 6401 for (auto Inst : OpenIntervals) { 6402 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6403 if (RegUsage.find(ClassID) == RegUsage.end()) 6404 RegUsage[ClassID] = 1; 6405 else 6406 RegUsage[ClassID] += 1; 6407 } 6408 } else { 6409 collectUniformsAndScalars(VFs[j]); 6410 for (auto Inst : OpenIntervals) { 6411 // Skip ignored values for VF > 1. 6412 if (VecValuesToIgnore.count(Inst)) 6413 continue; 6414 if (isScalarAfterVectorization(Inst, VFs[j])) { 6415 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6416 if (RegUsage.find(ClassID) == RegUsage.end()) 6417 RegUsage[ClassID] = 1; 6418 else 6419 RegUsage[ClassID] += 1; 6420 } else { 6421 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6422 if (RegUsage.find(ClassID) == RegUsage.end()) 6423 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6424 else 6425 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6426 } 6427 } 6428 } 6429 6430 for (auto& pair : RegUsage) { 6431 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6432 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6433 else 6434 MaxUsages[j][pair.first] = pair.second; 6435 } 6436 } 6437 6438 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6439 << OpenIntervals.size() << '\n'); 6440 6441 // Add the current instruction to the list of open intervals. 6442 OpenIntervals.insert(I); 6443 } 6444 6445 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6446 SmallMapVector<unsigned, unsigned, 4> Invariant; 6447 6448 for (auto Inst : LoopInvariants) { 6449 unsigned Usage = 6450 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6451 unsigned ClassID = 6452 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6453 if (Invariant.find(ClassID) == Invariant.end()) 6454 Invariant[ClassID] = Usage; 6455 else 6456 Invariant[ClassID] += Usage; 6457 } 6458 6459 LLVM_DEBUG({ 6460 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6461 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6462 << " item\n"; 6463 for (const auto &pair : MaxUsages[i]) { 6464 dbgs() << "LV(REG): RegisterClass: " 6465 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6466 << " registers\n"; 6467 } 6468 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6469 << " item\n"; 6470 for (const auto &pair : Invariant) { 6471 dbgs() << "LV(REG): RegisterClass: " 6472 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6473 << " registers\n"; 6474 } 6475 }); 6476 6477 RU.LoopInvariantRegs = Invariant; 6478 RU.MaxLocalUsers = MaxUsages[i]; 6479 RUs[i] = RU; 6480 } 6481 6482 return RUs; 6483 } 6484 6485 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6486 // TODO: Cost model for emulated masked load/store is completely 6487 // broken. This hack guides the cost model to use an artificially 6488 // high enough value to practically disable vectorization with such 6489 // operations, except where previously deployed legality hack allowed 6490 // using very low cost values. This is to avoid regressions coming simply 6491 // from moving "masked load/store" check from legality to cost model. 6492 // Masked Load/Gather emulation was previously never allowed. 6493 // Limited number of Masked Store/Scatter emulation was allowed. 6494 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 6495 return isa<LoadInst>(I) || 6496 (isa<StoreInst>(I) && 6497 NumPredStores > NumberOfStoresToPredicate); 6498 } 6499 6500 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6501 // If we aren't vectorizing the loop, or if we've already collected the 6502 // instructions to scalarize, there's nothing to do. Collection may already 6503 // have occurred if we have a user-selected VF and are now computing the 6504 // expected cost for interleaving. 6505 if (VF.isScalar() || VF.isZero() || 6506 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6507 return; 6508 6509 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6510 // not profitable to scalarize any instructions, the presence of VF in the 6511 // map will indicate that we've analyzed it already. 6512 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6513 6514 // Find all the instructions that are scalar with predication in the loop and 6515 // determine if it would be better to not if-convert the blocks they are in. 6516 // If so, we also record the instructions to scalarize. 6517 for (BasicBlock *BB : TheLoop->blocks()) { 6518 if (!blockNeedsPredication(BB)) 6519 continue; 6520 for (Instruction &I : *BB) 6521 if (isScalarWithPredication(&I)) { 6522 ScalarCostsTy ScalarCosts; 6523 // Do not apply discount logic if hacked cost is needed 6524 // for emulated masked memrefs. 6525 if (!useEmulatedMaskMemRefHack(&I) && 6526 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6527 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6528 // Remember that BB will remain after vectorization. 
6529 PredicatedBBsAfterVectorization.insert(BB); 6530 } 6531 } 6532 } 6533 6534 int LoopVectorizationCostModel::computePredInstDiscount( 6535 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6536 assert(!isUniformAfterVectorization(PredInst, VF) && 6537 "Instruction marked uniform-after-vectorization will be predicated"); 6538 6539 // Initialize the discount to zero, meaning that the scalar version and the 6540 // vector version cost the same. 6541 InstructionCost Discount = 0; 6542 6543 // Holds instructions to analyze. The instructions we visit are mapped in 6544 // ScalarCosts. Those instructions are the ones that would be scalarized if 6545 // we find that the scalar version costs less. 6546 SmallVector<Instruction *, 8> Worklist; 6547 6548 // Returns true if the given instruction can be scalarized. 6549 auto canBeScalarized = [&](Instruction *I) -> bool { 6550 // We only attempt to scalarize instructions forming a single-use chain 6551 // from the original predicated block that would otherwise be vectorized. 6552 // Although not strictly necessary, we give up on instructions we know will 6553 // already be scalar to avoid traversing chains that are unlikely to be 6554 // beneficial. 6555 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6556 isScalarAfterVectorization(I, VF)) 6557 return false; 6558 6559 // If the instruction is scalar with predication, it will be analyzed 6560 // separately. We ignore it within the context of PredInst. 6561 if (isScalarWithPredication(I)) 6562 return false; 6563 6564 // If any of the instruction's operands are uniform after vectorization, 6565 // the instruction cannot be scalarized. This prevents, for example, a 6566 // masked load from being scalarized. 6567 // 6568 // We assume we will only emit a value for lane zero of an instruction 6569 // marked uniform after vectorization, rather than VF identical values. 6570 // Thus, if we scalarize an instruction that uses a uniform, we would 6571 // create uses of values corresponding to the lanes we aren't emitting code 6572 // for. This behavior can be changed by allowing getScalarValue to clone 6573 // the lane zero values for uniforms rather than asserting. 6574 for (Use &U : I->operands()) 6575 if (auto *J = dyn_cast<Instruction>(U.get())) 6576 if (isUniformAfterVectorization(J, VF)) 6577 return false; 6578 6579 // Otherwise, we can scalarize the instruction. 6580 return true; 6581 }; 6582 6583 // Compute the expected cost discount from scalarizing the entire expression 6584 // feeding the predicated instruction. We currently only consider expressions 6585 // that are single-use instruction chains. 6586 Worklist.push_back(PredInst); 6587 while (!Worklist.empty()) { 6588 Instruction *I = Worklist.pop_back_val(); 6589 6590 // If we've already analyzed the instruction, there's nothing to do. 6591 if (ScalarCosts.find(I) != ScalarCosts.end()) 6592 continue; 6593 6594 // Compute the cost of the vector instruction. Note that this cost already 6595 // includes the scalarization overhead of the predicated instruction. 6596 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6597 6598 // Compute the cost of the scalarized instruction. This cost is the cost of 6599 // the instruction as if it wasn't if-converted and instead remained in the 6600 // predicated block. We will scale this cost by block probability after 6601 // computing the scalarization overhead. 
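// Worked example with hypothetical costs (ignoring the insert/extract
// overheads added below): for VF = 4, a vector cost of 8 and a per-lane
// scalar cost of 3 give a scalarized estimate of 4 * 3 = 12, which is later
// divided by the reciprocal block probability (2 by default) to get 6; the
// resulting discount of 8 - 6 = 2 is non-negative, so scalarizing the chain
// is considered beneficial.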
6602 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6603 InstructionCost ScalarCost = 6604 VF.getKnownMinValue() * 6605 getInstructionCost(I, ElementCount::getFixed(1)).first; 6606 6607 // Compute the scalarization overhead of needed insertelement instructions 6608 // and phi nodes. 6609 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6610 ScalarCost += TTI.getScalarizationOverhead( 6611 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6612 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6613 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6614 ScalarCost += 6615 VF.getKnownMinValue() * 6616 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6617 } 6618 6619 // Compute the scalarization overhead of needed extractelement 6620 // instructions. For each of the instruction's operands, if the operand can 6621 // be scalarized, add it to the worklist; otherwise, account for the 6622 // overhead. 6623 for (Use &U : I->operands()) 6624 if (auto *J = dyn_cast<Instruction>(U.get())) { 6625 assert(VectorType::isValidElementType(J->getType()) && 6626 "Instruction has non-scalar type"); 6627 if (canBeScalarized(J)) 6628 Worklist.push_back(J); 6629 else if (needsExtract(J, VF)) { 6630 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6631 ScalarCost += TTI.getScalarizationOverhead( 6632 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6633 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6634 } 6635 } 6636 6637 // Scale the total scalar cost by block probability. 6638 ScalarCost /= getReciprocalPredBlockProb(); 6639 6640 // Compute the discount. A non-negative discount means the vector version 6641 // of the instruction costs more, and scalarizing would be beneficial. 6642 Discount += VectorCost - ScalarCost; 6643 ScalarCosts[I] = ScalarCost; 6644 } 6645 6646 return *Discount.getValue(); 6647 } 6648 6649 LoopVectorizationCostModel::VectorizationCostTy 6650 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6651 VectorizationCostTy Cost; 6652 6653 // For each block. 6654 for (BasicBlock *BB : TheLoop->blocks()) { 6655 VectorizationCostTy BlockCost; 6656 6657 // For each instruction in the old loop. 6658 for (Instruction &I : BB->instructionsWithoutDebug()) { 6659 // Skip ignored values. 6660 if (ValuesToIgnore.count(&I) || 6661 (VF.isVector() && VecValuesToIgnore.count(&I))) 6662 continue; 6663 6664 VectorizationCostTy C = getInstructionCost(&I, VF); 6665 6666 // Check if we should override the cost. 6667 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6668 C.first = InstructionCost(ForceTargetInstructionCost); 6669 6670 BlockCost.first += C.first; 6671 BlockCost.second |= C.second; 6672 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6673 << " for VF " << VF << " For instruction: " << I 6674 << '\n'); 6675 } 6676 6677 // If we are vectorizing a predicated block, it will have been 6678 // if-converted. This means that the block's instructions (aside from 6679 // stores and instructions that may divide by zero) will now be 6680 // unconditionally executed. For the scalar case, we may not always execute 6681 // the predicated block, if it is an if-else block. Thus, scale the block's 6682 // cost by the probability of executing it. blockNeedsPredication from 6683 // Legal is used so as to not include all blocks in tail folded loops. 
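// For example, in a loop body containing
//
//   if (c[i])
//     a[i] = b[i] / d[i];
//
// the guarded block is assumed to execute on only a fraction of the
// iterations (the model's default is every other iteration), so for VF = 1
// its cost is divided by getReciprocalPredBlockProb() below.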
6684 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6685 BlockCost.first /= getReciprocalPredBlockProb(); 6686 6687 Cost.first += BlockCost.first; 6688 Cost.second |= BlockCost.second; 6689 } 6690 6691 return Cost; 6692 } 6693 6694 /// Gets Address Access SCEV after verifying that the access pattern 6695 /// is loop invariant except the induction variable dependence. 6696 /// 6697 /// This SCEV can be sent to the Target in order to estimate the address 6698 /// calculation cost. 6699 static const SCEV *getAddressAccessSCEV( 6700 Value *Ptr, 6701 LoopVectorizationLegality *Legal, 6702 PredicatedScalarEvolution &PSE, 6703 const Loop *TheLoop) { 6704 6705 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6706 if (!Gep) 6707 return nullptr; 6708 6709 // We are looking for a gep with all loop invariant indices except for one 6710 // which should be an induction variable. 6711 auto SE = PSE.getSE(); 6712 unsigned NumOperands = Gep->getNumOperands(); 6713 for (unsigned i = 1; i < NumOperands; ++i) { 6714 Value *Opd = Gep->getOperand(i); 6715 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6716 !Legal->isInductionVariable(Opd)) 6717 return nullptr; 6718 } 6719 6720 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6721 return PSE.getSCEV(Ptr); 6722 } 6723 6724 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6725 return Legal->hasStride(I->getOperand(0)) || 6726 Legal->hasStride(I->getOperand(1)); 6727 } 6728 6729 InstructionCost 6730 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6731 ElementCount VF) { 6732 assert(VF.isVector() && 6733 "Scalarization cost of instruction implies vectorization."); 6734 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6735 Type *ValTy = getMemInstValueType(I); 6736 auto SE = PSE.getSE(); 6737 6738 unsigned AS = getLoadStoreAddressSpace(I); 6739 Value *Ptr = getLoadStorePointerOperand(I); 6740 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6741 6742 // Figure out whether the access is strided and get the stride value 6743 // if it's known in compile time 6744 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6745 6746 // Get the cost of the scalar memory instruction and address computation. 6747 InstructionCost Cost = 6748 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6749 6750 // Don't pass *I here, since it is scalar but will actually be part of a 6751 // vectorized loop where the user of it is a vectorized instruction. 6752 const Align Alignment = getLoadStoreAlignment(I); 6753 Cost += VF.getKnownMinValue() * 6754 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6755 AS, TTI::TCK_RecipThroughput); 6756 6757 // Get the overhead of the extractelement and insertelement instructions 6758 // we might create due to scalarization. 6759 Cost += getScalarizationOverhead(I, VF); 6760 6761 // If we have a predicated load/store, it will need extra i1 extracts and 6762 // conditional branches, but may not be executed for each vector lane. Scale 6763 // the cost by the probability of executing the predicated block. 
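// Sketch for VF = 4 (shape illustrative): the scalarized address, memory and
// insert/extract costs accumulated above are divided by the block-execution
// probability, after which the overhead of extracting the four i1 mask
// elements and a branch cost are added on top, as done in the code below.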
6764 if (isPredicatedInst(I)) { 6765 Cost /= getReciprocalPredBlockProb(); 6766 6767 // Add the cost of an i1 extract and a branch 6768 auto *Vec_i1Ty = 6769 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6770 Cost += TTI.getScalarizationOverhead( 6771 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6772 /*Insert=*/false, /*Extract=*/true); 6773 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6774 6775 if (useEmulatedMaskMemRefHack(I)) 6776 // Artificially setting to a high enough value to practically disable 6777 // vectorization with such operations. 6778 Cost = 3000000; 6779 } 6780 6781 return Cost; 6782 } 6783 6784 InstructionCost 6785 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6786 ElementCount VF) { 6787 Type *ValTy = getMemInstValueType(I); 6788 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6789 Value *Ptr = getLoadStorePointerOperand(I); 6790 unsigned AS = getLoadStoreAddressSpace(I); 6791 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6792 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6793 6794 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6795 "Stride should be 1 or -1 for consecutive memory access"); 6796 const Align Alignment = getLoadStoreAlignment(I); 6797 InstructionCost Cost = 0; 6798 if (Legal->isMaskRequired(I)) 6799 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6800 CostKind); 6801 else 6802 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6803 CostKind, I); 6804 6805 bool Reverse = ConsecutiveStride < 0; 6806 if (Reverse) 6807 Cost += 6808 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6809 return Cost; 6810 } 6811 6812 InstructionCost 6813 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6814 ElementCount VF) { 6815 assert(Legal->isUniformMemOp(*I)); 6816 6817 Type *ValTy = getMemInstValueType(I); 6818 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6819 const Align Alignment = getLoadStoreAlignment(I); 6820 unsigned AS = getLoadStoreAddressSpace(I); 6821 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6822 if (isa<LoadInst>(I)) { 6823 return TTI.getAddressComputationCost(ValTy) + 6824 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6825 CostKind) + 6826 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6827 } 6828 StoreInst *SI = cast<StoreInst>(I); 6829 6830 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6831 return TTI.getAddressComputationCost(ValTy) + 6832 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6833 CostKind) + 6834 (isLoopInvariantStoreValue 6835 ? 
0 6836 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6837 VF.getKnownMinValue() - 1)); 6838 } 6839 6840 InstructionCost 6841 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6842 ElementCount VF) { 6843 Type *ValTy = getMemInstValueType(I); 6844 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6845 const Align Alignment = getLoadStoreAlignment(I); 6846 const Value *Ptr = getLoadStorePointerOperand(I); 6847 6848 return TTI.getAddressComputationCost(VectorTy) + 6849 TTI.getGatherScatterOpCost( 6850 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6851 TargetTransformInfo::TCK_RecipThroughput, I); 6852 } 6853 6854 InstructionCost 6855 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6856 ElementCount VF) { 6857 // TODO: Once we have support for interleaving with scalable vectors 6858 // we can calculate the cost properly here. 6859 if (VF.isScalable()) 6860 return InstructionCost::getInvalid(); 6861 6862 Type *ValTy = getMemInstValueType(I); 6863 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6864 unsigned AS = getLoadStoreAddressSpace(I); 6865 6866 auto Group = getInterleavedAccessGroup(I); 6867 assert(Group && "Fail to get an interleaved access group."); 6868 6869 unsigned InterleaveFactor = Group->getFactor(); 6870 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6871 6872 // Holds the indices of existing members in an interleaved load group. 6873 // An interleaved store group doesn't need this as it doesn't allow gaps. 6874 SmallVector<unsigned, 4> Indices; 6875 if (isa<LoadInst>(I)) { 6876 for (unsigned i = 0; i < InterleaveFactor; i++) 6877 if (Group->getMember(i)) 6878 Indices.push_back(i); 6879 } 6880 6881 // Calculate the cost of the whole interleaved group. 6882 bool UseMaskForGaps = 6883 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6884 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6885 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6886 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6887 6888 if (Group->isReverse()) { 6889 // TODO: Add support for reversed masked interleaved access. 6890 assert(!Legal->isMaskRequired(I) && 6891 "Reverse masked interleaved access not supported."); 6892 Cost += 6893 Group->getNumMembers() * 6894 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6895 } 6896 return Cost; 6897 } 6898 6899 InstructionCost LoopVectorizationCostModel::getReductionPatternCost( 6900 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6901 // Early exit for no inloop reductions 6902 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6903 return InstructionCost::getInvalid(); 6904 auto *VectorTy = cast<VectorType>(Ty); 6905 6906 // We are looking for a pattern of, and finding the minimal acceptable cost: 6907 // reduce(mul(ext(A), ext(B))) or 6908 // reduce(mul(A, B)) or 6909 // reduce(ext(A)) or 6910 // reduce(A). 6911 // The basic idea is that we walk down the tree to do that, finding the root 6912 // reduction instruction in InLoopReductionImmediateChains. From there we find 6913 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6914 // of the components. If the reduction cost is lower then we return it for the 6915 // reduction instruction and 0 for the other instructions in the pattern. If 6916 // it is not we return an invalid cost specifying the orignal cost method 6917 // should be used. 
6918 Instruction *RetI = I; 6919 if ((RetI->getOpcode() == Instruction::SExt || 6920 RetI->getOpcode() == Instruction::ZExt)) { 6921 if (!RetI->hasOneUser()) 6922 return InstructionCost::getInvalid(); 6923 RetI = RetI->user_back(); 6924 } 6925 if (RetI->getOpcode() == Instruction::Mul && 6926 RetI->user_back()->getOpcode() == Instruction::Add) { 6927 if (!RetI->hasOneUser()) 6928 return InstructionCost::getInvalid(); 6929 RetI = RetI->user_back(); 6930 } 6931 6932 // Test if the found instruction is a reduction, and if not return an invalid 6933 // cost specifying the parent to use the original cost modelling. 6934 if (!InLoopReductionImmediateChains.count(RetI)) 6935 return InstructionCost::getInvalid(); 6936 6937 // Find the reduction this chain is a part of and calculate the basic cost of 6938 // the reduction on its own. 6939 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6940 Instruction *ReductionPhi = LastChain; 6941 while (!isa<PHINode>(ReductionPhi)) 6942 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6943 6944 RecurrenceDescriptor RdxDesc = 6945 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 6946 unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), 6947 VectorTy, false, CostKind); 6948 6949 // Get the operand that was not the reduction chain and match it to one of the 6950 // patterns, returning the better cost if it is found. 6951 Instruction *RedOp = RetI->getOperand(1) == LastChain 6952 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6953 : dyn_cast<Instruction>(RetI->getOperand(1)); 6954 6955 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6956 6957 if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && 6958 !TheLoop->isLoopInvariant(RedOp)) { 6959 bool IsUnsigned = isa<ZExtInst>(RedOp); 6960 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6961 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6962 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6963 CostKind); 6964 6965 unsigned ExtCost = 6966 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6967 TTI::CastContextHint::None, CostKind, RedOp); 6968 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6969 return I == RetI ? *RedCost.getValue() : 0; 6970 } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { 6971 Instruction *Mul = RedOp; 6972 Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); 6973 Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); 6974 if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && 6975 Op0->getOpcode() == Op1->getOpcode() && 6976 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6977 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6978 bool IsUnsigned = isa<ZExtInst>(Op0); 6979 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6980 // reduce(mul(ext, ext)) 6981 unsigned ExtCost = 6982 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 6983 TTI::CastContextHint::None, CostKind, Op0); 6984 InstructionCost MulCost = 6985 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 6986 6987 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6988 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6989 CostKind); 6990 6991 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 6992 return I == RetI ? 
*RedCost.getValue() : 0; 6993 } else { 6994 InstructionCost MulCost = 6995 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 6996 6997 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6998 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 6999 CostKind); 7000 7001 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7002 return I == RetI ? *RedCost.getValue() : 0; 7003 } 7004 } 7005 7006 return I == RetI ? BaseCost : InstructionCost::getInvalid(); 7007 } 7008 7009 InstructionCost 7010 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7011 ElementCount VF) { 7012 // Calculate scalar cost only. Vectorization cost should be ready at this 7013 // moment. 7014 if (VF.isScalar()) { 7015 Type *ValTy = getMemInstValueType(I); 7016 const Align Alignment = getLoadStoreAlignment(I); 7017 unsigned AS = getLoadStoreAddressSpace(I); 7018 7019 return TTI.getAddressComputationCost(ValTy) + 7020 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7021 TTI::TCK_RecipThroughput, I); 7022 } 7023 return getWideningCost(I, VF); 7024 } 7025 7026 LoopVectorizationCostModel::VectorizationCostTy 7027 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7028 ElementCount VF) { 7029 // If we know that this instruction will remain uniform, check the cost of 7030 // the scalar version. 7031 if (isUniformAfterVectorization(I, VF)) 7032 VF = ElementCount::getFixed(1); 7033 7034 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7035 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7036 7037 // Forced scalars do not have any scalarization overhead. 7038 auto ForcedScalar = ForcedScalars.find(VF); 7039 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7040 auto InstSet = ForcedScalar->second; 7041 if (InstSet.count(I)) 7042 return VectorizationCostTy( 7043 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7044 VF.getKnownMinValue()), 7045 false); 7046 } 7047 7048 Type *VectorTy; 7049 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7050 7051 bool TypeNotScalarized = 7052 VF.isVector() && VectorTy->isVectorTy() && 7053 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7054 return VectorizationCostTy(C, TypeNotScalarized); 7055 } 7056 7057 InstructionCost 7058 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7059 ElementCount VF) { 7060 7061 if (VF.isScalable()) 7062 return InstructionCost::getInvalid(); 7063 7064 if (VF.isScalar()) 7065 return 0; 7066 7067 InstructionCost Cost = 0; 7068 Type *RetTy = ToVectorTy(I->getType(), VF); 7069 if (!RetTy->isVoidTy() && 7070 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7071 Cost += TTI.getScalarizationOverhead( 7072 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7073 true, false); 7074 7075 // Some targets keep addresses scalar. 7076 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7077 return Cost; 7078 7079 // Some targets support efficient element stores. 7080 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7081 return Cost; 7082 7083 // Collect operands to consider. 7084 CallInst *CI = dyn_cast<CallInst>(I); 7085 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7086 7087 // Skip operands that do not require extraction/scalarization and do not incur 7088 // any overhead. 
7089 SmallVector<Type *> Tys; 7090 for (auto *V : filterExtractingOperands(Ops, VF)) 7091 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7092 return Cost + TTI.getOperandsScalarizationOverhead( 7093 filterExtractingOperands(Ops, VF), Tys); 7094 } 7095 7096 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7097 if (VF.isScalar()) 7098 return; 7099 NumPredStores = 0; 7100 for (BasicBlock *BB : TheLoop->blocks()) { 7101 // For each instruction in the old loop. 7102 for (Instruction &I : *BB) { 7103 Value *Ptr = getLoadStorePointerOperand(&I); 7104 if (!Ptr) 7105 continue; 7106 7107 // TODO: We should generate better code and update the cost model for 7108 // predicated uniform stores. Today they are treated as any other 7109 // predicated store (see added test cases in 7110 // invariant-store-vectorization.ll). 7111 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7112 NumPredStores++; 7113 7114 if (Legal->isUniformMemOp(I)) { 7115 // TODO: Avoid replicating loads and stores instead of 7116 // relying on instcombine to remove them. 7117 // Load: Scalar load + broadcast 7118 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7119 InstructionCost Cost = getUniformMemOpCost(&I, VF); 7120 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7121 continue; 7122 } 7123 7124 // We assume that widening is the best solution when possible. 7125 if (memoryInstructionCanBeWidened(&I, VF)) { 7126 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7127 int ConsecutiveStride = 7128 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7129 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7130 "Expected consecutive stride."); 7131 InstWidening Decision = 7132 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7133 setWideningDecision(&I, VF, Decision, Cost); 7134 continue; 7135 } 7136 7137 // Choose between Interleaving, Gather/Scatter or Scalarization. 7138 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7139 unsigned NumAccesses = 1; 7140 if (isAccessInterleaved(&I)) { 7141 auto Group = getInterleavedAccessGroup(&I); 7142 assert(Group && "Fail to get an interleaved access group."); 7143 7144 // Make one decision for the whole group. 7145 if (getWideningDecision(&I, VF) != CM_Unknown) 7146 continue; 7147 7148 NumAccesses = Group->getNumMembers(); 7149 if (interleavedAccessCanBeWidened(&I, VF)) 7150 InterleaveCost = getInterleaveGroupCost(&I, VF); 7151 } 7152 7153 InstructionCost GatherScatterCost = 7154 isLegalGatherOrScatter(&I) 7155 ? getGatherScatterCost(&I, VF) * NumAccesses 7156 : InstructionCost::getInvalid(); 7157 7158 InstructionCost ScalarizationCost = 7159 !VF.isScalable() ? getMemInstScalarizationCost(&I, VF) * NumAccesses 7160 : InstructionCost::getInvalid(); 7161 7162 // Choose better solution for the current VF, 7163 // write down this decision and use it during vectorization. 7164 InstructionCost Cost; 7165 InstWidening Decision; 7166 if (InterleaveCost <= GatherScatterCost && 7167 InterleaveCost < ScalarizationCost) { 7168 Decision = CM_Interleave; 7169 Cost = InterleaveCost; 7170 } else if (GatherScatterCost < ScalarizationCost) { 7171 Decision = CM_GatherScatter; 7172 Cost = GatherScatterCost; 7173 } else { 7174 assert(!VF.isScalable() && 7175 "We cannot yet scalarise for scalable vectors"); 7176 Decision = CM_Scalarize; 7177 Cost = ScalarizationCost; 7178 } 7179 // If the instructions belongs to an interleave group, the whole group 7180 // receives the same decision. 
The whole group receives the cost, but 7181 // the cost will actually be assigned to one instruction. 7182 if (auto Group = getInterleavedAccessGroup(&I)) 7183 setWideningDecision(Group, VF, Decision, Cost); 7184 else 7185 setWideningDecision(&I, VF, Decision, Cost); 7186 } 7187 } 7188 7189 // Make sure that any load of address and any other address computation 7190 // remains scalar unless there is gather/scatter support. This avoids 7191 // inevitable extracts into address registers, and also has the benefit of 7192 // activating LSR more, since that pass can't optimize vectorized 7193 // addresses. 7194 if (TTI.prefersVectorizedAddressing()) 7195 return; 7196 7197 // Start with all scalar pointer uses. 7198 SmallPtrSet<Instruction *, 8> AddrDefs; 7199 for (BasicBlock *BB : TheLoop->blocks()) 7200 for (Instruction &I : *BB) { 7201 Instruction *PtrDef = 7202 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7203 if (PtrDef && TheLoop->contains(PtrDef) && 7204 getWideningDecision(&I, VF) != CM_GatherScatter) 7205 AddrDefs.insert(PtrDef); 7206 } 7207 7208 // Add all instructions used to generate the addresses. 7209 SmallVector<Instruction *, 4> Worklist; 7210 append_range(Worklist, AddrDefs); 7211 while (!Worklist.empty()) { 7212 Instruction *I = Worklist.pop_back_val(); 7213 for (auto &Op : I->operands()) 7214 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7215 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7216 AddrDefs.insert(InstOp).second) 7217 Worklist.push_back(InstOp); 7218 } 7219 7220 for (auto *I : AddrDefs) { 7221 if (isa<LoadInst>(I)) { 7222 // Setting the desired widening decision should ideally be handled in 7223 // by cost functions, but since this involves the task of finding out 7224 // if the loaded register is involved in an address computation, it is 7225 // instead changed here when we know this is the case. 7226 InstWidening Decision = getWideningDecision(I, VF); 7227 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7228 // Scalarize a widened load of address. 7229 setWideningDecision( 7230 I, VF, CM_Scalarize, 7231 (VF.getKnownMinValue() * 7232 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7233 else if (auto Group = getInterleavedAccessGroup(I)) { 7234 // Scalarize an interleave group of address loads. 7235 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7236 if (Instruction *Member = Group->getMember(I)) 7237 setWideningDecision( 7238 Member, VF, CM_Scalarize, 7239 (VF.getKnownMinValue() * 7240 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7241 } 7242 } 7243 } else 7244 // Make sure I gets scalarized and a cost estimate without 7245 // scalarization overhead. 7246 ForcedScalars[VF].insert(I); 7247 } 7248 } 7249 7250 InstructionCost 7251 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7252 Type *&VectorTy) { 7253 Type *RetTy = I->getType(); 7254 if (canTruncateToMinimalBitwidth(I, VF)) 7255 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7256 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 7257 auto SE = PSE.getSE(); 7258 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7259 7260 // TODO: We need to estimate the cost of intrinsic calls. 7261 switch (I->getOpcode()) { 7262 case Instruction::GetElementPtr: 7263 // We mark this instruction as zero-cost because the cost of GEPs in 7264 // vectorized code depends on whether the corresponding memory instruction 7265 // is scalarized or not. 
Therefore, we handle GEPs with the memory 7266 // instruction cost. 7267 return 0; 7268 case Instruction::Br: { 7269 // In cases of scalarized and predicated instructions, there will be VF 7270 // predicated blocks in the vectorized loop. Each branch around these 7271 // blocks requires also an extract of its vector compare i1 element. 7272 bool ScalarPredicatedBB = false; 7273 BranchInst *BI = cast<BranchInst>(I); 7274 if (VF.isVector() && BI->isConditional() && 7275 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7276 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7277 ScalarPredicatedBB = true; 7278 7279 if (ScalarPredicatedBB) { 7280 // Return cost for branches around scalarized and predicated blocks. 7281 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7282 auto *Vec_i1Ty = 7283 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7284 return (TTI.getScalarizationOverhead( 7285 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7286 false, true) + 7287 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7288 VF.getKnownMinValue())); 7289 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7290 // The back-edge branch will remain, as will all scalar branches. 7291 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7292 else 7293 // This branch will be eliminated by if-conversion. 7294 return 0; 7295 // Note: We currently assume zero cost for an unconditional branch inside 7296 // a predicated block since it will become a fall-through, although we 7297 // may decide in the future to call TTI for all branches. 7298 } 7299 case Instruction::PHI: { 7300 auto *Phi = cast<PHINode>(I); 7301 7302 // First-order recurrences are replaced by vector shuffles inside the loop. 7303 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7304 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7305 return TTI.getShuffleCost( 7306 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7307 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7308 7309 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7310 // converted into select instructions. We require N - 1 selects per phi 7311 // node, where N is the number of incoming values. 7312 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7313 return (Phi->getNumIncomingValues() - 1) * 7314 TTI.getCmpSelInstrCost( 7315 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7316 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7317 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7318 7319 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7320 } 7321 case Instruction::UDiv: 7322 case Instruction::SDiv: 7323 case Instruction::URem: 7324 case Instruction::SRem: 7325 // If we have a predicated instruction, it may not be executed for each 7326 // vector lane. Get the scalarization cost and scale this amount by the 7327 // probability of executing the predicated block. If the instruction is not 7328 // predicated, we fall through to the next case. 7329 if (VF.isVector() && isScalarWithPredication(I)) { 7330 InstructionCost Cost = 0; 7331 7332 // These instructions have a non-void type, so account for the phi nodes 7333 // that we will create. This cost is likely to be zero. The phi node 7334 // cost, if any, should be scaled by the block probability because it 7335 // models a copy at the end of each predicated block. 
7336 Cost += VF.getKnownMinValue() * 7337 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7338 7339 // The cost of the non-predicated instruction. 7340 Cost += VF.getKnownMinValue() * 7341 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7342 7343 // The cost of insertelement and extractelement instructions needed for 7344 // scalarization. 7345 Cost += getScalarizationOverhead(I, VF); 7346 7347 // Scale the cost by the probability of executing the predicated blocks. 7348 // This assumes the predicated block for each vector lane is equally 7349 // likely. 7350 return Cost / getReciprocalPredBlockProb(); 7351 } 7352 LLVM_FALLTHROUGH; 7353 case Instruction::Add: 7354 case Instruction::FAdd: 7355 case Instruction::Sub: 7356 case Instruction::FSub: 7357 case Instruction::Mul: 7358 case Instruction::FMul: 7359 case Instruction::FDiv: 7360 case Instruction::FRem: 7361 case Instruction::Shl: 7362 case Instruction::LShr: 7363 case Instruction::AShr: 7364 case Instruction::And: 7365 case Instruction::Or: 7366 case Instruction::Xor: { 7367 // Since we will replace the stride by 1 the multiplication should go away. 7368 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7369 return 0; 7370 7371 // Detect reduction patterns 7372 InstructionCost RedCost; 7373 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7374 .isValid()) 7375 return RedCost; 7376 7377 // Certain instructions can be cheaper to vectorize if they have a constant 7378 // second vector operand. One example of this are shifts on x86. 7379 Value *Op2 = I->getOperand(1); 7380 TargetTransformInfo::OperandValueProperties Op2VP; 7381 TargetTransformInfo::OperandValueKind Op2VK = 7382 TTI.getOperandInfo(Op2, Op2VP); 7383 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7384 Op2VK = TargetTransformInfo::OK_UniformValue; 7385 7386 SmallVector<const Value *, 4> Operands(I->operand_values()); 7387 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7388 return N * TTI.getArithmeticInstrCost( 7389 I->getOpcode(), VectorTy, CostKind, 7390 TargetTransformInfo::OK_AnyValue, 7391 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7392 } 7393 case Instruction::FNeg: { 7394 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7395 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7396 return N * TTI.getArithmeticInstrCost( 7397 I->getOpcode(), VectorTy, CostKind, 7398 TargetTransformInfo::OK_AnyValue, 7399 TargetTransformInfo::OK_AnyValue, 7400 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7401 I->getOperand(0), I); 7402 } 7403 case Instruction::Select: { 7404 SelectInst *SI = cast<SelectInst>(I); 7405 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7406 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7407 Type *CondTy = SI->getCondition()->getType(); 7408 if (!ScalarCond) 7409 CondTy = VectorType::get(CondTy, VF); 7410 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7411 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7412 } 7413 case Instruction::ICmp: 7414 case Instruction::FCmp: { 7415 Type *ValTy = I->getOperand(0)->getType(); 7416 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7417 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7418 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7419 VectorTy = ToVectorTy(ValTy, VF); 7420 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7421 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7422 } 7423 case Instruction::Store: 7424 case Instruction::Load: { 7425 ElementCount Width = VF; 7426 if (Width.isVector()) { 7427 InstWidening Decision = getWideningDecision(I, Width); 7428 assert(Decision != CM_Unknown && 7429 "CM decision should be taken at this point"); 7430 if (Decision == CM_Scalarize) 7431 Width = ElementCount::getFixed(1); 7432 } 7433 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7434 return getMemoryInstructionCost(I, VF); 7435 } 7436 case Instruction::ZExt: 7437 case Instruction::SExt: 7438 case Instruction::FPToUI: 7439 case Instruction::FPToSI: 7440 case Instruction::FPExt: 7441 case Instruction::PtrToInt: 7442 case Instruction::IntToPtr: 7443 case Instruction::SIToFP: 7444 case Instruction::UIToFP: 7445 case Instruction::Trunc: 7446 case Instruction::FPTrunc: 7447 case Instruction::BitCast: { 7448 // Computes the CastContextHint from a Load/Store instruction. 7449 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7450 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7451 "Expected a load or a store!"); 7452 7453 if (VF.isScalar() || !TheLoop->contains(I)) 7454 return TTI::CastContextHint::Normal; 7455 7456 switch (getWideningDecision(I, VF)) { 7457 case LoopVectorizationCostModel::CM_GatherScatter: 7458 return TTI::CastContextHint::GatherScatter; 7459 case LoopVectorizationCostModel::CM_Interleave: 7460 return TTI::CastContextHint::Interleave; 7461 case LoopVectorizationCostModel::CM_Scalarize: 7462 case LoopVectorizationCostModel::CM_Widen: 7463 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7464 : TTI::CastContextHint::Normal; 7465 case LoopVectorizationCostModel::CM_Widen_Reverse: 7466 return TTI::CastContextHint::Reversed; 7467 case LoopVectorizationCostModel::CM_Unknown: 7468 llvm_unreachable("Instr did not go through cost modelling?"); 7469 } 7470 7471 llvm_unreachable("Unhandled case!"); 7472 }; 7473 7474 unsigned Opcode = I->getOpcode(); 7475 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7476 // For Trunc, the context is the only user, which must be a StoreInst. 
7477 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7478 if (I->hasOneUse()) 7479 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7480 CCH = ComputeCCH(Store); 7481 } 7482 // For Z/Sext, the context is the operand, which must be a LoadInst. 7483 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7484 Opcode == Instruction::FPExt) { 7485 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7486 CCH = ComputeCCH(Load); 7487 } 7488 7489 // We optimize the truncation of induction variables having constant 7490 // integer steps. The cost of these truncations is the same as the scalar 7491 // operation. 7492 if (isOptimizableIVTruncate(I, VF)) { 7493 auto *Trunc = cast<TruncInst>(I); 7494 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7495 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7496 } 7497 7498 // Detect reduction patterns 7499 InstructionCost RedCost; 7500 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7501 .isValid()) 7502 return RedCost; 7503 7504 Type *SrcScalarTy = I->getOperand(0)->getType(); 7505 Type *SrcVecTy = 7506 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7507 if (canTruncateToMinimalBitwidth(I, VF)) { 7508 // This cast is going to be shrunk. This may remove the cast or it might 7509 // turn it into slightly different cast. For example, if MinBW == 16, 7510 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7511 // 7512 // Calculate the modified src and dest types. 7513 Type *MinVecTy = VectorTy; 7514 if (Opcode == Instruction::Trunc) { 7515 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7516 VectorTy = 7517 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7518 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7519 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7520 VectorTy = 7521 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7522 } 7523 } 7524 7525 unsigned N; 7526 if (isScalarAfterVectorization(I, VF)) { 7527 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7528 N = VF.getKnownMinValue(); 7529 } else 7530 N = 1; 7531 return N * 7532 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7533 } 7534 case Instruction::Call: { 7535 bool NeedToScalarize; 7536 CallInst *CI = cast<CallInst>(I); 7537 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7538 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7539 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7540 return std::min(CallCost, IntrinsicCost); 7541 } 7542 return CallCost; 7543 } 7544 case Instruction::ExtractValue: 7545 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7546 default: 7547 // The cost of executing VF copies of the scalar instruction. This opcode 7548 // is unknown. Assume that it is the same as 'mul'. 7549 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7550 Instruction::Mul, VectorTy, CostKind) + 7551 getScalarizationOverhead(I, VF); 7552 } // end of switch. 
7553 } 7554 7555 char LoopVectorize::ID = 0; 7556 7557 static const char lv_name[] = "Loop Vectorization"; 7558 7559 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7560 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7561 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7562 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7563 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7564 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7565 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7566 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7567 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7568 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7569 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7570 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7571 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7572 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7573 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7574 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7575 7576 namespace llvm { 7577 7578 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7579 7580 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7581 bool VectorizeOnlyWhenForced) { 7582 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7583 } 7584 7585 } // end namespace llvm 7586 7587 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7588 // Check if the pointer operand of a load or store instruction is 7589 // consecutive. 7590 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7591 return Legal->isConsecutivePtr(Ptr); 7592 return false; 7593 } 7594 7595 void LoopVectorizationCostModel::collectValuesToIgnore() { 7596 // Ignore ephemeral values. 7597 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7598 7599 // Ignore type-promoting instructions we identified during reduction 7600 // detection. 7601 for (auto &Reduction : Legal->getReductionVars()) { 7602 RecurrenceDescriptor &RedDes = Reduction.second; 7603 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7604 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7605 } 7606 // Ignore type-casting instructions we identified during induction 7607 // detection. 7608 for (auto &Induction : Legal->getInductionVars()) { 7609 InductionDescriptor &IndDes = Induction.second; 7610 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7611 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7612 } 7613 } 7614 7615 void LoopVectorizationCostModel::collectInLoopReductions() { 7616 for (auto &Reduction : Legal->getReductionVars()) { 7617 PHINode *Phi = Reduction.first; 7618 RecurrenceDescriptor &RdxDesc = Reduction.second; 7619 7620 // We don't collect reductions that are type promoted (yet). 7621 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7622 continue; 7623 7624 // If the target would prefer this reduction to happen "in-loop", then we 7625 // want to record it as such. 7626 unsigned Opcode = RdxDesc.getOpcode(); 7627 if (!PreferInLoopReductions && 7628 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7629 TargetTransformInfo::ReductionFlags())) 7630 continue; 7631 7632 // Check that we can correctly put the reductions into the loop, by 7633 // finding the chain of operations that leads from the phi to the loop 7634 // exit value. 
7635 SmallVector<Instruction *, 4> ReductionOperations = 7636 RdxDesc.getReductionOpChain(Phi, TheLoop); 7637 bool InLoop = !ReductionOperations.empty(); 7638 if (InLoop) { 7639 InLoopReductionChains[Phi] = ReductionOperations; 7640 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7641 Instruction *LastChain = Phi; 7642 for (auto *I : ReductionOperations) { 7643 InLoopReductionImmediateChains[I] = LastChain; 7644 LastChain = I; 7645 } 7646 } 7647 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7648 << " reduction for phi: " << *Phi << "\n"); 7649 } 7650 } 7651 7652 // TODO: we could return a pair of values that specify the max VF and 7653 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7654 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7655 // doesn't have a cost model that can choose which plan to execute if 7656 // more than one is generated. 7657 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7658 LoopVectorizationCostModel &CM) { 7659 unsigned WidestType; 7660 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7661 return WidestVectorRegBits / WidestType; 7662 } 7663 7664 VectorizationFactor 7665 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7666 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7667 ElementCount VF = UserVF; 7668 // Outer loop handling: They may require CFG and instruction level 7669 // transformations before even evaluating whether vectorization is profitable. 7670 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7671 // the vectorization pipeline. 7672 if (!OrigLoop->isInnermost()) { 7673 // If the user doesn't provide a vectorization factor, determine a 7674 // reasonable one. 7675 if (UserVF.isZero()) { 7676 VF = ElementCount::getFixed(determineVPlanVF( 7677 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7678 .getFixedSize(), 7679 CM)); 7680 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7681 7682 // Make sure we have a VF > 1 for stress testing. 7683 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7684 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7685 << "overriding computed VF.\n"); 7686 VF = ElementCount::getFixed(4); 7687 } 7688 } 7689 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7690 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7691 "VF needs to be a power of two"); 7692 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7693 << "VF " << VF << " to build VPlans.\n"); 7694 buildVPlans(VF, VF); 7695 7696 // For VPlan build stress testing, we bail out after VPlan construction. 7697 if (VPlanBuildStressTest) 7698 return VectorizationFactor::Disabled(); 7699 7700 return {VF, 0 /*Cost*/}; 7701 } 7702 7703 LLVM_DEBUG( 7704 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7705 "VPlan-native path.\n"); 7706 return VectorizationFactor::Disabled(); 7707 } 7708 7709 Optional<VectorizationFactor> 7710 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7711 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7712 Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); 7713 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 7714 return None; 7715 7716 // Invalidate interleave groups if all blocks of loop will be predicated. 
7717 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7718 !useMaskedInterleavedAccesses(*TTI)) { 7719 LLVM_DEBUG( 7720 dbgs() 7721 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7722 "which requires masked-interleaved support.\n"); 7723 if (CM.InterleaveInfo.invalidateGroups()) 7724 // Invalidating interleave groups also requires invalidating all decisions 7725 // based on them, which includes widening decisions and uniform and scalar 7726 // values. 7727 CM.invalidateCostModelingDecisions(); 7728 } 7729 7730 ElementCount MaxVF = MaybeMaxVF.getValue(); 7731 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7732 7733 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); 7734 if (!UserVF.isZero() && 7735 (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { 7736 // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable 7737 // VFs here, this should be reverted to only use legal UserVFs once the 7738 // loop below supports scalable VFs. 7739 ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; 7740 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") 7741 << " VF " << VF << ".\n"); 7742 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7743 "VF needs to be a power of two"); 7744 // Collect the instructions (and their associated costs) that will be more 7745 // profitable to scalarize. 7746 CM.selectUserVectorizationFactor(VF); 7747 CM.collectInLoopReductions(); 7748 buildVPlansWithVPRecipes(VF, VF); 7749 LLVM_DEBUG(printPlans(dbgs())); 7750 return {{VF, 0}}; 7751 } 7752 7753 assert(!MaxVF.isScalable() && 7754 "Scalable vectors not yet supported beyond this point"); 7755 7756 for (ElementCount VF = ElementCount::getFixed(1); 7757 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7758 // Collect Uniform and Scalar instructions after vectorization with VF. 7759 CM.collectUniformsAndScalars(VF); 7760 7761 // Collect the instructions (and their associated costs) that will be more 7762 // profitable to scalarize. 7763 if (VF.isVector()) 7764 CM.collectInstsToScalarize(VF); 7765 } 7766 7767 CM.collectInLoopReductions(); 7768 7769 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7770 LLVM_DEBUG(printPlans(dbgs())); 7771 if (MaxVF.isScalar()) 7772 return VectorizationFactor::Disabled(); 7773 7774 // Select the optimal vectorization factor. 7775 return CM.selectVectorizationFactor(MaxVF); 7776 } 7777 7778 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7779 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7780 << '\n'); 7781 BestVF = VF; 7782 BestUF = UF; 7783 7784 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7785 return !Plan->hasVF(VF); 7786 }); 7787 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7788 } 7789 7790 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7791 DominatorTree *DT) { 7792 // Perform the actual loop transformation. 7793 7794 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 
7795 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7796 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7797 7798 VPTransformState State{ 7799 *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()}; 7800 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7801 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7802 State.CanonicalIV = ILV.Induction; 7803 7804 ILV.printDebugTracesAtStart(); 7805 7806 //===------------------------------------------------===// 7807 // 7808 // Notice: any optimization or new instruction that go 7809 // into the code below should also be implemented in 7810 // the cost-model. 7811 // 7812 //===------------------------------------------------===// 7813 7814 // 2. Copy and widen instructions from the old loop into the new loop. 7815 VPlans.front()->execute(&State); 7816 7817 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7818 // predication, updating analyses. 7819 ILV.fixVectorizedLoop(State); 7820 7821 ILV.printDebugTracesAtEnd(); 7822 } 7823 7824 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7825 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7826 for (const auto &Plan : VPlans) 7827 if (PrintVPlansInDotFormat) 7828 Plan->printDOT(O); 7829 else 7830 Plan->print(O); 7831 } 7832 #endif 7833 7834 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7835 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7836 7837 // We create new control-flow for the vectorized loop, so the original exit 7838 // conditions will be dead after vectorization if it's only used by the 7839 // terminator 7840 SmallVector<BasicBlock*> ExitingBlocks; 7841 OrigLoop->getExitingBlocks(ExitingBlocks); 7842 for (auto *BB : ExitingBlocks) { 7843 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 7844 if (!Cmp || !Cmp->hasOneUse()) 7845 continue; 7846 7847 // TODO: we should introduce a getUniqueExitingBlocks on Loop 7848 if (!DeadInstructions.insert(Cmp).second) 7849 continue; 7850 7851 // The operands of the icmp is often a dead trunc, used by IndUpdate. 7852 // TODO: can recurse through operands in general 7853 for (Value *Op : Cmp->operands()) { 7854 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7855 DeadInstructions.insert(cast<Instruction>(Op)); 7856 } 7857 } 7858 7859 // We create new "steps" for induction variable updates to which the original 7860 // induction variables map. An original update instruction will be dead if 7861 // all its users except the induction variable are dead. 7862 auto *Latch = OrigLoop->getLoopLatch(); 7863 for (auto &Induction : Legal->getInductionVars()) { 7864 PHINode *Ind = Induction.first; 7865 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7866 7867 // If the tail is to be folded by masking, the primary induction variable, 7868 // if exists, isn't dead: it will be used for masking. Don't kill it. 7869 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7870 continue; 7871 7872 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7873 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7874 })) 7875 DeadInstructions.insert(IndUpdate); 7876 7877 // We record as "Dead" also the type-casting instructions we had identified 7878 // during induction analysis. 
We don't need any handling for them in the 7879 // vectorized loop because we have proven that, under a proper runtime 7880 // test guarding the vectorized loop, the value of the phi, and the casted 7881 // value of the phi, are the same. The last instruction in this casting chain 7882 // will get its scalar/vector/widened def from the scalar/vector/widened def 7883 // of the respective phi node. Any other casts in the induction def-use chain 7884 // have no other uses outside the phi update chain, and will be ignored. 7885 InductionDescriptor &IndDes = Induction.second; 7886 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7887 DeadInstructions.insert(Casts.begin(), Casts.end()); 7888 } 7889 } 7890 7891 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7892 7893 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7894 7895 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7896 Instruction::BinaryOps BinOp) { 7897 // When unrolling and the VF is 1, we only need to add a simple scalar. 7898 Type *Ty = Val->getType(); 7899 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7900 7901 if (Ty->isFloatingPointTy()) { 7902 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7903 7904 // Floating-point operations inherit FMF via the builder's flags. 7905 Value *MulOp = Builder.CreateFMul(C, Step); 7906 return Builder.CreateBinOp(BinOp, Val, MulOp); 7907 } 7908 Constant *C = ConstantInt::get(Ty, StartIdx); 7909 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7910 } 7911 7912 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7913 SmallVector<Metadata *, 4> MDs; 7914 // Reserve first location for self reference to the LoopID metadata node. 7915 MDs.push_back(nullptr); 7916 bool IsUnrollMetadata = false; 7917 MDNode *LoopID = L->getLoopID(); 7918 if (LoopID) { 7919 // First find existing loop unrolling disable metadata. 7920 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7921 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7922 if (MD) { 7923 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7924 IsUnrollMetadata = 7925 S && S->getString().startswith("llvm.loop.unroll.disable"); 7926 } 7927 MDs.push_back(LoopID->getOperand(i)); 7928 } 7929 } 7930 7931 if (!IsUnrollMetadata) { 7932 // Add runtime unroll disable metadata. 7933 LLVMContext &Context = L->getHeader()->getContext(); 7934 SmallVector<Metadata *, 1> DisableOperands; 7935 DisableOperands.push_back( 7936 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7937 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7938 MDs.push_back(DisableNode); 7939 MDNode *NewLoopID = MDNode::get(Context, MDs); 7940 // Set operand 0 to refer to the loop id itself. 7941 NewLoopID->replaceOperandWith(0, NewLoopID); 7942 L->setLoopID(NewLoopID); 7943 } 7944 } 7945 7946 //===--------------------------------------------------------------------===// 7947 // EpilogueVectorizerMainLoop 7948 //===--------------------------------------------------------------------===// 7949 7950 /// This function is partially responsible for generating the control flow 7951 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
7952 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7953 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7954 Loop *Lp = createVectorLoopSkeleton(""); 7955 7956 // Generate the code to check the minimum iteration count of the vector 7957 // epilogue (see below). 7958 EPI.EpilogueIterationCountCheck = 7959 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 7960 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7961 7962 // Generate the code to check any assumptions that we've made for SCEV 7963 // expressions. 7964 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 7965 7966 // Generate the code that checks at runtime if arrays overlap. We put the 7967 // checks into a separate block to make the more common case of few elements 7968 // faster. 7969 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 7970 7971 // Generate the iteration count check for the main loop, *after* the check 7972 // for the epilogue loop, so that the path-length is shorter for the case 7973 // that goes directly through the vector epilogue. The longer-path length for 7974 // the main loop is compensated for, by the gain from vectorizing the larger 7975 // trip count. Note: the branch will get updated later on when we vectorize 7976 // the epilogue. 7977 EPI.MainLoopIterationCountCheck = 7978 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 7979 7980 // Generate the induction variable. 7981 OldInduction = Legal->getPrimaryInduction(); 7982 Type *IdxTy = Legal->getWidestInductionType(); 7983 Value *StartIdx = ConstantInt::get(IdxTy, 0); 7984 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7985 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7986 EPI.VectorTripCount = CountRoundDown; 7987 Induction = 7988 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7989 getDebugLocFromInstOrOperands(OldInduction)); 7990 7991 // Skip induction resume value creation here because they will be created in 7992 // the second pass. If we created them here, they wouldn't be used anyway, 7993 // because the vplan in the second pass still contains the inductions from the 7994 // original loop. 7995 7996 return completeLoopSkeleton(Lp, OrigLoopID); 7997 } 7998 7999 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8000 LLVM_DEBUG({ 8001 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8002 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8003 << ", Main Loop UF:" << EPI.MainLoopUF 8004 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8005 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8006 }); 8007 } 8008 8009 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8010 DEBUG_WITH_TYPE(VerboseDebug, { 8011 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8012 }); 8013 } 8014 8015 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8016 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8017 assert(L && "Expected valid Loop."); 8018 assert(Bypass && "Expected valid bypass basic block."); 8019 unsigned VFactor = 8020 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 8021 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8022 Value *Count = getOrCreateTripCount(L); 8023 // Reuse existing vector loop preheader for TC checks. 8024 // Note that new preheader block is generated for vector loop. 
8025 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8026 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8027 8028 // Generate code to check if the loop's trip count is less than VF * UF of the 8029 // main vector loop. 8030 auto P = 8031 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8032 8033 Value *CheckMinIters = Builder.CreateICmp( 8034 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 8035 "min.iters.check"); 8036 8037 if (!ForEpilogue) 8038 TCCheckBlock->setName("vector.main.loop.iter.check"); 8039 8040 // Create new preheader for vector loop. 8041 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8042 DT, LI, nullptr, "vector.ph"); 8043 8044 if (ForEpilogue) { 8045 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8046 DT->getNode(Bypass)->getIDom()) && 8047 "TC check is expected to dominate Bypass"); 8048 8049 // Update dominator for Bypass & LoopExit. 8050 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8051 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8052 8053 LoopBypassBlocks.push_back(TCCheckBlock); 8054 8055 // Save the trip count so we don't have to regenerate it in the 8056 // vec.epilog.iter.check. This is safe to do because the trip count 8057 // generated here dominates the vector epilog iter check. 8058 EPI.TripCount = Count; 8059 } 8060 8061 ReplaceInstWithInst( 8062 TCCheckBlock->getTerminator(), 8063 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8064 8065 return TCCheckBlock; 8066 } 8067 8068 //===--------------------------------------------------------------------===// 8069 // EpilogueVectorizerEpilogueLoop 8070 //===--------------------------------------------------------------------===// 8071 8072 /// This function is partially responsible for generating the control flow 8073 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8074 BasicBlock * 8075 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8076 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8077 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8078 8079 // Now, compare the remaining count and if there aren't enough iterations to 8080 // execute the vectorized epilogue skip to the scalar part. 8081 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8082 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8083 LoopVectorPreHeader = 8084 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8085 LI, nullptr, "vec.epilog.ph"); 8086 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8087 VecEpilogueIterationCountCheck); 8088 8089 // Adjust the control flow taking the state info from the main loop 8090 // vectorization into account. 
8091 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8092 "expected this to be saved from the previous pass."); 8093 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8094 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8095 8096 DT->changeImmediateDominator(LoopVectorPreHeader, 8097 EPI.MainLoopIterationCountCheck); 8098 8099 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8100 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8101 8102 if (EPI.SCEVSafetyCheck) 8103 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8104 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8105 if (EPI.MemSafetyCheck) 8106 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8107 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8108 8109 DT->changeImmediateDominator( 8110 VecEpilogueIterationCountCheck, 8111 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8112 8113 DT->changeImmediateDominator(LoopScalarPreHeader, 8114 EPI.EpilogueIterationCountCheck); 8115 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 8116 8117 // Keep track of bypass blocks, as they feed start values to the induction 8118 // phis in the scalar loop preheader. 8119 if (EPI.SCEVSafetyCheck) 8120 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8121 if (EPI.MemSafetyCheck) 8122 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8123 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8124 8125 // Generate a resume induction for the vector epilogue and put it in the 8126 // vector epilogue preheader 8127 Type *IdxTy = Legal->getWidestInductionType(); 8128 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8129 LoopVectorPreHeader->getFirstNonPHI()); 8130 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8131 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8132 EPI.MainLoopIterationCountCheck); 8133 8134 // Generate the induction variable. 8135 OldInduction = Legal->getPrimaryInduction(); 8136 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8137 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8138 Value *StartIdx = EPResumeVal; 8139 Induction = 8140 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8141 getDebugLocFromInstOrOperands(OldInduction)); 8142 8143 // Generate induction resume values. These variables save the new starting 8144 // indexes for the scalar loop. They are used to test if there are any tail 8145 // iterations left once the vector loop has completed. 8146 // Note that when the vectorized epilogue is skipped due to iteration count 8147 // check, then the resume value for the induction variable comes from 8148 // the trip count of the main vector loop, hence passing the AdditionalBypass 8149 // argument. 
8150 createInductionResumeValues(Lp, CountRoundDown, 8151 {VecEpilogueIterationCountCheck, 8152 EPI.VectorTripCount} /* AdditionalBypass */); 8153 8154 AddRuntimeUnrollDisableMetaData(Lp); 8155 return completeLoopSkeleton(Lp, OrigLoopID); 8156 } 8157 8158 BasicBlock * 8159 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 8160 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 8161 8162 assert(EPI.TripCount && 8163 "Expected trip count to have been safed in the first pass."); 8164 assert( 8165 (!isa<Instruction>(EPI.TripCount) || 8166 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 8167 "saved trip count does not dominate insertion point."); 8168 Value *TC = EPI.TripCount; 8169 IRBuilder<> Builder(Insert->getTerminator()); 8170 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8171 8172 // Generate code to check if the loop's trip count is less than VF * UF of the 8173 // vector epilogue loop. 8174 auto P = 8175 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8176 8177 Value *CheckMinIters = Builder.CreateICmp( 8178 P, Count, 8179 ConstantInt::get(Count->getType(), 8180 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8181 "min.epilog.iters.check"); 8182 8183 ReplaceInstWithInst( 8184 Insert->getTerminator(), 8185 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8186 8187 LoopBypassBlocks.push_back(Insert); 8188 return Insert; 8189 } 8190 8191 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8192 LLVM_DEBUG({ 8193 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8194 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8195 << ", Main Loop UF:" << EPI.MainLoopUF 8196 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8197 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8198 }); 8199 } 8200 8201 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8202 DEBUG_WITH_TYPE(VerboseDebug, { 8203 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8204 }); 8205 } 8206 8207 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8208 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8209 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8210 bool PredicateAtRangeStart = Predicate(Range.Start); 8211 8212 for (ElementCount TmpVF = Range.Start * 2; 8213 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8214 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8215 Range.End = TmpVF; 8216 break; 8217 } 8218 8219 return PredicateAtRangeStart; 8220 } 8221 8222 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8223 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8224 /// of VF's starting at a given VF and extending it as much as possible. Each 8225 /// vectorization decision can potentially shorten this sub-range during 8226 /// buildVPlan(). 8227 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8228 ElementCount MaxVF) { 8229 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8230 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8231 VFRange SubRange = {VF, MaxVFPlusOne}; 8232 VPlans.push_back(buildVPlan(SubRange)); 8233 VF = SubRange.End; 8234 } 8235 } 8236 8237 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8238 VPlanPtr &Plan) { 8239 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8240 8241 // Look for cached value. 
8242 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8243 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8244 if (ECEntryIt != EdgeMaskCache.end()) 8245 return ECEntryIt->second; 8246 8247 VPValue *SrcMask = createBlockInMask(Src, Plan); 8248 8249 // The terminator has to be a branch inst! 8250 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8251 assert(BI && "Unexpected terminator found"); 8252 8253 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8254 return EdgeMaskCache[Edge] = SrcMask; 8255 8256 // If source is an exiting block, we know the exit edge is dynamically dead 8257 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8258 // adding uses of an otherwise potentially dead instruction. 8259 if (OrigLoop->isLoopExiting(Src)) 8260 return EdgeMaskCache[Edge] = SrcMask; 8261 8262 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8263 assert(EdgeMask && "No Edge Mask found for condition"); 8264 8265 if (BI->getSuccessor(0) != Dst) 8266 EdgeMask = Builder.createNot(EdgeMask); 8267 8268 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8269 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8270 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8271 // The select version does not introduce new UB if SrcMask is false and 8272 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8273 VPValue *False = Plan->getOrAddVPValue( 8274 ConstantInt::getFalse(BI->getCondition()->getType())); 8275 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8276 } 8277 8278 return EdgeMaskCache[Edge] = EdgeMask; 8279 } 8280 8281 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8282 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8283 8284 // Look for cached value. 8285 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8286 if (BCEntryIt != BlockMaskCache.end()) 8287 return BCEntryIt->second; 8288 8289 // All-one mask is modelled as no-mask following the convention for masked 8290 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8291 VPValue *BlockMask = nullptr; 8292 8293 if (OrigLoop->getHeader() == BB) { 8294 if (!CM.blockNeedsPredication(BB)) 8295 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8296 8297 // Create the block in mask as the first non-phi instruction in the block. 8298 VPBuilder::InsertPointGuard Guard(Builder); 8299 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8300 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8301 8302 // Introduce the early-exit compare IV <= BTC to form header block mask. 8303 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8304 // Start by constructing the desired canonical IV. 
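// Conceptually, for VF=4 and unroll part 0 the header mask is
//   <iv, iv+1, iv+2, iv+3> u<= splat(BTC)
// i.e. all-true for every full vector iteration, and only partially true in
// the final iteration when the tail is folded.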
8305 VPValue *IV = nullptr; 8306 if (Legal->getPrimaryInduction()) 8307 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8308 else { 8309 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8310 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8311 IV = IVRecipe->getVPValue(); 8312 } 8313 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8314 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8315 8316 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8317 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8318 // as a second argument, we only pass the IV here and extract the 8319 // tripcount from the transform state where codegen of the VP instructions 8320 // happen. 8321 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8322 } else { 8323 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8324 } 8325 return BlockMaskCache[BB] = BlockMask; 8326 } 8327 8328 // This is the block mask. We OR all incoming edges. 8329 for (auto *Predecessor : predecessors(BB)) { 8330 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8331 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8332 return BlockMaskCache[BB] = EdgeMask; 8333 8334 if (!BlockMask) { // BlockMask has its initialized nullptr value. 8335 BlockMask = EdgeMask; 8336 continue; 8337 } 8338 8339 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8340 } 8341 8342 return BlockMaskCache[BB] = BlockMask; 8343 } 8344 8345 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 8346 VPlanPtr &Plan) { 8347 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8348 "Must be called with either a load or store"); 8349 8350 auto willWiden = [&](ElementCount VF) -> bool { 8351 if (VF.isScalar()) 8352 return false; 8353 LoopVectorizationCostModel::InstWidening Decision = 8354 CM.getWideningDecision(I, VF); 8355 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8356 "CM decision should be taken at this point."); 8357 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8358 return true; 8359 if (CM.isScalarAfterVectorization(I, VF) || 8360 CM.isProfitableToScalarize(I, VF)) 8361 return false; 8362 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8363 }; 8364 8365 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8366 return nullptr; 8367 8368 VPValue *Mask = nullptr; 8369 if (Legal->isMaskRequired(I)) 8370 Mask = createBlockInMask(I->getParent(), Plan); 8371 8372 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 8373 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8374 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 8375 8376 StoreInst *Store = cast<StoreInst>(I); 8377 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 8378 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 8379 } 8380 8381 VPWidenIntOrFpInductionRecipe * 8382 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { 8383 // Check if this is an integer or fp induction. If so, build the recipe that 8384 // produces its scalar and vector values. 
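// Inductions of any other kind (e.g. a pointer induction) fall through and
// make this helper return nullptr, so the caller can try the remaining phi
// recipes instead.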
8385 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8386 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8387 II.getKind() == InductionDescriptor::IK_FpInduction) { 8388 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8389 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8390 return new VPWidenIntOrFpInductionRecipe( 8391 Phi, Start, Casts.empty() ? nullptr : Casts.front()); 8392 } 8393 8394 return nullptr; 8395 } 8396 8397 VPWidenIntOrFpInductionRecipe * 8398 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, 8399 VPlan &Plan) const { 8400 // Optimize the special case where the source is a constant integer 8401 // induction variable. Notice that we can only optimize the 'trunc' case 8402 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8403 // (c) other casts depend on pointer size. 8404 8405 // Determine whether \p K is a truncation based on an induction variable that 8406 // can be optimized. 8407 auto isOptimizableIVTruncate = 8408 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8409 return [=](ElementCount VF) -> bool { 8410 return CM.isOptimizableIVTruncate(K, VF); 8411 }; 8412 }; 8413 8414 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8415 isOptimizableIVTruncate(I), Range)) { 8416 8417 InductionDescriptor II = 8418 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8419 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8420 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8421 Start, nullptr, I); 8422 } 8423 return nullptr; 8424 } 8425 8426 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 8427 // If all incoming values are equal, the incoming VPValue can be used directly 8428 // instead of creating a new VPBlendRecipe. 8429 Value *FirstIncoming = Phi->getIncomingValue(0); 8430 if (all_of(Phi->incoming_values(), [FirstIncoming](const Value *Inc) { 8431 return FirstIncoming == Inc; 8432 })) { 8433 return Plan->getOrAddVPValue(Phi->getIncomingValue(0)); 8434 } 8435 8436 // We know that all PHIs in non-header blocks are converted into selects, so 8437 // we don't have to worry about the insertion order and we can just use the 8438 // builder. At this point we generate the predication tree. There may be 8439 // duplications since this is a simple recursive scan, but future 8440 // optimizations will clean it up. 
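// The operands of the resulting VPBlendRecipe are laid out as
//   {In0, Mask0, In1, Mask1, ...}
// with a mask omitted only for an all-one edge, which the assert below limits
// to the single-predecessor case; the loop mirrors this layout.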
8441 SmallVector<VPValue *, 2> Operands; 8442 unsigned NumIncoming = Phi->getNumIncomingValues(); 8443 8444 for (unsigned In = 0; In < NumIncoming; In++) { 8445 VPValue *EdgeMask = 8446 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8447 assert((EdgeMask || NumIncoming == 1) && 8448 "Multiple predecessors with one having a full mask"); 8449 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 8450 if (EdgeMask) 8451 Operands.push_back(EdgeMask); 8452 } 8453 return toVPRecipeResult(new VPBlendRecipe(Phi, Operands)); 8454 } 8455 8456 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 8457 VPlan &Plan) const { 8458 8459 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8460 [this, CI](ElementCount VF) { 8461 return CM.isScalarWithPredication(CI, VF); 8462 }, 8463 Range); 8464 8465 if (IsPredicated) 8466 return nullptr; 8467 8468 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8469 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8470 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8471 ID == Intrinsic::pseudoprobe || 8472 ID == Intrinsic::experimental_noalias_scope_decl)) 8473 return nullptr; 8474 8475 auto willWiden = [&](ElementCount VF) -> bool { 8476 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8477 // The following case may be scalarized depending on the VF. 8478 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8479 // version of the instruction. 8480 // Is it beneficial to perform intrinsic call compared to lib call? 8481 bool NeedToScalarize = false; 8482 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8483 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; 8484 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8485 assert(IntrinsicCost.isValid() && CallCost.isValid() && 8486 "Cannot have invalid costs while widening"); 8487 return UseVectorIntrinsic || !NeedToScalarize; 8488 }; 8489 8490 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8491 return nullptr; 8492 8493 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 8494 } 8495 8496 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8497 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8498 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8499 // Instruction should be widened, unless it is scalar after vectorization, 8500 // scalarization is profitable or it is predicated. 
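// getDecisionAndClampRange also shrinks Range so that the answer below is
// uniform across all VFs remaining in the (clamped) range.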
8501 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8502 return CM.isScalarAfterVectorization(I, VF) || 8503 CM.isProfitableToScalarize(I, VF) || 8504 CM.isScalarWithPredication(I, VF); 8505 }; 8506 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8507 Range); 8508 } 8509 8510 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 8511 auto IsVectorizableOpcode = [](unsigned Opcode) { 8512 switch (Opcode) { 8513 case Instruction::Add: 8514 case Instruction::And: 8515 case Instruction::AShr: 8516 case Instruction::BitCast: 8517 case Instruction::FAdd: 8518 case Instruction::FCmp: 8519 case Instruction::FDiv: 8520 case Instruction::FMul: 8521 case Instruction::FNeg: 8522 case Instruction::FPExt: 8523 case Instruction::FPToSI: 8524 case Instruction::FPToUI: 8525 case Instruction::FPTrunc: 8526 case Instruction::FRem: 8527 case Instruction::FSub: 8528 case Instruction::ICmp: 8529 case Instruction::IntToPtr: 8530 case Instruction::LShr: 8531 case Instruction::Mul: 8532 case Instruction::Or: 8533 case Instruction::PtrToInt: 8534 case Instruction::SDiv: 8535 case Instruction::Select: 8536 case Instruction::SExt: 8537 case Instruction::Shl: 8538 case Instruction::SIToFP: 8539 case Instruction::SRem: 8540 case Instruction::Sub: 8541 case Instruction::Trunc: 8542 case Instruction::UDiv: 8543 case Instruction::UIToFP: 8544 case Instruction::URem: 8545 case Instruction::Xor: 8546 case Instruction::ZExt: 8547 return true; 8548 } 8549 return false; 8550 }; 8551 8552 if (!IsVectorizableOpcode(I->getOpcode())) 8553 return nullptr; 8554 8555 // Success: widen this instruction. 8556 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 8557 } 8558 8559 VPBasicBlock *VPRecipeBuilder::handleReplication( 8560 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8561 VPlanPtr &Plan) { 8562 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8563 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8564 Range); 8565 8566 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8567 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 8568 Range); 8569 8570 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8571 IsUniform, IsPredicated); 8572 setRecipe(I, Recipe); 8573 Plan->addVPValue(I, Recipe); 8574 8575 // Find if I uses a predicated instruction. If so, it will use its scalar 8576 // value. Avoid hoisting the insert-element which packs the scalar value into 8577 // a vector value, as that happens iff all users use the vector value. 8578 for (VPValue *Op : Recipe->operands()) { 8579 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8580 if (!PredR) 8581 continue; 8582 auto *RepR = 8583 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8584 assert(RepR->isPredicated() && 8585 "expected Replicate recipe to be predicated"); 8586 RepR->setAlsoPack(false); 8587 } 8588 8589 // Finalize the recipe for Instr, first if it is not predicated. 8590 if (!IsPredicated) { 8591 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8592 VPBB->appendRecipe(Recipe); 8593 return VPBB; 8594 } 8595 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8596 assert(VPBB->getSuccessors().empty() && 8597 "VPBB has successors when handling predicated replication."); 8598 // Record predicated instructions for above packing optimizations. 
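// The replicate region built below has the shape
//   pred.<opcode>.entry -> pred.<opcode>.if -> pred.<opcode>.continue
// with a mask-guarded bypass edge from the entry block directly to the
// continue block, so the replicated instruction only runs for active lanes.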
8599 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8600 VPBlockUtils::insertBlockAfter(Region, VPBB); 8601 auto *RegSucc = new VPBasicBlock(); 8602 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8603 return RegSucc; 8604 } 8605 8606 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8607 VPRecipeBase *PredRecipe, 8608 VPlanPtr &Plan) { 8609 // Instructions marked for predication are replicated and placed under an 8610 // if-then construct to prevent side-effects. 8611 8612 // Generate recipes to compute the block mask for this region. 8613 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8614 8615 // Build the triangular if-then region. 8616 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8617 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8618 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8619 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8620 auto *PHIRecipe = Instr->getType()->isVoidTy() 8621 ? nullptr 8622 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8623 if (PHIRecipe) { 8624 Plan->removeVPValueFor(Instr); 8625 Plan->addVPValue(Instr, PHIRecipe); 8626 } 8627 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8628 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8629 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8630 8631 // Note: first set Entry as region entry and then connect successors starting 8632 // from it in order, to propagate the "parent" of each VPBasicBlock. 8633 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8634 VPBlockUtils::connectBlocks(Pred, Exit); 8635 8636 return Region; 8637 } 8638 8639 VPRecipeOrVPValueTy VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8640 VFRange &Range, 8641 VPlanPtr &Plan) { 8642 // First, check for specific widening recipes that deal with calls, memory 8643 // operations, inductions and Phi nodes. 
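// The order of the checks below matters: calls, then loads/stores, then
// header and non-header phis, then induction truncates; only if none of these
// match is the instruction considered for generic widening, and anything
// still unhandled is replicated by the caller.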
8644 if (auto *CI = dyn_cast<CallInst>(Instr))
8645 return toVPRecipeResult(tryToWidenCall(CI, Range, *Plan));
8646
8647 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8648 return toVPRecipeResult(tryToWidenMemory(Instr, Range, Plan));
8649
8650 VPRecipeBase *Recipe;
8651 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8652 if (Phi->getParent() != OrigLoop->getHeader())
8653 return tryToBlend(Phi, Plan);
8654 if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan)))
8655 return toVPRecipeResult(Recipe);
8656
8657 if (Legal->isReductionVariable(Phi)) {
8658 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8659 VPValue *StartV =
8660 Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue());
8661 return toVPRecipeResult(new VPWidenPHIRecipe(Phi, RdxDesc, *StartV));
8662 }
8663
8664 return toVPRecipeResult(new VPWidenPHIRecipe(Phi));
8665 }
8666
8667 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8668 cast<TruncInst>(Instr), Range, *Plan)))
8669 return toVPRecipeResult(Recipe);
8670
8671 if (!shouldWiden(Instr, Range))
8672 return nullptr;
8673
8674 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8675 return toVPRecipeResult(new VPWidenGEPRecipe(
8676 GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop));
8677
8678 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8679 bool InvariantCond =
8680 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8681 return toVPRecipeResult(new VPWidenSelectRecipe(
8682 *SI, Plan->mapToVPValues(SI->operands()), InvariantCond));
8683 }
8684
8685 return toVPRecipeResult(tryToWiden(Instr, *Plan));
8686 }
8687
8688 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8689 ElementCount MaxVF) {
8690 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8691
8692 // Collect instructions from the original loop that will become trivially dead
8693 // in the vectorized loop. We don't need to vectorize these instructions. For
8694 // example, original induction update instructions can become dead because we
8695 // separately emit induction "steps" when generating code for the new loop.
8696 // Similarly, we create a new latch condition when setting up the structure
8697 // of the new loop, so the old one can become dead.
8698 SmallPtrSet<Instruction *, 4> DeadInstructions;
8699 collectTriviallyDeadInstructions(DeadInstructions);
8700
8701 // Add assume instructions we need to drop to DeadInstructions, to prevent
8702 // them from being added to the VPlan.
8703 // TODO: We only need to drop assumes in blocks that get flattened. If the
8704 // control flow is preserved, we should keep them.
8705 auto &ConditionalAssumes = Legal->getConditionalAssumes();
8706 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8707
8708 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8709 // Dead instructions do not need sinking. Remove them from SinkAfter.
8710 for (Instruction *I : DeadInstructions)
8711 SinkAfter.erase(I);
8712
8713 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8714 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8715 VFRange SubRange = {VF, MaxVFPlusOne};
8716 VPlans.push_back(
8717 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8718 VF = SubRange.End;
8719 }
8720 }
8721
8722 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8723 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8724 const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8725
8726 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8727
8728 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8729
8730 // ---------------------------------------------------------------------------
8731 // Pre-construction: record ingredients whose recipes we'll need to further
8732 // process after constructing the initial VPlan.
8733 // ---------------------------------------------------------------------------
8734
8735 // Mark instructions we'll need to sink later and their targets as
8736 // ingredients whose recipe we'll need to record.
8737 for (auto &Entry : SinkAfter) {
8738 RecipeBuilder.recordRecipeOf(Entry.first);
8739 RecipeBuilder.recordRecipeOf(Entry.second);
8740 }
8741 for (auto &Reduction : CM.getInLoopReductionChains()) {
8742 PHINode *Phi = Reduction.first;
8743 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
8744 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8745
8746 RecipeBuilder.recordRecipeOf(Phi);
8747 for (auto &R : ReductionOperations) {
8748 RecipeBuilder.recordRecipeOf(R);
8749 // For min/max reductions, where we have a pair of icmp/select, we also
8750 // need to record the ICmp recipe, so it can be removed later.
8751 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8752 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8753 }
8754 }
8755
8756 // For each interleave group which is relevant for this (possibly trimmed)
8757 // Range, add it to the set of groups to be later applied to the VPlan and add
8758 // placeholders for its members' Recipes which we'll be replacing with a
8759 // single VPInterleaveRecipe.
8760 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8761 auto applyIG = [IG, this](ElementCount VF) -> bool {
8762 return (VF.isVector() && // Query is illegal for VF == 1
8763 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8764 LoopVectorizationCostModel::CM_Interleave);
8765 };
8766 if (!getDecisionAndClampRange(applyIG, Range))
8767 continue;
8768 InterleaveGroups.insert(IG);
8769 for (unsigned i = 0; i < IG->getFactor(); i++)
8770 if (Instruction *Member = IG->getMember(i))
8771 RecipeBuilder.recordRecipeOf(Member);
8772 };
8773
8774 // ---------------------------------------------------------------------------
8775 // Build initial VPlan: Scan the body of the loop in a topological order to
8776 // visit each basic block after having visited its predecessor basic blocks.
8777 // ---------------------------------------------------------------------------
8778
8779 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
8780 auto Plan = std::make_unique<VPlan>();
8781 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8782 Plan->setEntry(VPBB);
8783
8784 // Scan the body of the loop in a topological order to visit each basic block
8785 // after having visited its predecessor basic blocks.
8786 LoopBlocksDFS DFS(OrigLoop);
8787 DFS.perform(LI);
8788
8789 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8790 // Relevant instructions from basic block BB will be grouped into VPRecipe
8791 // ingredients and fill a new VPBasicBlock.
8792 unsigned VPBBsForBB = 0;
8793 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8794 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8795 VPBB = FirstVPBBForBB;
8796 Builder.setInsertPoint(VPBB);
8797
8798 // Introduce each ingredient into VPlan.
8799 // TODO: Model and preserve debug intrinsics in VPlan.
8800 for (Instruction &I : BB->instructionsWithoutDebug()) {
8801 Instruction *Instr = &I;
8802
8803 // First filter out irrelevant instructions, to ensure no recipes are
8804 // built for them.
8805 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8806 continue;
8807
8808 if (auto RecipeOrValue =
8809 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
8810 // If Instr can be simplified to an existing VPValue, use it.
8811 if (RecipeOrValue.is<VPValue *>()) {
8812 Plan->addVPValue(Instr, RecipeOrValue.get<VPValue *>());
8813 continue;
8814 }
8815 // Otherwise, add the new recipe.
8816 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8817 for (auto *Def : Recipe->definedValues()) {
8818 auto *UV = Def->getUnderlyingValue();
8819 Plan->addVPValue(UV, Def);
8820 }
8821
8822 RecipeBuilder.setRecipe(Instr, Recipe);
8823 VPBB->appendRecipe(Recipe);
8824 continue;
8825 }
8826
8827 // Otherwise, if all widening options failed, the instruction is to be
8828 // replicated. This may create a successor for VPBB.
8829 VPBasicBlock *NextVPBB =
8830 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8831 if (NextVPBB != VPBB) {
8832 VPBB = NextVPBB;
8833 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8834 : "");
8835 }
8836 }
8837 }
8838
8839 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
8840 // may also be empty, such as the last one, VPBB, reflecting original
8841 // basic-blocks with no recipes.
8842 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8843 assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8844 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8845 VPBlockUtils::disconnectBlocks(PreEntry, Entry);
8846 delete PreEntry;
8847
8848 // ---------------------------------------------------------------------------
8849 // Transform initial VPlan: Apply previously taken decisions, in order, to
8850 // bring the VPlan to its final state.
8851 // ---------------------------------------------------------------------------
8852
8853 // Apply Sink-After legal constraints.
8854 for (auto &Entry : SinkAfter) {
8855 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8856 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8857 // If the target is in a replication region, make sure to move Sink to the
8858 // block after it, not into the replication region itself.
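// Sinking into the region itself would put the sunk recipe under the region's
// predicate; moving it to the region's single successor keeps it
// unconditional while still placing it after Target.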
8859 if (auto *Region = 8860 dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { 8861 if (Region->isReplicator()) { 8862 assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); 8863 VPBasicBlock *NextBlock = 8864 cast<VPBasicBlock>(Region->getSuccessors().front()); 8865 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8866 continue; 8867 } 8868 } 8869 Sink->moveAfter(Target); 8870 } 8871 8872 // Interleave memory: for each Interleave Group we marked earlier as relevant 8873 // for this VPlan, replace the Recipes widening its memory instructions with a 8874 // single VPInterleaveRecipe at its insertion point. 8875 for (auto IG : InterleaveGroups) { 8876 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8877 RecipeBuilder.getRecipe(IG->getInsertPos())); 8878 SmallVector<VPValue *, 4> StoredValues; 8879 for (unsigned i = 0; i < IG->getFactor(); ++i) 8880 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 8881 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 8882 8883 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8884 Recipe->getMask()); 8885 VPIG->insertBefore(Recipe); 8886 unsigned J = 0; 8887 for (unsigned i = 0; i < IG->getFactor(); ++i) 8888 if (Instruction *Member = IG->getMember(i)) { 8889 if (!Member->getType()->isVoidTy()) { 8890 VPValue *OriginalV = Plan->getVPValue(Member); 8891 Plan->removeVPValueFor(Member); 8892 Plan->addVPValue(Member, VPIG->getVPValue(J)); 8893 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8894 J++; 8895 } 8896 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8897 } 8898 } 8899 8900 // Adjust the recipes for any inloop reductions. 8901 if (Range.Start.isVector()) 8902 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 8903 8904 // Finally, if tail is folded by masking, introduce selects between the phi 8905 // and the live-out instruction of each reduction, at the end of the latch. 8906 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 8907 Builder.setInsertPoint(VPBB); 8908 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 8909 for (auto &Reduction : Legal->getReductionVars()) { 8910 if (CM.isInLoopReduction(Reduction.first)) 8911 continue; 8912 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 8913 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 8914 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 8915 } 8916 } 8917 8918 std::string PlanName; 8919 raw_string_ostream RSO(PlanName); 8920 ElementCount VF = Range.Start; 8921 Plan->addVF(VF); 8922 RSO << "Initial VPlan for VF={" << VF; 8923 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 8924 Plan->addVF(VF); 8925 RSO << "," << VF; 8926 } 8927 RSO << "},UF>=1"; 8928 RSO.flush(); 8929 Plan->setName(PlanName); 8930 8931 return Plan; 8932 } 8933 8934 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8935 // Outer loop handling: They may require CFG and instruction level 8936 // transformations before even evaluating whether vectorization is profitable. 8937 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8938 // the vectorization pipeline. 
8939 assert(!OrigLoop->isInnermost());
8940 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8941
8942 // Create new empty VPlan
8943 auto Plan = std::make_unique<VPlan>();
8944
8945 // Build hierarchical CFG
8946 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8947 HCFGBuilder.buildHierarchicalCFG();
8948
8949 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
8950 VF *= 2)
8951 Plan->addVF(VF);
8952
8953 if (EnableVPlanPredication) {
8954 VPlanPredicator VPP(*Plan);
8955 VPP.predicate();
8956
8957 // Avoid running transformation to recipes until masked code generation in
8958 // VPlan-native path is in place.
8959 return Plan;
8960 }
8961
8962 SmallPtrSet<Instruction *, 1> DeadInstructions;
8963 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
8964 Legal->getInductionVars(),
8965 DeadInstructions, *PSE.getSE());
8966 return Plan;
8967 }
8968
8969 // Adjust the recipes for any inloop reductions. The chain of instructions
8970 // leading from the loop exit instr to the phi needs to be converted to
8971 // reductions, with one operand being vector and the other being the scalar
8972 // reduction chain.
8973 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
8974 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
8975 for (auto &Reduction : CM.getInLoopReductionChains()) {
8976 PHINode *Phi = Reduction.first;
8977 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8978 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8979
8980 // ReductionOperations are ordered top-down from the phi's use to the
8981 // LoopExitValue. We keep track of the previous item (the Chain) to tell
8982 // which of the two operands will remain scalar and which will be reduced.
8983 // For minmax the chain will be the select instructions.
8984 Instruction *Chain = Phi;
8985 for (Instruction *R : ReductionOperations) {
8986 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
8987 RecurKind Kind = RdxDesc.getRecurrenceKind();
8988
8989 VPValue *ChainOp = Plan->getVPValue(Chain);
8990 unsigned FirstOpId;
8991 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8992 assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
8993 "Expected to replace a VPWidenSelectSC");
8994 FirstOpId = 1;
8995 } else {
8996 assert(isa<VPWidenRecipe>(WidenRecipe) &&
8997 "Expected to replace a VPWidenSC");
8998 FirstOpId = 0;
8999 }
9000 unsigned VecOpId =
9001 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9002 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9003
9004 auto *CondOp = CM.foldTailByMasking()
9005 ?
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9006 : nullptr; 9007 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 9008 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9009 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 9010 Plan->removeVPValueFor(R); 9011 Plan->addVPValue(R, RedRecipe); 9012 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9013 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 9014 WidenRecipe->eraseFromParent(); 9015 9016 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9017 VPRecipeBase *CompareRecipe = 9018 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9019 assert(isa<VPWidenRecipe>(CompareRecipe) && 9020 "Expected to replace a VPWidenSC"); 9021 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9022 "Expected no remaining users"); 9023 CompareRecipe->eraseFromParent(); 9024 } 9025 Chain = R; 9026 } 9027 } 9028 } 9029 9030 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9031 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9032 VPSlotTracker &SlotTracker) const { 9033 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9034 IG->getInsertPos()->printAsOperand(O, false); 9035 O << ", "; 9036 getAddr()->printAsOperand(O, SlotTracker); 9037 VPValue *Mask = getMask(); 9038 if (Mask) { 9039 O << ", "; 9040 Mask->printAsOperand(O, SlotTracker); 9041 } 9042 for (unsigned i = 0; i < IG->getFactor(); ++i) 9043 if (Instruction *I = IG->getMember(i)) 9044 O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i; 9045 } 9046 #endif 9047 9048 void VPWidenCallRecipe::execute(VPTransformState &State) { 9049 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9050 *this, State); 9051 } 9052 9053 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9054 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9055 this, *this, InvariantCond, State); 9056 } 9057 9058 void VPWidenRecipe::execute(VPTransformState &State) { 9059 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9060 } 9061 9062 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9063 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9064 *this, State.UF, State.VF, IsPtrLoopInvariant, 9065 IsIndexLoopInvariant, State); 9066 } 9067 9068 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9069 assert(!State.Instance && "Int or FP induction being replicated."); 9070 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9071 getTruncInst(), getVPValue(0), 9072 getCastValue(), State); 9073 } 9074 9075 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9076 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc, 9077 getStartValue(), this, State); 9078 } 9079 9080 void VPBlendRecipe::execute(VPTransformState &State) { 9081 State.ILV->setDebugLocFromInst(State.Builder, Phi); 9082 // We know that all PHIs in non-header blocks are converted into 9083 // selects, so we don't have to worry about the insertion order and we 9084 // can just use the builder. 9085 // At this point we generate the predication tree. There may be 9086 // duplications since this is a simple recursive scan, but future 9087 // optimizations will clean it up. 
9088 9089 unsigned NumIncoming = getNumIncomingValues(); 9090 9091 // Generate a sequence of selects of the form: 9092 // SELECT(Mask3, In3, 9093 // SELECT(Mask2, In2, 9094 // SELECT(Mask1, In1, 9095 // In0))) 9096 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9097 // are essentially undef are taken from In0. 9098 InnerLoopVectorizer::VectorParts Entry(State.UF); 9099 for (unsigned In = 0; In < NumIncoming; ++In) { 9100 for (unsigned Part = 0; Part < State.UF; ++Part) { 9101 // We might have single edge PHIs (blocks) - use an identity 9102 // 'select' for the first PHI operand. 9103 Value *In0 = State.get(getIncomingValue(In), Part); 9104 if (In == 0) 9105 Entry[Part] = In0; // Initialize with the first incoming value. 9106 else { 9107 // Select between the current value and the previous incoming edge 9108 // based on the incoming mask. 9109 Value *Cond = State.get(getMask(In), Part); 9110 Entry[Part] = 9111 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9112 } 9113 } 9114 } 9115 for (unsigned Part = 0; Part < State.UF; ++Part) 9116 State.set(this, Entry[Part], Part); 9117 } 9118 9119 void VPInterleaveRecipe::execute(VPTransformState &State) { 9120 assert(!State.Instance && "Interleave group being replicated."); 9121 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9122 getStoredValues(), getMask()); 9123 } 9124 9125 void VPReductionRecipe::execute(VPTransformState &State) { 9126 assert(!State.Instance && "Reduction being replicated."); 9127 for (unsigned Part = 0; Part < State.UF; ++Part) { 9128 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9129 Value *NewVecOp = State.get(getVecOp(), Part); 9130 if (VPValue *Cond = getCondOp()) { 9131 Value *NewCond = State.get(Cond, Part); 9132 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9133 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 9134 Kind, VecTy->getElementType()); 9135 Constant *IdenVec = 9136 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 9137 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9138 NewVecOp = Select; 9139 } 9140 Value *NewRed = 9141 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9142 Value *PrevInChain = State.get(getChainOp(), Part); 9143 Value *NextInChain; 9144 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9145 NextInChain = 9146 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9147 NewRed, PrevInChain); 9148 } else { 9149 NextInChain = State.Builder.CreateBinOp( 9150 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 9151 PrevInChain); 9152 } 9153 State.set(this, NextInChain, Part); 9154 } 9155 } 9156 9157 void VPReplicateRecipe::execute(VPTransformState &State) { 9158 if (State.Instance) { // Generate a single instance. 9159 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9160 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9161 *State.Instance, IsPredicated, State); 9162 // Insert scalar instance packing it into a vector. 9163 if (AlsoPack && State.VF.isVector()) { 9164 // If we're constructing lane 0, initialize to start from poison. 
9165 if (State.Instance->Lane.isFirstLane()) {
9166 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9167 Value *Poison = PoisonValue::get(
9168 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9169 State.set(this, Poison, State.Instance->Part);
9170 }
9171 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9172 }
9173 return;
9174 }
9175
9176 // Generate scalar instances for all VF lanes of all UF parts, unless the
9177 // instruction is uniform in which case generate only the first lane for each
9178 // of the UF parts.
9179 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9180 assert((!State.VF.isScalable() || IsUniform) &&
9181 "Can't scalarize a scalable vector");
9182 for (unsigned Part = 0; Part < State.UF; ++Part)
9183 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9184 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9185 VPIteration(Part, Lane), IsPredicated,
9186 State);
9187 }
9188
9189 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9190 assert(State.Instance && "Branch on Mask works only on single instance.");
9191
9192 unsigned Part = State.Instance->Part;
9193 unsigned Lane = State.Instance->Lane.getKnownLane();
9194
9195 Value *ConditionBit = nullptr;
9196 VPValue *BlockInMask = getMask();
9197 if (BlockInMask) {
9198 ConditionBit = State.get(BlockInMask, Part);
9199 if (ConditionBit->getType()->isVectorTy())
9200 ConditionBit = State.Builder.CreateExtractElement(
9201 ConditionBit, State.Builder.getInt32(Lane));
9202 } else // Block in mask is all-one.
9203 ConditionBit = State.Builder.getTrue();
9204
9205 // Replace the temporary unreachable terminator with a new conditional branch,
9206 // whose two destinations will be set later when they are created.
9207 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9208 assert(isa<UnreachableInst>(CurrentTerminator) &&
9209 "Expected to replace unreachable terminator with conditional branch.");
9210 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9211 CondBr->setSuccessor(0, nullptr);
9212 ReplaceInstWithInst(CurrentTerminator, CondBr);
9213 }
9214
9215 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9216 assert(State.Instance && "Predicated instruction PHI works per instance.");
9217 Instruction *ScalarPredInst =
9218 cast<Instruction>(State.get(getOperand(0), *State.Instance));
9219 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9220 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9221 assert(PredicatingBB && "Predicated block has no single predecessor.");
9222 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9223 "operand must be VPReplicateRecipe");
9224
9225 // By current pack/unpack logic we need to generate only a single phi node: if
9226 // a vector value for the predicated instruction exists at this point it means
9227 // the instruction has vector users only, and a phi for the vector value is
9228 // needed. In this case the recipe of the predicated instruction is marked to
9229 // also do that packing, thereby "hoisting" the insert-element sequence.
9230 // Otherwise, a phi node for the scalar value is needed.
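// In both cases the phi merges the value arriving from the predicating block
// (the unmodified vector, or poison in the scalar case) with the value
// produced inside the predicated block.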
9231 unsigned Part = State.Instance->Part; 9232 if (State.hasVectorValue(getOperand(0), Part)) { 9233 Value *VectorValue = State.get(getOperand(0), Part); 9234 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9235 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9236 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9237 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9238 if (State.hasVectorValue(this, Part)) 9239 State.reset(this, VPhi, Part); 9240 else 9241 State.set(this, VPhi, Part); 9242 // NOTE: Currently we need to update the value of the operand, so the next 9243 // predicated iteration inserts its generated value in the correct vector. 9244 State.reset(getOperand(0), VPhi, Part); 9245 } else { 9246 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9247 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9248 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9249 PredicatingBB); 9250 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9251 if (State.hasScalarValue(this, *State.Instance)) 9252 State.reset(this, Phi, *State.Instance); 9253 else 9254 State.set(this, Phi, *State.Instance); 9255 // NOTE: Currently we need to update the value of the operand, so the next 9256 // predicated iteration inserts its generated value in the correct vector. 9257 State.reset(getOperand(0), Phi, *State.Instance); 9258 } 9259 } 9260 9261 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9262 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9263 State.ILV->vectorizeMemoryInstruction(&Ingredient, State, 9264 StoredValue ? nullptr : getVPValue(), 9265 getAddr(), StoredValue, getMask()); 9266 } 9267 9268 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9269 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9270 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9271 // for predication. 9272 static ScalarEpilogueLowering getScalarEpilogueLowering( 9273 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9274 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9275 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9276 LoopVectorizationLegality &LVL) { 9277 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9278 // don't look at hints or options, and don't request a scalar epilogue. 9279 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9280 // LoopAccessInfo (due to code dependency and not being able to reliably get 9281 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9282 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9283 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9284 // back to the old way and vectorize with versioning when forced. See D81345.) 
9285 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9286 PGSOQueryType::IRPass) && 9287 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9288 return CM_ScalarEpilogueNotAllowedOptSize; 9289 9290 // 2) If set, obey the directives 9291 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9292 switch (PreferPredicateOverEpilogue) { 9293 case PreferPredicateTy::ScalarEpilogue: 9294 return CM_ScalarEpilogueAllowed; 9295 case PreferPredicateTy::PredicateElseScalarEpilogue: 9296 return CM_ScalarEpilogueNotNeededUsePredicate; 9297 case PreferPredicateTy::PredicateOrDontVectorize: 9298 return CM_ScalarEpilogueNotAllowedUsePredicate; 9299 }; 9300 } 9301 9302 // 3) If set, obey the hints 9303 switch (Hints.getPredicate()) { 9304 case LoopVectorizeHints::FK_Enabled: 9305 return CM_ScalarEpilogueNotNeededUsePredicate; 9306 case LoopVectorizeHints::FK_Disabled: 9307 return CM_ScalarEpilogueAllowed; 9308 }; 9309 9310 // 4) if the TTI hook indicates this is profitable, request predication. 9311 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9312 LVL.getLAI())) 9313 return CM_ScalarEpilogueNotNeededUsePredicate; 9314 9315 return CM_ScalarEpilogueAllowed; 9316 } 9317 9318 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9319 // If Values have been set for this Def return the one relevant for \p Part. 9320 if (hasVectorValue(Def, Part)) 9321 return Data.PerPartOutput[Def][Part]; 9322 9323 if (!hasScalarValue(Def, {Part, 0})) { 9324 Value *IRV = Def->getLiveInIRValue(); 9325 Value *B = ILV->getBroadcastInstrs(IRV); 9326 set(Def, B, Part); 9327 return B; 9328 } 9329 9330 Value *ScalarValue = get(Def, {Part, 0}); 9331 // If we aren't vectorizing, we can just copy the scalar map values over 9332 // to the vector map. 9333 if (VF.isScalar()) { 9334 set(Def, ScalarValue, Part); 9335 return ScalarValue; 9336 } 9337 9338 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 9339 bool IsUniform = RepR && RepR->isUniform(); 9340 9341 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9342 // Check if there is a scalar value for the selected lane. 9343 if (!hasScalarValue(Def, {Part, LastLane})) { 9344 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 9345 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 9346 "unexpected recipe found to be invariant"); 9347 IsUniform = true; 9348 LastLane = 0; 9349 } 9350 9351 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 9352 9353 // Set the insert point after the last scalarized instruction. This 9354 // ensures the insertelement sequence will directly follow the scalar 9355 // definitions. 9356 auto OldIP = Builder.saveIP(); 9357 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 9358 Builder.SetInsertPoint(&*NewIP); 9359 9360 // However, if we are vectorizing, we need to construct the vector values. 9361 // If the value is known to be uniform after vectorization, we can just 9362 // broadcast the scalar value corresponding to lane zero for each unroll 9363 // iteration. Otherwise, we construct the vector values using 9364 // insertelement instructions. Since the resulting vectors are stored in 9365 // State, we will only generate the insertelements once. 9366 Value *VectorValue = nullptr; 9367 if (IsUniform) { 9368 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 9369 set(Def, VectorValue, Part); 9370 } else { 9371 // Initialize packing with insertelements to start from undef. 
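// (The initial value is poison; every lane is overwritten by the
// packScalarIntoVectorValue calls below, so no poison lane survives.)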
9372 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 9373 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 9374 set(Def, Undef, Part); 9375 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 9376 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 9377 VectorValue = get(Def, Part); 9378 } 9379 Builder.restoreIP(OldIP); 9380 return VectorValue; 9381 } 9382 9383 // Process the loop in the VPlan-native vectorization path. This path builds 9384 // VPlan upfront in the vectorization pipeline, which allows to apply 9385 // VPlan-to-VPlan transformations from the very beginning without modifying the 9386 // input LLVM IR. 9387 static bool processLoopInVPlanNativePath( 9388 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9389 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9390 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9391 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9392 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 9393 9394 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9395 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9396 return false; 9397 } 9398 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9399 Function *F = L->getHeader()->getParent(); 9400 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9401 9402 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9403 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 9404 9405 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9406 &Hints, IAI); 9407 // Use the planner for outer loop vectorization. 9408 // TODO: CM is not used at this point inside the planner. Turn CM into an 9409 // optional argument if we don't need it in the future. 9410 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 9411 9412 // Get user vectorization factor. 9413 ElementCount UserVF = Hints.getWidth(); 9414 9415 // Plan how to best vectorize, return the best VF and its cost. 9416 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9417 9418 // If we are stress testing VPlan builds, do not attempt to generate vector 9419 // code. Masked vector code generation support will follow soon. 9420 // Also, do not attempt to vectorize if no vector code will be produced. 9421 if (VPlanBuildStressTest || EnableVPlanPredication || 9422 VectorizationFactor::Disabled() == VF) 9423 return false; 9424 9425 LVP.setBestPlan(VF.Width, 1); 9426 9427 { 9428 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 9429 F->getParent()->getDataLayout()); 9430 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 9431 &CM, BFI, PSI, Checks); 9432 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9433 << L->getHeader()->getParent()->getName() << "\"\n"); 9434 LVP.executePlan(LB, DT); 9435 } 9436 9437 // Mark the loop as already vectorized to avoid vectorizing again. 9438 Hints.setAlreadyVectorized(); 9439 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9440 return true; 9441 } 9442 9443 // Emit a remark if there are stores to floats that required a floating point 9444 // extension. If the vectorized loop was generated with floating point there 9445 // will be a performance penalty from the conversion overhead and the change in 9446 // the vector width. 
9447 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 9448 SmallVector<Instruction *, 4> Worklist; 9449 for (BasicBlock *BB : L->getBlocks()) { 9450 for (Instruction &Inst : *BB) { 9451 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 9452 if (S->getValueOperand()->getType()->isFloatTy()) 9453 Worklist.push_back(S); 9454 } 9455 } 9456 } 9457 9458 // Traverse the floating point stores upwards searching, for floating point 9459 // conversions. 9460 SmallPtrSet<const Instruction *, 4> Visited; 9461 SmallPtrSet<const Instruction *, 4> EmittedRemark; 9462 while (!Worklist.empty()) { 9463 auto *I = Worklist.pop_back_val(); 9464 if (!L->contains(I)) 9465 continue; 9466 if (!Visited.insert(I).second) 9467 continue; 9468 9469 // Emit a remark if the floating point store required a floating 9470 // point conversion. 9471 // TODO: More work could be done to identify the root cause such as a 9472 // constant or a function return type and point the user to it. 9473 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 9474 ORE->emit([&]() { 9475 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 9476 I->getDebugLoc(), L->getHeader()) 9477 << "floating point conversion changes vector width. " 9478 << "Mixed floating point precision requires an up/down " 9479 << "cast that will negatively impact performance."; 9480 }); 9481 9482 for (Use &Op : I->operands()) 9483 if (auto *OpI = dyn_cast<Instruction>(Op)) 9484 Worklist.push_back(OpI); 9485 } 9486 } 9487 9488 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9489 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9490 !EnableLoopInterleaving), 9491 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9492 !EnableLoopVectorization) {} 9493 9494 bool LoopVectorizePass::processLoop(Loop *L) { 9495 assert((EnableVPlanNativePath || L->isInnermost()) && 9496 "VPlan-native path is not enabled. Only process inner loops."); 9497 9498 #ifndef NDEBUG 9499 const std::string DebugLocStr = getDebugLocString(L); 9500 #endif /* NDEBUG */ 9501 9502 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 9503 << L->getHeader()->getParent()->getName() << "\" from " 9504 << DebugLocStr << "\n"); 9505 9506 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 9507 9508 LLVM_DEBUG( 9509 dbgs() << "LV: Loop hints:" 9510 << " force=" 9511 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9512 ? "disabled" 9513 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9514 ? "enabled" 9515 : "?")) 9516 << " width=" << Hints.getWidth() 9517 << " unroll=" << Hints.getInterleave() << "\n"); 9518 9519 // Function containing loop 9520 Function *F = L->getHeader()->getParent(); 9521 9522 // Looking at the diagnostic output is the only way to determine if a loop 9523 // was vectorized (other than looking at the IR or machine code), so it 9524 // is important to generate an optimization remark for each loop. Most of 9525 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9526 // generated as OptimizationRemark and OptimizationRemarkMissed are 9527 // less verbose reporting vectorized loops and unvectorized loops that may 9528 // benefit from vectorization, respectively. 9529 9530 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9531 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9532 return false; 9533 } 9534 9535 PredicatedScalarEvolution PSE(*SE, *L); 9536 9537 // Check if it is legal to vectorize the loop. 
9538 LoopVectorizationRequirements Requirements(*ORE); 9539 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 9540 &Requirements, &Hints, DB, AC, BFI, PSI); 9541 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9542 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9543 Hints.emitRemarkWithHints(); 9544 return false; 9545 } 9546 9547 // Check the function attributes and profiles to find out if this function 9548 // should be optimized for size. 9549 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9550 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 9551 9552 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9553 // here. They may require CFG and instruction level transformations before 9554 // even evaluating whether vectorization is profitable. Since we cannot modify 9555 // the incoming IR, we need to build VPlan upfront in the vectorization 9556 // pipeline. 9557 if (!L->isInnermost()) 9558 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9559 ORE, BFI, PSI, Hints); 9560 9561 assert(L->isInnermost() && "Inner loop expected."); 9562 9563 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9564 // count by optimizing for size, to minimize overheads. 9565 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9566 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9567 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9568 << "This loop is worth vectorizing only if no scalar " 9569 << "iteration overheads are incurred."); 9570 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9571 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9572 else { 9573 LLVM_DEBUG(dbgs() << "\n"); 9574 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9575 } 9576 } 9577 9578 // Check the function attributes to see if implicit floats are allowed. 9579 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9580 // an integer loop and the vector instructions selected are purely integer 9581 // vector instructions? 9582 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9583 reportVectorizationFailure( 9584 "Can't vectorize when the NoImplicitFloat attribute is used", 9585 "loop not vectorized due to NoImplicitFloat attribute", 9586 "NoImplicitFloat", ORE, L); 9587 Hints.emitRemarkWithHints(); 9588 return false; 9589 } 9590 9591 // Check if the target supports potentially unsafe FP vectorization. 9592 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9593 // for the target we're vectorizing for, to make sure none of the 9594 // additional fp-math flags can help. 
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  if (!Requirements.canVectorizeFPMath(Hints)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get the user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize; return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being
    // explicitly requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
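    // (Illustrative note: the interleave count is the number of unrolled
    // copies of the (scalar or vector) loop body executed per iteration to
    // hide latency; with IC == 2 the induction step becomes 2 * VF per
    // iteration, while IC == 1 means the cost model found no such unrolling
    // worthwhile.)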
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if the user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out not
    // to be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
    LVP.setBestPlan(VF.Width, IC);

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not profitable to vectorize the loop, then
      // interleave it.
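      // (Illustrative note: interleaving-only keeps every instruction scalar
      // but emits IC copies of the loop body per iteration, which can still
      // expose instruction-level parallelism, e.g. for reduction loops whose
      // element type the target cannot profitably vectorize.)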
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);
      LVP.executePlan(Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *profitable* to vectorize the loop, then do
      // it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar
        // epilogue to be vectorized by executing the plan (potentially with a
        // different factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
                                          EpilogueVF.Width.getKnownMinValue(),
                                          1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
        LVP.executePlan(MainILV, DT);
        ++LoopsVectorized;

        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // The second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
        LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);
        LVP.executePlan(EpilogILV, DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);
        LVP.executePlan(LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling a scalar loop when there
        // are no runtime checks about strides and memory. A scalar loop that
        // is rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
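    // (Illustrative note: this is recorded as "llvm.loop.isvectorized" loop
    // metadata, which later runs of the vectorizer treat as "nothing left to
    // do" for this loop.)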
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt vectorization if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loop info/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for the VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}
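
// Usage note: the pass is registered with the new pass manager under the name
// "loop-vectorize" and runs as part of the default -O2/-O3 pipelines; for
// experimentation it can also be exercised in isolation, e.g.
//   opt -passes=loop-vectorize -S in.ll -o out.ll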