//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
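//
// For illustration (a hypothetical input, not taken from this file): a scalar
// loop such as
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
// is conceptually rewritten so that each vector iteration loads, adds and
// stores VF elements at once and the induction variable is incremented by VF,
// with the remaining n % VF iterations handled by a scalar epilogue loop (or
// folded into the vector body under predication).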
53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanHCFGBuilder.h" 61 #include "VPlanPredicator.h" 62 #include "VPlanTransforms.h" 63 #include "llvm/ADT/APInt.h" 64 #include "llvm/ADT/ArrayRef.h" 65 #include "llvm/ADT/DenseMap.h" 66 #include "llvm/ADT/DenseMapInfo.h" 67 #include "llvm/ADT/Hashing.h" 68 #include "llvm/ADT/MapVector.h" 69 #include "llvm/ADT/None.h" 70 #include "llvm/ADT/Optional.h" 71 #include "llvm/ADT/STLExtras.h" 72 #include "llvm/ADT/SmallPtrSet.h" 73 #include "llvm/ADT/SmallVector.h" 74 #include "llvm/ADT/Statistic.h" 75 #include "llvm/ADT/StringRef.h" 76 #include "llvm/ADT/Twine.h" 77 #include "llvm/ADT/iterator_range.h" 78 #include "llvm/Analysis/AssumptionCache.h" 79 #include "llvm/Analysis/BasicAliasAnalysis.h" 80 #include "llvm/Analysis/BlockFrequencyInfo.h" 81 #include "llvm/Analysis/CFG.h" 82 #include "llvm/Analysis/CodeMetrics.h" 83 #include "llvm/Analysis/DemandedBits.h" 84 #include "llvm/Analysis/GlobalsModRef.h" 85 #include "llvm/Analysis/LoopAccessAnalysis.h" 86 #include "llvm/Analysis/LoopAnalysisManager.h" 87 #include "llvm/Analysis/LoopInfo.h" 88 #include "llvm/Analysis/LoopIterator.h" 89 #include "llvm/Analysis/MemorySSA.h" 90 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 91 #include "llvm/Analysis/ProfileSummaryInfo.h" 92 #include "llvm/Analysis/ScalarEvolution.h" 93 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 94 #include "llvm/Analysis/TargetLibraryInfo.h" 95 #include "llvm/Analysis/TargetTransformInfo.h" 96 #include "llvm/Analysis/VectorUtils.h" 97 #include "llvm/IR/Attributes.h" 98 #include "llvm/IR/BasicBlock.h" 99 #include "llvm/IR/CFG.h" 100 #include "llvm/IR/Constant.h" 101 #include "llvm/IR/Constants.h" 102 #include "llvm/IR/DataLayout.h" 103 #include "llvm/IR/DebugInfoMetadata.h" 104 #include "llvm/IR/DebugLoc.h" 105 #include "llvm/IR/DerivedTypes.h" 106 #include "llvm/IR/DiagnosticInfo.h" 107 #include "llvm/IR/Dominators.h" 108 #include "llvm/IR/Function.h" 109 #include "llvm/IR/IRBuilder.h" 110 #include "llvm/IR/InstrTypes.h" 111 #include "llvm/IR/Instruction.h" 112 #include "llvm/IR/Instructions.h" 113 #include "llvm/IR/IntrinsicInst.h" 114 #include "llvm/IR/Intrinsics.h" 115 #include "llvm/IR/LLVMContext.h" 116 #include "llvm/IR/Metadata.h" 117 #include "llvm/IR/Module.h" 118 #include "llvm/IR/Operator.h" 119 #include "llvm/IR/PatternMatch.h" 120 #include "llvm/IR/Type.h" 121 #include "llvm/IR/Use.h" 122 #include "llvm/IR/User.h" 123 #include "llvm/IR/Value.h" 124 #include "llvm/IR/ValueHandle.h" 125 #include "llvm/IR/Verifier.h" 126 #include "llvm/InitializePasses.h" 127 #include "llvm/Pass.h" 128 #include "llvm/Support/Casting.h" 129 #include "llvm/Support/CommandLine.h" 130 #include "llvm/Support/Compiler.h" 131 #include "llvm/Support/Debug.h" 132 #include "llvm/Support/ErrorHandling.h" 133 #include "llvm/Support/InstructionCost.h" 134 #include "llvm/Support/MathExtras.h" 135 #include "llvm/Support/raw_ostream.h" 136 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 137 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 138 #include "llvm/Transforms/Utils/LoopSimplify.h" 139 #include "llvm/Transforms/Utils/LoopUtils.h" 140 #include "llvm/Transforms/Utils/LoopVersioning.h" 141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 142 #include "llvm/Transforms/Utils/SizeOpts.h" 143 #include 
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks with a " 204 "vectorize(enable) pragma.")); 205 206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 207 // that predication is preferred, and this lists all options. I.e., the 208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 209 // and predicate the instructions accordingly. 
If tail-folding fails, there are 210 // different fallback strategies depending on these values: 211 namespace PreferPredicateTy { 212 enum Option { 213 ScalarEpilogue = 0, 214 PredicateElseScalarEpilogue, 215 PredicateOrDontVectorize 216 }; 217 } // namespace PreferPredicateTy 218 219 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( 220 "prefer-predicate-over-epilogue", 221 cl::init(PreferPredicateTy::ScalarEpilogue), 222 cl::Hidden, 223 cl::desc("Tail-folding and predication preferences over creating a scalar " 224 "epilogue loop."), 225 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, 226 "scalar-epilogue", 227 "Don't tail-predicate loops, create scalar epilogue"), 228 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, 229 "predicate-else-scalar-epilogue", 230 "prefer tail-folding, create scalar epilogue if tail " 231 "folding fails."), 232 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, 233 "predicate-dont-vectorize", 234 "prefers tail-folding, don't attempt vectorization if " 235 "tail-folding fails."))); 236 237 static cl::opt<bool> MaximizeBandwidth( 238 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, 239 cl::desc("Maximize bandwidth when selecting vectorization factor which " 240 "will be determined by the smallest type in loop.")); 241 242 static cl::opt<bool> EnableInterleavedMemAccesses( 243 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, 244 cl::desc("Enable vectorization on interleaved memory accesses in a loop")); 245 246 /// An interleave-group may need masking if it resides in a block that needs 247 /// predication, or in order to mask away gaps. 248 static cl::opt<bool> EnableMaskedInterleavedMemAccesses( 249 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, 250 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); 251 252 static cl::opt<unsigned> TinyTripCountInterleaveThreshold( 253 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, 254 cl::desc("We don't interleave loops with a estimated constant trip count " 255 "below this number")); 256 257 static cl::opt<unsigned> ForceTargetNumScalarRegs( 258 "force-target-num-scalar-regs", cl::init(0), cl::Hidden, 259 cl::desc("A flag that overrides the target's number of scalar registers.")); 260 261 static cl::opt<unsigned> ForceTargetNumVectorRegs( 262 "force-target-num-vector-regs", cl::init(0), cl::Hidden, 263 cl::desc("A flag that overrides the target's number of vector registers.")); 264 265 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( 266 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, 267 cl::desc("A flag that overrides the target's max interleave factor for " 268 "scalar loops.")); 269 270 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( 271 "force-target-max-vector-interleave", cl::init(0), cl::Hidden, 272 cl::desc("A flag that overrides the target's max interleave factor for " 273 "vectorized loops.")); 274 275 static cl::opt<unsigned> ForceTargetInstructionCost( 276 "force-target-instruction-cost", cl::init(0), cl::Hidden, 277 cl::desc("A flag that overrides the target's expected cost for " 278 "an instruction to a single constant value. 
Mostly " 279 "useful for getting consistent testing.")); 280 281 static cl::opt<bool> ForceTargetSupportsScalableVectors( 282 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 283 cl::desc( 284 "Pretend that scalable vectors are supported, even if the target does " 285 "not support them. This flag should only be used for testing.")); 286 287 static cl::opt<unsigned> SmallLoopCost( 288 "small-loop-cost", cl::init(20), cl::Hidden, 289 cl::desc( 290 "The cost of a loop that is considered 'small' by the interleaver.")); 291 292 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 293 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 294 cl::desc("Enable the use of the block frequency analysis to access PGO " 295 "heuristics minimizing code growth in cold regions and being more " 296 "aggressive in hot regions.")); 297 298 // Runtime interleave loops for load/store throughput. 299 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 300 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 301 cl::desc( 302 "Enable runtime interleaving until load/store ports are saturated")); 303 304 /// Interleave small loops with scalar reductions. 305 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 306 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 307 cl::desc("Enable interleaving for loops with small iteration counts that " 308 "contain scalar reductions to expose ILP.")); 309 310 /// The number of stores in a loop that are allowed to need predication. 311 static cl::opt<unsigned> NumberOfStoresToPredicate( 312 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 313 cl::desc("Max number of stores to be predicated behind an if.")); 314 315 static cl::opt<bool> EnableIndVarRegisterHeur( 316 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 317 cl::desc("Count the induction variable only once when interleaving")); 318 319 static cl::opt<bool> EnableCondStoresVectorization( 320 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 321 cl::desc("Enable if predication of stores during vectorization.")); 322 323 static cl::opt<unsigned> MaxNestedScalarReductionIC( 324 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 325 cl::desc("The maximum interleave count to use when interleaving a scalar " 326 "reduction in a nested loop.")); 327 328 static cl::opt<bool> 329 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 330 cl::Hidden, 331 cl::desc("Prefer in-loop vector reductions, " 332 "overriding the targets preference.")); 333 334 cl::opt<bool> EnableStrictReductions( 335 "enable-strict-reductions", cl::init(false), cl::Hidden, 336 cl::desc("Enable the vectorisation of loops with in-order (strict) " 337 "FP reductions")); 338 339 static cl::opt<bool> PreferPredicatedReductionSelect( 340 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 341 cl::desc( 342 "Prefer predicating a reduction operation over an after loop select.")); 343 344 cl::opt<bool> EnableVPlanNativePath( 345 "enable-vplan-native-path", cl::init(false), cl::Hidden, 346 cl::desc("Enable VPlan-native vectorization path with " 347 "support for outer loop vectorization.")); 348 349 // FIXME: Remove this switch once we have divergence analysis. Currently we 350 // assume divergent non-backedge branches when this switch is true. 
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns the "best known" trip count for the specified loop \p L as defined
/// by the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
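  // (For illustration, with hypothetical numbers not taken from this file: a
  // loop with a compile-time bound of 1000 returns 1000 from this exact-count
  // check; a loop with an unknown bound but profile data may return an
  // estimate such as 300 from the profile-based check; if only a maximum such
  // as 4096 can be proven, that upper bound is returned; otherwise the result
  // is None.)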
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
494 void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, 495 bool InvariantCond, VPTransformState &State); 496 497 /// Fix the vectorized code, taking care of header phi's, live-outs, and more. 498 void fixVectorizedLoop(VPTransformState &State); 499 500 // Return true if any runtime check is added. 501 bool areSafetyChecksAdded() { return AddedSafetyChecks; } 502 503 /// A type for vectorized values in the new loop. Each value from the 504 /// original loop, when vectorized, is represented by UF vector values in the 505 /// new unrolled loop, where UF is the unroll factor. 506 using VectorParts = SmallVector<Value *, 2>; 507 508 /// Vectorize a single GetElementPtrInst based on information gathered and 509 /// decisions taken during planning. 510 void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, 511 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, 512 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); 513 514 /// Vectorize a single PHINode in a block. This method handles the induction 515 /// variable canonicalization. It supports both VF = 1 for unrolled loops and 516 /// arbitrary length vectors. 517 void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc, 518 VPWidenPHIRecipe *PhiR, VPTransformState &State); 519 520 /// A helper function to scalarize a single Instruction in the innermost loop. 521 /// Generates a sequence of scalar instances for each lane between \p MinLane 522 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 523 /// inclusive. Uses the VPValue operands from \p Operands instead of \p 524 /// Instr's operands. 525 void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands, 526 const VPIteration &Instance, bool IfPredicateInstr, 527 VPTransformState &State); 528 529 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc 530 /// is provided, the integer induction variable will first be truncated to 531 /// the corresponding type. 532 void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc, 533 VPValue *Def, VPValue *CastDef, 534 VPTransformState &State); 535 536 /// Construct the vector value of a scalarized value \p V one lane at a time. 537 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, 538 VPTransformState &State); 539 540 /// Try to vectorize interleaved access group \p Group with the base address 541 /// given in \p Addr, optionally masking the vector operations if \p 542 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 543 /// values in the vectorized loop. 544 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 545 ArrayRef<VPValue *> VPDefs, 546 VPTransformState &State, VPValue *Addr, 547 ArrayRef<VPValue *> StoredValues, 548 VPValue *BlockInMask = nullptr); 549 550 /// Vectorize Load and Store instructions with the base address given in \p 551 /// Addr, optionally masking the vector operations if \p BlockInMask is 552 /// non-null. Use \p State to translate given VPValues to IR values in the 553 /// vectorized loop. 554 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, 555 VPValue *Def, VPValue *Addr, 556 VPValue *StoredValue, VPValue *BlockInMask); 557 558 /// Set the debug location in the builder using the debug location in 559 /// the instruction. 560 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); 561 562 /// Fix the non-induction PHIs in the OrigPHIsToFix vector. 
563 void fixNonInductionPHIs(VPTransformState &State); 564 565 /// Create a broadcast instruction. This method generates a broadcast 566 /// instruction (shuffle) for loop invariant values and for the induction 567 /// value. If this is the induction variable then we extend it to N, N+1, ... 568 /// this is needed because each iteration in the loop corresponds to a SIMD 569 /// element. 570 virtual Value *getBroadcastInstrs(Value *V); 571 572 protected: 573 friend class LoopVectorizationPlanner; 574 575 /// A small list of PHINodes. 576 using PhiVector = SmallVector<PHINode *, 4>; 577 578 /// A type for scalarized values in the new loop. Each value from the 579 /// original loop, when scalarized, is represented by UF x VF scalar values 580 /// in the new unrolled loop, where UF is the unroll factor and VF is the 581 /// vectorization factor. 582 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 583 584 /// Set up the values of the IVs correctly when exiting the vector loop. 585 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 586 Value *CountRoundDown, Value *EndValue, 587 BasicBlock *MiddleBlock); 588 589 /// Create a new induction variable inside L. 590 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, 591 Value *Step, Instruction *DL); 592 593 /// Handle all cross-iteration phis in the header. 594 void fixCrossIterationPHIs(VPTransformState &State); 595 596 /// Fix a first-order recurrence. This is the second phase of vectorizing 597 /// this phi node. 598 void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State); 599 600 /// Fix a reduction cross-iteration phi. This is the second phase of 601 /// vectorizing this phi node. 602 void fixReduction(PHINode *Phi, VPTransformState &State); 603 604 /// Clear NSW/NUW flags from reduction instructions if necessary. 605 void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 606 VPTransformState &State); 607 608 /// Fixup the LCSSA phi nodes in the unique exit block. This simply 609 /// means we need to add the appropriate incoming value from the middle 610 /// block as exiting edges from the scalar epilogue loop (if present) are 611 /// already in place, and we exit the vector loop exclusively to the middle 612 /// block. 613 void fixLCSSAPHIs(VPTransformState &State); 614 615 /// Iteratively sink the scalarized operands of a predicated instruction into 616 /// the block that was created for it. 617 void sinkScalarOperands(Instruction *PredInst); 618 619 /// Shrinks vector element sizes to the smallest bitwidth they can be legally 620 /// represented as. 621 void truncateToMinimalBitwidths(VPTransformState &State); 622 623 /// This function adds 624 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 625 /// to each vector element of Val. The sequence starts at StartIndex. 626 /// \p Opcode is relevant for FP induction variable. 627 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step, 628 Instruction::BinaryOps Opcode = 629 Instruction::BinaryOpsEnd); 630 631 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 632 /// variable on which to base the steps, \p Step is the size of the step, and 633 /// \p EntryVal is the value from the original loop that maps to the steps. 634 /// Note that \p EntryVal doesn't have to be an induction variable - it 635 /// can also be a truncate instruction. 
636 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, 637 const InductionDescriptor &ID, VPValue *Def, 638 VPValue *CastDef, VPTransformState &State); 639 640 /// Create a vector induction phi node based on an existing scalar one. \p 641 /// EntryVal is the value from the original loop that maps to the vector phi 642 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a 643 /// truncate instruction, instead of widening the original IV, we widen a 644 /// version of the IV truncated to \p EntryVal's type. 645 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, 646 Value *Step, Value *Start, 647 Instruction *EntryVal, VPValue *Def, 648 VPValue *CastDef, 649 VPTransformState &State); 650 651 /// Returns true if an instruction \p I should be scalarized instead of 652 /// vectorized for the chosen vectorization factor. 653 bool shouldScalarizeInstruction(Instruction *I) const; 654 655 /// Returns true if we should generate a scalar version of \p IV. 656 bool needsScalarInduction(Instruction *IV) const; 657 658 /// If there is a cast involved in the induction variable \p ID, which should 659 /// be ignored in the vectorized loop body, this function records the 660 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the 661 /// cast. We had already proved that the casted Phi is equal to the uncasted 662 /// Phi in the vectorized loop (under a runtime guard), and therefore 663 /// there is no need to vectorize the cast - the same value can be used in the 664 /// vector loop for both the Phi and the cast. 665 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, 666 /// Otherwise, \p VectorLoopValue is a widened/vectorized value. 667 /// 668 /// \p EntryVal is the value from the original loop that maps to the vector 669 /// phi node and is used to distinguish what is the IV currently being 670 /// processed - original one (if \p EntryVal is a phi corresponding to the 671 /// original IV) or the "newly-created" one based on the proof mentioned above 672 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the 673 /// latter case \p EntryVal is a TruncInst and we must not record anything for 674 /// that IV, but it's error-prone to expect callers of this routine to care 675 /// about that, hence this explicit parameter. 676 void recordVectorLoopValueForInductionCast( 677 const InductionDescriptor &ID, const Instruction *EntryVal, 678 Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State, 679 unsigned Part, unsigned Lane = UINT_MAX); 680 681 /// Generate a shuffle sequence that will reverse the vector Vec. 682 virtual Value *reverseVector(Value *Vec); 683 684 /// Returns (and creates if needed) the original loop trip count. 685 Value *getOrCreateTripCount(Loop *NewLoop); 686 687 /// Returns (and creates if needed) the trip count of the widened loop. 688 Value *getOrCreateVectorTripCount(Loop *NewLoop); 689 690 /// Returns a bitcasted value to the requested vector type. 691 /// Also handles bitcasts of vector<float> <-> vector<pointer> types. 692 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, 693 const DataLayout &DL); 694 695 /// Emit a bypass check to see if the vector trip count is zero, including if 696 /// it overflows. 697 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); 698 699 /// Emit a bypass check to see if all of the SCEV assumptions we've 700 /// had to make are correct. 
Returns the block containing the checks or 701 /// nullptr if no checks have been added. 702 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass); 703 704 /// Emit bypass checks to check any memory assumptions we may have made. 705 /// Returns the block containing the checks or nullptr if no checks have been 706 /// added. 707 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); 708 709 /// Compute the transformed value of Index at offset StartValue using step 710 /// StepValue. 711 /// For integer induction, returns StartValue + Index * StepValue. 712 /// For pointer induction, returns StartValue[Index * StepValue]. 713 /// FIXME: The newly created binary instructions should contain nsw/nuw 714 /// flags, which can be found from the original scalar operations. 715 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, 716 const DataLayout &DL, 717 const InductionDescriptor &ID) const; 718 719 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 720 /// vector loop preheader, middle block and scalar preheader. Also 721 /// allocate a loop object for the new vector loop and return it. 722 Loop *createVectorLoopSkeleton(StringRef Prefix); 723 724 /// Create new phi nodes for the induction variables to resume iteration count 725 /// in the scalar epilogue, from where the vectorized loop left off (given by 726 /// \p VectorTripCount). 727 /// In cases where the loop skeleton is more complicated (eg. epilogue 728 /// vectorization) and the resume values can come from an additional bypass 729 /// block, the \p AdditionalBypass pair provides information about the bypass 730 /// block and the end value on the edge from bypass to this loop. 731 void createInductionResumeValues( 732 Loop *L, Value *VectorTripCount, 733 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); 734 735 /// Complete the loop skeleton by adding debug MDs, creating appropriate 736 /// conditional branches in the middle block, preparing the builder and 737 /// running the verifier. Take in the vector loop \p L as argument, and return 738 /// the preheader of the completed vector loop. 739 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); 740 741 /// Add additional metadata to \p To that was not present on \p Orig. 742 /// 743 /// Currently this is used to add the noalias annotations based on the 744 /// inserted memchecks. Use this for instructions that are *cloned* into the 745 /// vector loop. 746 void addNewMetadata(Instruction *To, const Instruction *Orig); 747 748 /// Add metadata from one instruction to another. 749 /// 750 /// This includes both the original MDs from \p From and additional ones (\see 751 /// addNewMetadata). Use this for *newly created* instructions in the vector 752 /// loop. 753 void addMetadata(Instruction *To, Instruction *From); 754 755 /// Similar to the previous function but it adds the metadata to a 756 /// vector of instructions. 757 void addMetadata(ArrayRef<Value *> To, Instruction *From); 758 759 /// Allow subclasses to override and print debug traces before/after vplan 760 /// execution, when trace information is requested. 761 virtual void printDebugTracesAtStart(){}; 762 virtual void printDebugTracesAtEnd(){}; 763 764 /// The original loop. 765 Loop *OrigLoop; 766 767 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies 768 /// dynamic knowledge to simplify SCEV expressions and converts them to a 769 /// more usable form. 
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The (unique) ExitBlock of the scalar loop. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile-guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
877 GeneratedRTChecks &RTChecks; 878 }; 879 880 class InnerLoopUnroller : public InnerLoopVectorizer { 881 public: 882 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 883 LoopInfo *LI, DominatorTree *DT, 884 const TargetLibraryInfo *TLI, 885 const TargetTransformInfo *TTI, AssumptionCache *AC, 886 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, 887 LoopVectorizationLegality *LVL, 888 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 889 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) 890 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 891 ElementCount::getFixed(1), UnrollFactor, LVL, CM, 892 BFI, PSI, Check) {} 893 894 private: 895 Value *getBroadcastInstrs(Value *V) override; 896 Value *getStepVector(Value *Val, int StartIdx, Value *Step, 897 Instruction::BinaryOps Opcode = 898 Instruction::BinaryOpsEnd) override; 899 Value *reverseVector(Value *Vec) override; 900 }; 901 902 /// Encapsulate information regarding vectorization of a loop and its epilogue. 903 /// This information is meant to be updated and used across two stages of 904 /// epilogue vectorization. 905 struct EpilogueLoopVectorizationInfo { 906 ElementCount MainLoopVF = ElementCount::getFixed(0); 907 unsigned MainLoopUF = 0; 908 ElementCount EpilogueVF = ElementCount::getFixed(0); 909 unsigned EpilogueUF = 0; 910 BasicBlock *MainLoopIterationCountCheck = nullptr; 911 BasicBlock *EpilogueIterationCountCheck = nullptr; 912 BasicBlock *SCEVSafetyCheck = nullptr; 913 BasicBlock *MemSafetyCheck = nullptr; 914 Value *TripCount = nullptr; 915 Value *VectorTripCount = nullptr; 916 917 EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF, 918 unsigned EUF) 919 : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF), 920 EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) { 921 assert(EUF == 1 && 922 "A high UF for the epilogue loop is likely not beneficial."); 923 } 924 }; 925 926 /// An extension of the inner loop vectorizer that creates a skeleton for a 927 /// vectorized loop that has its epilogue (residual) also vectorized. 928 /// The idea is to run the vplan on a given loop twice, firstly to setup the 929 /// skeleton and vectorize the main loop, and secondly to complete the skeleton 930 /// from the first step and vectorize the epilogue. This is achieved by 931 /// deriving two concrete strategy classes from this base class and invoking 932 /// them in succession from the loop vectorizer planner. 933 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { 934 public: 935 InnerLoopAndEpilogueVectorizer( 936 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 937 DominatorTree *DT, const TargetLibraryInfo *TLI, 938 const TargetTransformInfo *TTI, AssumptionCache *AC, 939 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 940 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 941 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 942 GeneratedRTChecks &Checks) 943 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 944 EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, 945 Checks), 946 EPI(EPI) {} 947 948 // Override this function to handle the more complex control flow around the 949 // three loops. 
950 BasicBlock *createVectorizedLoopSkeleton() final override { 951 return createEpilogueVectorizedLoopSkeleton(); 952 } 953 954 /// The interface for creating a vectorized skeleton using one of two 955 /// different strategies, each corresponding to one execution of the vplan 956 /// as described above. 957 virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; 958 959 /// Holds and updates state information required to vectorize the main loop 960 /// and its epilogue in two separate passes. This setup helps us avoid 961 /// regenerating and recomputing runtime safety checks. It also helps us to 962 /// shorten the iteration-count-check path length for the cases where the 963 /// iteration count of the loop is so small that the main vector loop is 964 /// completely skipped. 965 EpilogueLoopVectorizationInfo &EPI; 966 }; 967 968 /// A specialized derived class of inner loop vectorizer that performs 969 /// vectorization of *main* loops in the process of vectorizing loops and their 970 /// epilogues. 971 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { 972 public: 973 EpilogueVectorizerMainLoop( 974 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 975 DominatorTree *DT, const TargetLibraryInfo *TLI, 976 const TargetTransformInfo *TTI, AssumptionCache *AC, 977 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 978 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 979 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 980 GeneratedRTChecks &Check) 981 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 982 EPI, LVL, CM, BFI, PSI, Check) {} 983 /// Implements the interface for creating a vectorized skeleton using the 984 /// *main loop* strategy (ie the first pass of vplan execution). 985 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; 986 987 protected: 988 /// Emits an iteration count bypass check once for the main loop (when \p 989 /// ForEpilogue is false) and once for the epilogue loop (when \p 990 /// ForEpilogue is true). 991 BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, 992 bool ForEpilogue); 993 void printDebugTracesAtStart() override; 994 void printDebugTracesAtEnd() override; 995 }; 996 997 // A specialized derived class of inner loop vectorizer that performs 998 // vectorization of *epilogue* loops in the process of vectorizing loops and 999 // their epilogues. 1000 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { 1001 public: 1002 EpilogueVectorizerEpilogueLoop( 1003 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 1004 DominatorTree *DT, const TargetLibraryInfo *TLI, 1005 const TargetTransformInfo *TTI, AssumptionCache *AC, 1006 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 1007 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 1008 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 1009 GeneratedRTChecks &Checks) 1010 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1011 EPI, LVL, CM, BFI, PSI, Checks) {} 1012 /// Implements the interface for creating a vectorized skeleton using the 1013 /// *epilogue loop* strategy (ie the second pass of vplan execution). 
1014 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; 1015 1016 protected: 1017 /// Emits an iteration count bypass check after the main vector loop has 1018 /// finished to see if there are any iterations left to execute by either 1019 /// the vector epilogue or the scalar epilogue. 1020 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L, 1021 BasicBlock *Bypass, 1022 BasicBlock *Insert); 1023 void printDebugTracesAtStart() override; 1024 void printDebugTracesAtEnd() override; 1025 }; 1026 } // end namespace llvm 1027 1028 /// Look for a meaningful debug location on the instruction or it's 1029 /// operands. 1030 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { 1031 if (!I) 1032 return I; 1033 1034 DebugLoc Empty; 1035 if (I->getDebugLoc() != Empty) 1036 return I; 1037 1038 for (Use &Op : I->operands()) { 1039 if (Instruction *OpInst = dyn_cast<Instruction>(Op)) 1040 if (OpInst->getDebugLoc() != Empty) 1041 return OpInst; 1042 } 1043 1044 return I; 1045 } 1046 1047 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { 1048 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) { 1049 const DILocation *DIL = Inst->getDebugLoc(); 1050 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && 1051 !isa<DbgInfoIntrinsic>(Inst)) { 1052 assert(!VF.isScalable() && "scalable vectors not yet supported."); 1053 auto NewDIL = 1054 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); 1055 if (NewDIL) 1056 B.SetCurrentDebugLocation(NewDIL.getValue()); 1057 else 1058 LLVM_DEBUG(dbgs() 1059 << "Failed to create new discriminator: " 1060 << DIL->getFilename() << " Line: " << DIL->getLine()); 1061 } 1062 else 1063 B.SetCurrentDebugLocation(DIL); 1064 } else 1065 B.SetCurrentDebugLocation(DebugLoc()); 1066 } 1067 1068 /// Write a record \p DebugMsg about vectorization failure to the debug 1069 /// output stream. If \p I is passed, it is an instruction that prevents 1070 /// vectorization. 1071 #ifndef NDEBUG 1072 static void debugVectorizationFailure(const StringRef DebugMsg, 1073 Instruction *I) { 1074 dbgs() << "LV: Not vectorizing: " << DebugMsg; 1075 if (I != nullptr) 1076 dbgs() << " " << *I; 1077 else 1078 dbgs() << '.'; 1079 dbgs() << '\n'; 1080 } 1081 #endif 1082 1083 /// Create an analysis remark that explains why vectorization failed 1084 /// 1085 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 1086 /// RemarkName is the identifier for the remark. If \p I is passed it is an 1087 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for 1088 /// the location of the remark. \return the remark object that can be 1089 /// streamed to. 1090 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, 1091 StringRef RemarkName, Loop *TheLoop, Instruction *I) { 1092 Value *CodeRegion = TheLoop->getHeader(); 1093 DebugLoc DL = TheLoop->getStartLoc(); 1094 1095 if (I) { 1096 CodeRegion = I->getParent(); 1097 // If there is no debug location attached to the instruction, revert back to 1098 // using the loop's. 1099 if (I->getDebugLoc()) 1100 DL = I->getDebugLoc(); 1101 } 1102 1103 OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); 1104 R << "loop not vectorized: "; 1105 return R; 1106 } 1107 1108 /// Return a value for Step multiplied by VF. 
1109 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) { 1110 assert(isa<ConstantInt>(Step) && "Expected an integer step"); 1111 Constant *StepVal = ConstantInt::get( 1112 Step->getType(), 1113 cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue()); 1114 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; 1115 } 1116 1117 namespace llvm { 1118 1119 /// Return the runtime value for VF. 1120 Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { 1121 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); 1122 return VF.isScalable() ? B.CreateVScale(EC) : EC; 1123 } 1124 1125 void reportVectorizationFailure(const StringRef DebugMsg, 1126 const StringRef OREMsg, const StringRef ORETag, 1127 OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { 1128 LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); 1129 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1130 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), 1131 ORETag, TheLoop, I) << OREMsg); 1132 } 1133 1134 } // end namespace llvm 1135 1136 #ifndef NDEBUG 1137 /// \return string containing a file name and a line # for the given loop. 1138 static std::string getDebugLocString(const Loop *L) { 1139 std::string Result; 1140 if (L) { 1141 raw_string_ostream OS(Result); 1142 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 1143 LoopDbgLoc.print(OS); 1144 else 1145 // Just print the module name. 1146 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 1147 OS.flush(); 1148 } 1149 return Result; 1150 } 1151 #endif 1152 1153 void InnerLoopVectorizer::addNewMetadata(Instruction *To, 1154 const Instruction *Orig) { 1155 // If the loop was versioned with memchecks, add the corresponding no-alias 1156 // metadata. 1157 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) 1158 LVer->annotateInstWithNoAlias(To, Orig); 1159 } 1160 1161 void InnerLoopVectorizer::addMetadata(Instruction *To, 1162 Instruction *From) { 1163 propagateMetadata(To, From); 1164 addNewMetadata(To, From); 1165 } 1166 1167 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, 1168 Instruction *From) { 1169 for (Value *V : To) { 1170 if (Instruction *I = dyn_cast<Instruction>(V)) 1171 addMetadata(I, From); 1172 } 1173 } 1174 1175 namespace llvm { 1176 1177 // Loop vectorization cost-model hints how the scalar epilogue loop should be 1178 // lowered. 1179 enum ScalarEpilogueLowering { 1180 1181 // The default: allowing scalar epilogues. 1182 CM_ScalarEpilogueAllowed, 1183 1184 // Vectorization with OptForSize: don't allow epilogues. 1185 CM_ScalarEpilogueNotAllowedOptSize, 1186 1187 // A special case of vectorisation with OptForSize: loops with a very small 1188 // trip count are considered for vectorization under OptForSize, thereby 1189 // making sure the cost of their loop body is dominant, free of runtime 1190 // guards and scalar iteration overheads. 1191 CM_ScalarEpilogueNotAllowedLowTripLoop, 1192 1193 // Loop hint predicate indicating an epilogue is undesired. 1194 CM_ScalarEpilogueNotNeededUsePredicate, 1195 1196 // Directive indicating we must either tail fold or not vectorize 1197 CM_ScalarEpilogueNotAllowedUsePredicate 1198 }; 1199 1200 /// LoopVectorizationCostModel - estimates the expected speedups due to 1201 /// vectorization. 1202 /// In many cases vectorization is not profitable. This can happen because of 1203 /// a number of reasons. 
In this class we mainly attempt to predict the 1204 /// expected speedup/slowdowns due to the supported instruction set. We use the 1205 /// TargetTransformInfo to query the different backends for the cost of 1206 /// different operations. 1207 class LoopVectorizationCostModel { 1208 public: 1209 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 1210 PredicatedScalarEvolution &PSE, LoopInfo *LI, 1211 LoopVectorizationLegality *Legal, 1212 const TargetTransformInfo &TTI, 1213 const TargetLibraryInfo *TLI, DemandedBits *DB, 1214 AssumptionCache *AC, 1215 OptimizationRemarkEmitter *ORE, const Function *F, 1216 const LoopVectorizeHints *Hints, 1217 InterleavedAccessInfo &IAI) 1218 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 1219 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 1220 Hints(Hints), InterleaveInfo(IAI) {} 1221 1222 /// \return An upper bound for the vectorization factor, or None if 1223 /// vectorization and interleaving should be avoided up front. 1224 Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC); 1225 1226 /// \return True if runtime checks are required for vectorization, and false 1227 /// otherwise. 1228 bool runtimeChecksRequired(); 1229 1230 /// \return The most profitable vectorization factor and the cost of that VF. 1231 /// This method checks every power of two up to MaxVF. If UserVF is not ZERO 1232 /// then this vectorization factor will be selected if vectorization is 1233 /// possible. 1234 VectorizationFactor selectVectorizationFactor(ElementCount MaxVF); 1235 VectorizationFactor 1236 selectEpilogueVectorizationFactor(const ElementCount MaxVF, 1237 const LoopVectorizationPlanner &LVP); 1238 1239 /// Setup cost-based decisions for user vectorization factor. 1240 void selectUserVectorizationFactor(ElementCount UserVF) { 1241 collectUniformsAndScalars(UserVF); 1242 collectInstsToScalarize(UserVF); 1243 } 1244 1245 /// \return The size (in bits) of the smallest and widest types in the code 1246 /// that needs to be vectorized. We ignore values that remain scalar such as 1247 /// 64 bit loop indices. 1248 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1249 1250 /// \return The desired interleave count. 1251 /// If interleave count has been specified by metadata it will be returned. 1252 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1253 /// are the selected vectorization factor and the cost of the selected VF. 1254 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); 1255 1256 /// Memory access instruction may be vectorized in more than one way. 1257 /// Form of instruction after vectorization depends on cost. 1258 /// This function takes cost-based decisions for Load/Store instructions 1259 /// and collects them in a map. This decisions map is used for building 1260 /// the lists of loop-uniform and loop-scalar instructions. 1261 /// The calculated cost is saved with widening decision in order to 1262 /// avoid redundant calculations. 1263 void setCostBasedWideningDecision(ElementCount VF); 1264 1265 /// A struct that represents some properties of the register usage 1266 /// of a loop. 1267 struct RegisterUsage { 1268 /// Holds the number of loop invariant values that are used in the loop. 1269 /// The key is ClassID of target-provided register class. 1270 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1271 /// Holds the maximum number of concurrent live intervals in the loop. 
1272 /// The key is ClassID of target-provided register class. 1273 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1274 }; 1275 1276 /// \return Returns information about the register usages of the loop for the 1277 /// given vectorization factors. 1278 SmallVector<RegisterUsage, 8> 1279 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1280 1281 /// Collect values we want to ignore in the cost model. 1282 void collectValuesToIgnore(); 1283 1284 /// Split reductions into those that happen in the loop, and those that happen 1285 /// outside. In loop reductions are collected into InLoopReductionChains. 1286 void collectInLoopReductions(); 1287 1288 /// \returns The smallest bitwidth each instruction can be represented with. 1289 /// The vector equivalents of these instructions should be truncated to this 1290 /// type. 1291 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1292 return MinBWs; 1293 } 1294 1295 /// \returns True if it is more profitable to scalarize instruction \p I for 1296 /// vectorization factor \p VF. 1297 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1298 assert(VF.isVector() && 1299 "Profitable to scalarize relevant only for VF > 1."); 1300 1301 // Cost model is not run in the VPlan-native path - return conservative 1302 // result until this changes. 1303 if (EnableVPlanNativePath) 1304 return false; 1305 1306 auto Scalars = InstsToScalarize.find(VF); 1307 assert(Scalars != InstsToScalarize.end() && 1308 "VF not yet analyzed for scalarization profitability"); 1309 return Scalars->second.find(I) != Scalars->second.end(); 1310 } 1311 1312 /// Returns true if \p I is known to be uniform after vectorization. 1313 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1314 if (VF.isScalar()) 1315 return true; 1316 1317 // Cost model is not run in the VPlan-native path - return conservative 1318 // result until this changes. 1319 if (EnableVPlanNativePath) 1320 return false; 1321 1322 auto UniformsPerVF = Uniforms.find(VF); 1323 assert(UniformsPerVF != Uniforms.end() && 1324 "VF not yet analyzed for uniformity"); 1325 return UniformsPerVF->second.count(I); 1326 } 1327 1328 /// Returns true if \p I is known to be scalar after vectorization. 1329 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1330 if (VF.isScalar()) 1331 return true; 1332 1333 // Cost model is not run in the VPlan-native path - return conservative 1334 // result until this changes. 1335 if (EnableVPlanNativePath) 1336 return false; 1337 1338 auto ScalarsPerVF = Scalars.find(VF); 1339 assert(ScalarsPerVF != Scalars.end() && 1340 "Scalar values are not calculated for VF"); 1341 return ScalarsPerVF->second.count(I); 1342 } 1343 1344 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1345 /// for vectorization factor \p VF. 1346 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1347 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1348 !isProfitableToScalarize(I, VF) && 1349 !isScalarAfterVectorization(I, VF); 1350 } 1351 1352 /// Decision that was taken during cost calculation for memory instruction. 1353 enum InstWidening { 1354 CM_Unknown, 1355 CM_Widen, // For consecutive accesses with stride +1. 1356 CM_Widen_Reverse, // For consecutive accesses with stride -1. 
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
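    // As an illustrative sketch only (the IR below is hand-written, not taken
    // from a test): given a primary i64 induction and a user that only needs
    // 32 bits,
    //   %iv = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
    //   %t  = trunc i64 %iv to i32
    // the truncate can be removed by introducing an additional i32 induction
    // variable that is stepped in lock-step with %iv, so the users of %t read
    // the narrow IV directly.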
1433 Value *Op = Trunc->getOperand(0); 1434 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1435 return false; 1436 1437 // If the truncated value is not an induction variable, return false. 1438 return Legal->isInductionPhi(Op); 1439 } 1440 1441 /// Collects the instructions to scalarize for each predicated instruction in 1442 /// the loop. 1443 void collectInstsToScalarize(ElementCount VF); 1444 1445 /// Collect Uniform and Scalar values for the given \p VF. 1446 /// The sets depend on CM decision for Load/Store instructions 1447 /// that may be vectorized as interleave, gather-scatter or scalarized. 1448 void collectUniformsAndScalars(ElementCount VF) { 1449 // Do the analysis once. 1450 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1451 return; 1452 setCostBasedWideningDecision(VF); 1453 collectLoopUniforms(VF); 1454 collectLoopScalars(VF); 1455 } 1456 1457 /// Returns true if the target machine supports masked store operation 1458 /// for the given \p DataType and kind of access to \p Ptr. 1459 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1460 return Legal->isConsecutivePtr(Ptr) && 1461 TTI.isLegalMaskedStore(DataType, Alignment); 1462 } 1463 1464 /// Returns true if the target machine supports masked load operation 1465 /// for the given \p DataType and kind of access to \p Ptr. 1466 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1467 return Legal->isConsecutivePtr(Ptr) && 1468 TTI.isLegalMaskedLoad(DataType, Alignment); 1469 } 1470 1471 /// Returns true if the target machine supports masked scatter operation 1472 /// for the given \p DataType. 1473 bool isLegalMaskedScatter(Type *DataType, Align Alignment) const { 1474 return TTI.isLegalMaskedScatter(DataType, Alignment); 1475 } 1476 1477 /// Returns true if the target machine supports masked gather operation 1478 /// for the given \p DataType. 1479 bool isLegalMaskedGather(Type *DataType, Align Alignment) const { 1480 return TTI.isLegalMaskedGather(DataType, Alignment); 1481 } 1482 1483 /// Returns true if the target machine can represent \p V as a masked gather 1484 /// or scatter operation. 1485 bool isLegalGatherOrScatter(Value *V) { 1486 bool LI = isa<LoadInst>(V); 1487 bool SI = isa<StoreInst>(V); 1488 if (!LI && !SI) 1489 return false; 1490 auto *Ty = getMemInstValueType(V); 1491 Align Align = getLoadStoreAlignment(V); 1492 return (LI && isLegalMaskedGather(Ty, Align)) || 1493 (SI && isLegalMaskedScatter(Ty, Align)); 1494 } 1495 1496 /// Returns true if the target machine supports all of the reduction 1497 /// variables found for the given VF. 1498 bool canVectorizeReductions(ElementCount VF) { 1499 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1500 RecurrenceDescriptor RdxDesc = Reduction.second; 1501 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1502 })); 1503 } 1504 1505 /// Returns true if \p I is an instruction that will be scalarized with 1506 /// predication. Such instructions include conditional stores and 1507 /// instructions that may divide by zero. 1508 /// If a non-zero VF has been calculated, we check if I will be scalarized 1509 /// predication for that VF. 1510 bool 1511 isScalarWithPredication(Instruction *I, 1512 ElementCount VF = ElementCount::getFixed(1)) const; 1513 1514 // Returns true if \p I is an instruction that will be predicated either 1515 // through scalar predication or masked load/store or masked gather/scatter. 
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue() const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    return InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the loop tail.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized - i.e. either the vector version isn't available, or it is
  /// too expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, a power-of-2 larger
  /// than zero. One is returned if vectorization should best be avoided due
  /// to cost.
  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
                                    ElementCount UserVF);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(ElementCount VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
                                          Type *VectorTy,
                                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// A Map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// vplan. This was added to allow quick lookup to the inloop operations,
  /// without having to loop through InLoopReductionChains.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(ElementCount VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. Non-uniform scalarized instructions will be
  /// represented by VF values in the vectorized loop, each corresponding to an
  /// iteration of the original scalar loop.
  void collectLoopScalars(ElementCount VF);

  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                                std::pair<InstWidening, InstructionCost>>;

  DecisionList WideningDecisions;

  /// Returns true if \p V is expected to be vectorized and it needs to be
  /// extracted.
  bool needsExtract(Value *V, ElementCount VF) const {
    Instruction *I = dyn_cast<Instruction>(V);
    if (VF.isScalar() || !I || !TheLoop->contains(I) ||
        TheLoop->isLoopInvariant(I))
      return false;

    // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen, because it is called
    // via getScalarizationOverhead from setCostBasedWideningDecision, before
    // the scalars are collected. That should be a safe assumption in most
    // cases, because we check if the operands have vectorizable types
    // beforehand in LoopVectorizationLegality.
    return Scalars.find(VF) == Scalars.end() ||
           !isScalarAfterVectorization(I, VF);
  };

  /// Returns a range containing only operands needing to be extracted.
  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
                                                   ElementCount VF) const {
    return SmallVector<Value *, 4>(make_filter_range(
        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
  }

  /// Determines if we have the infrastructure to vectorize loop \p L and its
  /// epilogue, assuming the main loop is vectorized by \p VF.
  bool isCandidateForEpilogueVectorization(const Loop &L,
                                           const ElementCount VF) const;

  /// Returns true if epilogue vectorization is considered profitable, and
  /// false otherwise.
  /// \p VF is the vectorization factor chosen for the original loop.
  bool isEpilogueVectorizationProfitable(const ElementCount VF) const;

public:
  /// The loop that we evaluate.
1811 Loop *TheLoop; 1812 1813 /// Predicated scalar evolution analysis. 1814 PredicatedScalarEvolution &PSE; 1815 1816 /// Loop Info analysis. 1817 LoopInfo *LI; 1818 1819 /// Vectorization legality. 1820 LoopVectorizationLegality *Legal; 1821 1822 /// Vector target information. 1823 const TargetTransformInfo &TTI; 1824 1825 /// Target Library Info. 1826 const TargetLibraryInfo *TLI; 1827 1828 /// Demanded bits analysis. 1829 DemandedBits *DB; 1830 1831 /// Assumption cache. 1832 AssumptionCache *AC; 1833 1834 /// Interface to emit optimization remarks. 1835 OptimizationRemarkEmitter *ORE; 1836 1837 const Function *TheFunction; 1838 1839 /// Loop Vectorize Hint. 1840 const LoopVectorizeHints *Hints; 1841 1842 /// The interleave access information contains groups of interleaved accesses 1843 /// with the same stride and close to each other. 1844 InterleavedAccessInfo &InterleaveInfo; 1845 1846 /// Values to ignore in the cost model. 1847 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1848 1849 /// Values to ignore in the cost model when VF > 1. 1850 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1851 1852 /// Profitable vector factors. 1853 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1854 }; 1855 } // end namespace llvm 1856 1857 /// Helper struct to manage generating runtime checks for vectorization. 1858 /// 1859 /// The runtime checks are created up-front in temporary blocks to allow better 1860 /// estimating the cost and un-linked from the existing IR. After deciding to 1861 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1862 /// temporary blocks are completely removed. 1863 class GeneratedRTChecks { 1864 /// Basic block which contains the generated SCEV checks, if any. 1865 BasicBlock *SCEVCheckBlock = nullptr; 1866 1867 /// The value representing the result of the generated SCEV checks. If it is 1868 /// nullptr, either no SCEV checks have been generated or they have been used. 1869 Value *SCEVCheckCond = nullptr; 1870 1871 /// Basic block which contains the generated memory runtime checks, if any. 1872 BasicBlock *MemCheckBlock = nullptr; 1873 1874 /// The value representing the result of the generated memory runtime checks. 1875 /// If it is nullptr, either no memory runtime checks have been generated or 1876 /// they have been used. 1877 Instruction *MemRuntimeCheckCond = nullptr; 1878 1879 DominatorTree *DT; 1880 LoopInfo *LI; 1881 1882 SCEVExpander SCEVExp; 1883 SCEVExpander MemCheckExp; 1884 1885 public: 1886 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1887 const DataLayout &DL) 1888 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), 1889 MemCheckExp(SE, DL, "scev.check") {} 1890 1891 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1892 /// accurately estimate the cost of the runtime checks. The blocks are 1893 /// un-linked from the IR and is added back during vector code generation. If 1894 /// there is no vector code generation, the check blocks are removed 1895 /// completely. 1896 void Create(Loop *L, const LoopAccessInfo &LAI, 1897 const SCEVUnionPredicate &UnionPred) { 1898 1899 BasicBlock *LoopHeader = L->getHeader(); 1900 BasicBlock *Preheader = L->getLoopPreheader(); 1901 1902 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1903 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1904 // may be used by SCEVExpander. The blocks will be un-linked from their 1905 // predecessors and removed from LI & DT at the end of the function. 
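    // As a rough sketch (assuming both kinds of checks are needed), the
    // temporary layout built below is:
    //   preheader -> vector.scevcheck -> vector.memcheck -> loop header
    // Both check blocks are unhooked again further down, so until the final
    // decision to vectorize is made they only exist for cost estimation.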
1906 if (!UnionPred.isAlwaysTrue()) { 1907 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1908 nullptr, "vector.scevcheck"); 1909 1910 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1911 &UnionPred, SCEVCheckBlock->getTerminator()); 1912 } 1913 1914 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1915 if (RtPtrChecking.Need) { 1916 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1917 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1918 "vector.memcheck"); 1919 1920 std::tie(std::ignore, MemRuntimeCheckCond) = 1921 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1922 RtPtrChecking.getChecks(), MemCheckExp); 1923 assert(MemRuntimeCheckCond && 1924 "no RT checks generated although RtPtrChecking " 1925 "claimed checks are required"); 1926 } 1927 1928 if (!MemCheckBlock && !SCEVCheckBlock) 1929 return; 1930 1931 // Unhook the temporary block with the checks, update various places 1932 // accordingly. 1933 if (SCEVCheckBlock) 1934 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1935 if (MemCheckBlock) 1936 MemCheckBlock->replaceAllUsesWith(Preheader); 1937 1938 if (SCEVCheckBlock) { 1939 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1940 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1941 Preheader->getTerminator()->eraseFromParent(); 1942 } 1943 if (MemCheckBlock) { 1944 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1945 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1946 Preheader->getTerminator()->eraseFromParent(); 1947 } 1948 1949 DT->changeImmediateDominator(LoopHeader, Preheader); 1950 if (MemCheckBlock) { 1951 DT->eraseNode(MemCheckBlock); 1952 LI->removeBlock(MemCheckBlock); 1953 } 1954 if (SCEVCheckBlock) { 1955 DT->eraseNode(SCEVCheckBlock); 1956 LI->removeBlock(SCEVCheckBlock); 1957 } 1958 } 1959 1960 /// Remove the created SCEV & memory runtime check blocks & instructions, if 1961 /// unused. 1962 ~GeneratedRTChecks() { 1963 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 1964 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 1965 if (!SCEVCheckCond) 1966 SCEVCleaner.markResultUsed(); 1967 1968 if (!MemRuntimeCheckCond) 1969 MemCheckCleaner.markResultUsed(); 1970 1971 if (MemRuntimeCheckCond) { 1972 auto &SE = *MemCheckExp.getSE(); 1973 // Memory runtime check generation creates compares that use expanded 1974 // values. Remove them before running the SCEVExpanderCleaners. 1975 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 1976 if (MemCheckExp.isInsertedInstruction(&I)) 1977 continue; 1978 SE.forgetValue(&I); 1979 SE.eraseValueFromMap(&I); 1980 I.eraseFromParent(); 1981 } 1982 } 1983 MemCheckCleaner.cleanup(); 1984 SCEVCleaner.cleanup(); 1985 1986 if (SCEVCheckCond) 1987 SCEVCheckBlock->eraseFromParent(); 1988 if (MemRuntimeCheckCond) 1989 MemCheckBlock->eraseFromParent(); 1990 } 1991 1992 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 1993 /// adjusts the branches to branch to the vector preheader or \p Bypass, 1994 /// depending on the generated condition. 
1995 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 1996 BasicBlock *LoopVectorPreHeader, 1997 BasicBlock *LoopExitBlock) { 1998 if (!SCEVCheckCond) 1999 return nullptr; 2000 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2001 if (C->isZero()) 2002 return nullptr; 2003 2004 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2005 2006 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2007 // Create new preheader for vector loop. 2008 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2009 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2010 2011 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2012 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2013 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2014 SCEVCheckBlock); 2015 2016 DT->addNewBlock(SCEVCheckBlock, Pred); 2017 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2018 2019 ReplaceInstWithInst( 2020 SCEVCheckBlock->getTerminator(), 2021 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2022 // Mark the check as used, to prevent it from being removed during cleanup. 2023 SCEVCheckCond = nullptr; 2024 return SCEVCheckBlock; 2025 } 2026 2027 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2028 /// the branches to branch to the vector preheader or \p Bypass, depending on 2029 /// the generated condition. 2030 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2031 BasicBlock *LoopVectorPreHeader) { 2032 // Check if we generated code that checks in runtime if arrays overlap. 2033 if (!MemRuntimeCheckCond) 2034 return nullptr; 2035 2036 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2037 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2038 MemCheckBlock); 2039 2040 DT->addNewBlock(MemCheckBlock, Pred); 2041 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2042 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2043 2044 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2045 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2046 2047 ReplaceInstWithInst( 2048 MemCheckBlock->getTerminator(), 2049 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2050 MemCheckBlock->getTerminator()->setDebugLoc( 2051 Pred->getTerminator()->getDebugLoc()); 2052 2053 // Mark the check as used, to prevent it from being removed during cleanup. 2054 MemRuntimeCheckCond = nullptr; 2055 return MemCheckBlock; 2056 } 2057 }; 2058 2059 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2060 // vectorization. The loop needs to be annotated with #pragma omp simd 2061 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2062 // vector length information is not provided, vectorization is not considered 2063 // explicit. Interleave hints are not allowed either. These limitations will be 2064 // relaxed in the future. 2065 // Please, note that we are currently forced to abuse the pragma 'clang 2066 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2067 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2068 // provides *explicit vectorization hints* (LV can bypass legal checks and 2069 // assume that vectorization is legal). However, both hints are implemented 2070 // using the same metadata (llvm.loop.vectorize, processed by 2071 // LoopVectorizeHints). This will be fixed in the future when the native IR 2072 // representation for pragma 'omp simd' is introduced. 
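// For illustration only (this snippet is not taken from an existing test), an
// outer loop that isExplicitVecOuterLoop would accept looks like:
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)     // explicitly annotated outer loop
//     for (int j = 0; j < M; ++j)   // inner loop
//       A[i][j] += B[i][j];
// Without a vector length (vectorize_width or 'omp simd simdlen'), the hint is
// not treated as an explicit vectorization request and the loop is skipped.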
2073 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2074 OptimizationRemarkEmitter *ORE) { 2075 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2076 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2077 2078 // Only outer loops with an explicit vectorization hint are supported. 2079 // Unannotated outer loops are ignored. 2080 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2081 return false; 2082 2083 Function *Fn = OuterLp->getHeader()->getParent(); 2084 if (!Hints.allowVectorization(Fn, OuterLp, 2085 true /*VectorizeOnlyWhenForced*/)) { 2086 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2087 return false; 2088 } 2089 2090 if (Hints.getInterleave() > 1) { 2091 // TODO: Interleave support is future work. 2092 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2093 "outer loops.\n"); 2094 Hints.emitRemarkWithHints(); 2095 return false; 2096 } 2097 2098 return true; 2099 } 2100 2101 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2102 OptimizationRemarkEmitter *ORE, 2103 SmallVectorImpl<Loop *> &V) { 2104 // Collect inner loops and outer loops without irreducible control flow. For 2105 // now, only collect outer loops that have explicit vectorization hints. If we 2106 // are stress testing the VPlan H-CFG construction, we collect the outermost 2107 // loop of every loop nest. 2108 if (L.isInnermost() || VPlanBuildStressTest || 2109 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2110 LoopBlocksRPO RPOT(&L); 2111 RPOT.perform(LI); 2112 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2113 V.push_back(&L); 2114 // TODO: Collect inner loops inside marked outer loops in case 2115 // vectorization fails for the outer loop. Do not invoke 2116 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2117 // already known to be reducible. We can use an inherited attribute for 2118 // that. 2119 return; 2120 } 2121 } 2122 for (Loop *InnerL : L) 2123 collectSupportedLoops(*InnerL, LI, ORE, V); 2124 } 2125 2126 namespace { 2127 2128 /// The LoopVectorize Pass. 2129 struct LoopVectorize : public FunctionPass { 2130 /// Pass identification, replacement for typeid 2131 static char ID; 2132 2133 LoopVectorizePass Impl; 2134 2135 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2136 bool VectorizeOnlyWhenForced = false) 2137 : FunctionPass(ID), 2138 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2139 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2140 } 2141 2142 bool runOnFunction(Function &F) override { 2143 if (skipFunction(F)) 2144 return false; 2145 2146 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2147 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2148 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2149 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2150 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2151 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2152 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2153 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2154 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2155 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2156 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2157 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2158 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2159 2160 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2161 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2162 2163 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2164 GetLAA, *ORE, PSI).MadeAnyChange; 2165 } 2166 2167 void getAnalysisUsage(AnalysisUsage &AU) const override { 2168 AU.addRequired<AssumptionCacheTracker>(); 2169 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2170 AU.addRequired<DominatorTreeWrapperPass>(); 2171 AU.addRequired<LoopInfoWrapperPass>(); 2172 AU.addRequired<ScalarEvolutionWrapperPass>(); 2173 AU.addRequired<TargetTransformInfoWrapperPass>(); 2174 AU.addRequired<AAResultsWrapperPass>(); 2175 AU.addRequired<LoopAccessLegacyAnalysis>(); 2176 AU.addRequired<DemandedBitsWrapperPass>(); 2177 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2178 AU.addRequired<InjectTLIMappingsLegacy>(); 2179 2180 // We currently do not preserve loopinfo/dominator analyses with outer loop 2181 // vectorization. Until this is addressed, mark these analyses as preserved 2182 // only for non-VPlan-native path. 2183 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2184 if (!EnableVPlanNativePath) { 2185 AU.addPreserved<LoopInfoWrapperPass>(); 2186 AU.addPreserved<DominatorTreeWrapperPass>(); 2187 } 2188 2189 AU.addPreserved<BasicAAWrapperPass>(); 2190 AU.addPreserved<GlobalsAAWrapperPass>(); 2191 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2192 } 2193 }; 2194 2195 } // end anonymous namespace 2196 2197 //===----------------------------------------------------------------------===// 2198 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2199 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2200 //===----------------------------------------------------------------------===// 2201 2202 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2203 // We need to place the broadcast of invariant variables outside the loop, 2204 // but only if it's proven safe to do so. Else, broadcast will be inside 2205 // vector loop body. 2206 Instruction *Instr = dyn_cast<Instruction>(V); 2207 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2208 (!Instr || 2209 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2210 // Place the code for broadcasting invariant variables in the new preheader. 2211 IRBuilder<>::InsertPointGuard Guard(Builder); 2212 if (SafeToHoist) 2213 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2214 2215 // Broadcast the scalar into all locations in the vector. 
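  // For example (a sketch of what CreateVectorSplat typically expands to for a
  // fixed VF of 4; not emitted verbatim by this function):
  //   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
  //   %broadcast.splat       = shufflevector <4 x i32> %broadcast.splatinsert,
  //                              <4 x i32> poison, <4 x i32> zeroinitializer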
2216 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2217 2218 return Shuf; 2219 } 2220 2221 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2222 const InductionDescriptor &II, Value *Step, Value *Start, 2223 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2224 VPTransformState &State) { 2225 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2226 "Expected either an induction phi-node or a truncate of it!"); 2227 2228 // Construct the initial value of the vector IV in the vector loop preheader 2229 auto CurrIP = Builder.saveIP(); 2230 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2231 if (isa<TruncInst>(EntryVal)) { 2232 assert(Start->getType()->isIntegerTy() && 2233 "Truncation requires an integer type"); 2234 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2235 Step = Builder.CreateTrunc(Step, TruncType); 2236 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2237 } 2238 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2239 Value *SteppedStart = 2240 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2241 2242 // We create vector phi nodes for both integer and floating-point induction 2243 // variables. Here, we determine the kind of arithmetic we will perform. 2244 Instruction::BinaryOps AddOp; 2245 Instruction::BinaryOps MulOp; 2246 if (Step->getType()->isIntegerTy()) { 2247 AddOp = Instruction::Add; 2248 MulOp = Instruction::Mul; 2249 } else { 2250 AddOp = II.getInductionOpcode(); 2251 MulOp = Instruction::FMul; 2252 } 2253 2254 // Multiply the vectorization factor by the step using integer or 2255 // floating-point arithmetic as appropriate. 2256 Type *StepType = Step->getType(); 2257 if (Step->getType()->isFloatingPointTy()) 2258 StepType = IntegerType::get(StepType->getContext(), 2259 StepType->getScalarSizeInBits()); 2260 Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF); 2261 if (Step->getType()->isFloatingPointTy()) 2262 RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType()); 2263 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2264 2265 // Create a vector splat to use in the induction update. 2266 // 2267 // FIXME: If the step is non-constant, we create the vector splat with 2268 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2269 // handle a constant vector splat. 2270 Value *SplatVF = isa<Constant>(Mul) 2271 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2272 : Builder.CreateVectorSplat(VF, Mul); 2273 Builder.restoreIP(CurrIP); 2274 2275 // We may need to add the step a number of times, depending on the unroll 2276 // factor. The last of those goes into the PHI. 2277 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2278 &*LoopVectorBody->getFirstInsertionPt()); 2279 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2280 Instruction *LastInduction = VecInd; 2281 for (unsigned Part = 0; Part < UF; ++Part) { 2282 State.set(Def, LastInduction, Part); 2283 2284 if (isa<TruncInst>(EntryVal)) 2285 addMetadata(LastInduction, EntryVal); 2286 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2287 State, Part); 2288 2289 LastInduction = cast<Instruction>( 2290 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2291 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2292 } 2293 2294 // Move the last step to the end of the latch block. This ensures consistent 2295 // placement of all induction updates. 
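  // Illustrative shape of the result (fixed VF = 4, UF = 2, i32 IV with step
  // 1; register names are made up):
  //   %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %pre ],
  //                                 [ %vec.ind.next, %vector.body ]
  //   %step.add     = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  //   %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>
  // where %vec.ind.next is the last step, which is sunk to the latch below.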
2296 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2297 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2298 auto *ICmp = cast<Instruction>(Br->getCondition()); 2299 LastInduction->moveBefore(ICmp); 2300 LastInduction->setName("vec.ind.next"); 2301 2302 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2303 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2304 } 2305 2306 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2307 return Cost->isScalarAfterVectorization(I, VF) || 2308 Cost->isProfitableToScalarize(I, VF); 2309 } 2310 2311 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2312 if (shouldScalarizeInstruction(IV)) 2313 return true; 2314 auto isScalarInst = [&](User *U) -> bool { 2315 auto *I = cast<Instruction>(U); 2316 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2317 }; 2318 return llvm::any_of(IV->users(), isScalarInst); 2319 } 2320 2321 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2322 const InductionDescriptor &ID, const Instruction *EntryVal, 2323 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2324 unsigned Part, unsigned Lane) { 2325 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2326 "Expected either an induction phi-node or a truncate of it!"); 2327 2328 // This induction variable is not the phi from the original loop but the 2329 // newly-created IV based on the proof that casted Phi is equal to the 2330 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2331 // re-uses the same InductionDescriptor that original IV uses but we don't 2332 // have to do any recording in this case - that is done when original IV is 2333 // processed. 2334 if (isa<TruncInst>(EntryVal)) 2335 return; 2336 2337 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2338 if (Casts.empty()) 2339 return; 2340 // Only the first Cast instruction in the Casts vector is of interest. 2341 // The rest of the Casts (if exist) have no uses outside the 2342 // induction update chain itself. 2343 if (Lane < UINT_MAX) 2344 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2345 else 2346 State.set(CastDef, VectorLoopVal, Part); 2347 } 2348 2349 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2350 TruncInst *Trunc, VPValue *Def, 2351 VPValue *CastDef, 2352 VPTransformState &State) { 2353 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2354 "Primary induction variable must have an integer type"); 2355 2356 auto II = Legal->getInductionVars().find(IV); 2357 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2358 2359 auto ID = II->second; 2360 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2361 2362 // The value from the original loop to which we are mapping the new induction 2363 // variable. 2364 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2365 2366 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2367 2368 // Generate code for the induction step. 
Note that induction steps are 2369 // required to be loop-invariant 2370 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2371 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2372 "Induction step should be loop invariant"); 2373 if (PSE.getSE()->isSCEVable(IV->getType())) { 2374 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2375 return Exp.expandCodeFor(Step, Step->getType(), 2376 LoopVectorPreHeader->getTerminator()); 2377 } 2378 return cast<SCEVUnknown>(Step)->getValue(); 2379 }; 2380 2381 // The scalar value to broadcast. This is derived from the canonical 2382 // induction variable. If a truncation type is given, truncate the canonical 2383 // induction variable and step. Otherwise, derive these values from the 2384 // induction descriptor. 2385 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2386 Value *ScalarIV = Induction; 2387 if (IV != OldInduction) { 2388 ScalarIV = IV->getType()->isIntegerTy() 2389 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2390 : Builder.CreateCast(Instruction::SIToFP, Induction, 2391 IV->getType()); 2392 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2393 ScalarIV->setName("offset.idx"); 2394 } 2395 if (Trunc) { 2396 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2397 assert(Step->getType()->isIntegerTy() && 2398 "Truncation requires an integer step"); 2399 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2400 Step = Builder.CreateTrunc(Step, TruncType); 2401 } 2402 return ScalarIV; 2403 }; 2404 2405 // Create the vector values from the scalar IV, in the absence of creating a 2406 // vector IV. 2407 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2408 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2409 for (unsigned Part = 0; Part < UF; ++Part) { 2410 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2411 Value *EntryPart = 2412 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2413 ID.getInductionOpcode()); 2414 State.set(Def, EntryPart, Part); 2415 if (Trunc) 2416 addMetadata(EntryPart, Trunc); 2417 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2418 State, Part); 2419 } 2420 }; 2421 2422 // Fast-math-flags propagate from the original induction instruction. 2423 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2424 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2425 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2426 2427 // Now do the actual transformations, and start with creating the step value. 2428 Value *Step = CreateStepValue(ID.getStep()); 2429 if (VF.isZero() || VF.isScalar()) { 2430 Value *ScalarIV = CreateScalarIV(Step); 2431 CreateSplatIV(ScalarIV, Step); 2432 return; 2433 } 2434 2435 // Determine if we want a scalar version of the induction variable. This is 2436 // true if the induction variable itself is not widened, or if it has at 2437 // least one user in the loop that is not widened. 2438 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2439 if (!NeedsScalarIV) { 2440 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2441 State); 2442 return; 2443 } 2444 2445 // Try to create a new independent vector induction variable. If we can't 2446 // create the phi node, we will splat the scalar induction variable in each 2447 // loop iteration. 
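  // Informal summary of the remaining cases (NeedsScalarIV is true here):
  //  - EntryVal itself can still be widened: build the vector IV and, in
  //    addition, scalar steps for its scalar users (next block).
  //  - EntryVal must be scalarized: emit only the scalar IV and its steps,
  //    plus a splat when tail folding needs it for the masks (final block).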
2448 if (!shouldScalarizeInstruction(EntryVal)) { 2449 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2450 State); 2451 Value *ScalarIV = CreateScalarIV(Step); 2452 // Create scalar steps that can be used by instructions we will later 2453 // scalarize. Note that the addition of the scalar steps will not increase 2454 // the number of instructions in the loop in the common case prior to 2455 // InstCombine. We will be trading one vector extract for each scalar step. 2456 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2457 return; 2458 } 2459 2460 // All IV users are scalar instructions, so only emit a scalar IV, not a 2461 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2462 // predicate used by the masked loads/stores. 2463 Value *ScalarIV = CreateScalarIV(Step); 2464 if (!Cost->isScalarEpilogueAllowed()) 2465 CreateSplatIV(ScalarIV, Step); 2466 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2467 } 2468 2469 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2470 Instruction::BinaryOps BinOp) { 2471 // Create and check the types. 2472 auto *ValVTy = cast<VectorType>(Val->getType()); 2473 ElementCount VLen = ValVTy->getElementCount(); 2474 2475 Type *STy = Val->getType()->getScalarType(); 2476 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2477 "Induction Step must be an integer or FP"); 2478 assert(Step->getType() == STy && "Step has wrong type"); 2479 2480 SmallVector<Constant *, 8> Indices; 2481 2482 // Create a vector of consecutive numbers from zero to VF. 2483 VectorType *InitVecValVTy = ValVTy; 2484 Type *InitVecValSTy = STy; 2485 if (STy->isFloatingPointTy()) { 2486 InitVecValSTy = 2487 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2488 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2489 } 2490 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2491 2492 // Add on StartIdx 2493 Value *StartIdxSplat = Builder.CreateVectorSplat( 2494 VLen, ConstantInt::get(InitVecValSTy, StartIdx)); 2495 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2496 2497 if (STy->isIntegerTy()) { 2498 Step = Builder.CreateVectorSplat(VLen, Step); 2499 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2500 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2501 // which can be found from the original scalar operations. 2502 Step = Builder.CreateMul(InitVec, Step); 2503 return Builder.CreateAdd(Val, Step, "induction"); 2504 } 2505 2506 // Floating point induction. 2507 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2508 "Binary Opcode should be specified for FP induction"); 2509 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2510 Step = Builder.CreateVectorSplat(VLen, Step); 2511 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2512 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2513 } 2514 2515 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2516 Instruction *EntryVal, 2517 const InductionDescriptor &ID, 2518 VPValue *Def, VPValue *CastDef, 2519 VPTransformState &State) { 2520 // We shouldn't have to build scalar steps if we aren't vectorizing. 2521 assert(VF.isVector() && "VF should be greater than one"); 2522 // Get the value type and ensure it and the step have the same integer type. 
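  // Illustrative sketch (fixed VF = 4, UF = 1, integer IV %offset.idx with
  // step %s; names are made up): the per-lane values produced below are
  //   lane 0: %offset.idx + 0 * %s
  //   lane 1: %offset.idx + 1 * %s
  //   lane 2: %offset.idx + 2 * %s
  //   lane 3: %offset.idx + 3 * %s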
2523 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2524 assert(ScalarIVTy == Step->getType() && 2525 "Val and Step should have the same type"); 2526 2527 // We build scalar steps for both integer and floating-point induction 2528 // variables. Here, we determine the kind of arithmetic we will perform. 2529 Instruction::BinaryOps AddOp; 2530 Instruction::BinaryOps MulOp; 2531 if (ScalarIVTy->isIntegerTy()) { 2532 AddOp = Instruction::Add; 2533 MulOp = Instruction::Mul; 2534 } else { 2535 AddOp = ID.getInductionOpcode(); 2536 MulOp = Instruction::FMul; 2537 } 2538 2539 // Determine the number of scalars we need to generate for each unroll 2540 // iteration. If EntryVal is uniform, we only need to generate the first 2541 // lane. Otherwise, we generate all VF values. 2542 bool IsUniform = 2543 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); 2544 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); 2545 // Compute the scalar steps and save the results in State. 2546 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2547 ScalarIVTy->getScalarSizeInBits()); 2548 Type *VecIVTy = nullptr; 2549 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2550 if (!IsUniform && VF.isScalable()) { 2551 VecIVTy = VectorType::get(ScalarIVTy, VF); 2552 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); 2553 SplatStep = Builder.CreateVectorSplat(VF, Step); 2554 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); 2555 } 2556 2557 for (unsigned Part = 0; Part < UF; ++Part) { 2558 Value *StartIdx0 = 2559 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2560 2561 if (!IsUniform && VF.isScalable()) { 2562 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); 2563 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2564 if (ScalarIVTy->isFloatingPointTy()) 2565 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2566 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2567 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2568 State.set(Def, Add, Part); 2569 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2570 Part); 2571 // It's useful to record the lane values too for the known minimum number 2572 // of elements so we do those below. This improves the code quality when 2573 // trying to extract the first element, for example. 2574 } 2575 2576 if (ScalarIVTy->isFloatingPointTy()) 2577 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2578 2579 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2580 Value *StartIdx = Builder.CreateBinOp( 2581 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2582 // The step returned by `createStepForVF` is a runtime-evaluated value 2583 // when VF is scalable. Otherwise, it should be folded into a Constant. 
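      // E.g. (illustrative): with a fixed VF of 4, Part 1 and Lane 2, StartIdx
      // folds to the constant 6; with a scalable VF the same value stays a
      // runtime expression along the lines of (vscale * 4) * 1 + 2.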
2584 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2585 "Expected StartIdx to be folded to a constant when VF is not " 2586 "scalable"); 2587 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2588 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2589 State.set(Def, Add, VPIteration(Part, Lane)); 2590 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2591 Part, Lane); 2592 } 2593 } 2594 } 2595 2596 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2597 const VPIteration &Instance, 2598 VPTransformState &State) { 2599 Value *ScalarInst = State.get(Def, Instance); 2600 Value *VectorValue = State.get(Def, Instance.Part); 2601 VectorValue = Builder.CreateInsertElement( 2602 VectorValue, ScalarInst, 2603 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2604 State.set(Def, VectorValue, Instance.Part); 2605 } 2606 2607 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2608 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2609 return Builder.CreateVectorReverse(Vec, "reverse"); 2610 } 2611 2612 // Return whether we allow using masked interleave-groups (for dealing with 2613 // strided loads/stores that reside in predicated blocks, or for dealing 2614 // with gaps). 2615 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2616 // If an override option has been passed in for interleaved accesses, use it. 2617 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2618 return EnableMaskedInterleavedMemAccesses; 2619 2620 return TTI.enableMaskedInterleavedAccessVectorization(); 2621 } 2622 2623 // Try to vectorize the interleave group that \p Instr belongs to. 2624 // 2625 // E.g. Translate following interleaved load group (factor = 3): 2626 // for (i = 0; i < N; i+=3) { 2627 // R = Pic[i]; // Member of index 0 2628 // G = Pic[i+1]; // Member of index 1 2629 // B = Pic[i+2]; // Member of index 2 2630 // ... // do something to R, G, B 2631 // } 2632 // To: 2633 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2634 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2635 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2636 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2637 // 2638 // Or translate following interleaved store group (factor = 3): 2639 // for (i = 0; i < N; i+=3) { 2640 // ... do something to R, G, B 2641 // Pic[i] = R; // Member of index 0 2642 // Pic[i+1] = G; // Member of index 1 2643 // Pic[i+2] = B; // Member of index 2 2644 // } 2645 // To: 2646 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2647 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2648 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2649 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2650 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2651 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2652 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2653 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2654 VPValue *BlockInMask) { 2655 Instruction *Instr = Group->getInsertPos(); 2656 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2657 2658 // Prepare for the vector type of the interleaved load/store. 
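  // E.g. (fixed-width sketch, matching the example above the function): a
  // factor-3 group of i32 accesses at VF = 4 is loaded or stored through a
  // single <12 x i32> wide vector.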
2659 Type *ScalarTy = getMemInstValueType(Instr); 2660 unsigned InterleaveFactor = Group->getFactor(); 2661 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2662 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2663 2664 // Prepare for the new pointers. 2665 SmallVector<Value *, 2> AddrParts; 2666 unsigned Index = Group->getIndex(Instr); 2667 2668 // TODO: extend the masked interleaved-group support to reversed access. 2669 assert((!BlockInMask || !Group->isReverse()) && 2670 "Reversed masked interleave-group not supported."); 2671 2672 // If the group is reverse, adjust the index to refer to the last vector lane 2673 // instead of the first. We adjust the index from the first vector lane, 2674 // rather than directly getting the pointer for lane VF - 1, because the 2675 // pointer operand of the interleaved access is supposed to be uniform. For 2676 // uniform instructions, we're only required to generate a value for the 2677 // first vector lane in each unroll iteration. 2678 if (Group->isReverse()) 2679 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2680 2681 for (unsigned Part = 0; Part < UF; Part++) { 2682 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2683 setDebugLocFromInst(Builder, AddrPart); 2684 2685 // Notice current instruction could be any index. Need to adjust the address 2686 // to the member of index 0. 2687 // 2688 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2689 // b = A[i]; // Member of index 0 2690 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2691 // 2692 // E.g. A[i+1] = a; // Member of index 1 2693 // A[i] = b; // Member of index 0 2694 // A[i+2] = c; // Member of index 2 (Current instruction) 2695 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2696 2697 bool InBounds = false; 2698 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2699 InBounds = gep->isInBounds(); 2700 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2701 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2702 2703 // Cast to the vector pointer type. 2704 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2705 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2706 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2707 } 2708 2709 setDebugLocFromInst(Builder, Instr); 2710 Value *PoisonVec = PoisonValue::get(VecTy); 2711 2712 Value *MaskForGaps = nullptr; 2713 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2714 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2715 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2716 } 2717 2718 // Vectorize the interleaved load group. 2719 if (isa<LoadInst>(Instr)) { 2720 // For each unroll part, create a wide load for the group. 2721 SmallVector<Value *, 2> NewLoads; 2722 for (unsigned Part = 0; Part < UF; Part++) { 2723 Instruction *NewLoad; 2724 if (BlockInMask || MaskForGaps) { 2725 assert(useMaskedInterleavedAccesses(*TTI) && 2726 "masked interleaved groups are not allowed."); 2727 Value *GroupMask = MaskForGaps; 2728 if (BlockInMask) { 2729 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2730 Value *ShuffledMask = Builder.CreateShuffleVector( 2731 BlockInMaskPart, 2732 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2733 "interleaved.mask"); 2734 GroupMask = MaskForGaps 2735 ? 
Builder.CreateBinOp(Instruction::And, ShuffledMask, 2736 MaskForGaps) 2737 : ShuffledMask; 2738 } 2739 NewLoad = 2740 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2741 GroupMask, PoisonVec, "wide.masked.vec"); 2742 } 2743 else 2744 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2745 Group->getAlign(), "wide.vec"); 2746 Group->addMetadata(NewLoad); 2747 NewLoads.push_back(NewLoad); 2748 } 2749 2750 // For each member in the group, shuffle out the appropriate data from the 2751 // wide loads. 2752 unsigned J = 0; 2753 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2754 Instruction *Member = Group->getMember(I); 2755 2756 // Skip the gaps in the group. 2757 if (!Member) 2758 continue; 2759 2760 auto StrideMask = 2761 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2762 for (unsigned Part = 0; Part < UF; Part++) { 2763 Value *StridedVec = Builder.CreateShuffleVector( 2764 NewLoads[Part], StrideMask, "strided.vec"); 2765 2766 // If this member has different type, cast the result type. 2767 if (Member->getType() != ScalarTy) { 2768 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2769 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2770 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2771 } 2772 2773 if (Group->isReverse()) 2774 StridedVec = reverseVector(StridedVec); 2775 2776 State.set(VPDefs[J], StridedVec, Part); 2777 } 2778 ++J; 2779 } 2780 return; 2781 } 2782 2783 // The sub vector type for current instruction. 2784 auto *SubVT = VectorType::get(ScalarTy, VF); 2785 2786 // Vectorize the interleaved store group. 2787 for (unsigned Part = 0; Part < UF; Part++) { 2788 // Collect the stored vector from each member. 2789 SmallVector<Value *, 4> StoredVecs; 2790 for (unsigned i = 0; i < InterleaveFactor; i++) { 2791 // Interleaved store group doesn't allow a gap, so each index has a member 2792 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2793 2794 Value *StoredVec = State.get(StoredValues[i], Part); 2795 2796 if (Group->isReverse()) 2797 StoredVec = reverseVector(StoredVec); 2798 2799 // If this member has different type, cast it to a unified type. 2800 2801 if (StoredVec->getType() != SubVT) 2802 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2803 2804 StoredVecs.push_back(StoredVec); 2805 } 2806 2807 // Concatenate all vectors into a wide vector. 2808 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2809 2810 // Interleave the elements in the wide vector. 2811 Value *IVec = Builder.CreateShuffleVector( 2812 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2813 "interleaved.vec"); 2814 2815 Instruction *NewStoreInstr; 2816 if (BlockInMask) { 2817 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2818 Value *ShuffledMask = Builder.CreateShuffleVector( 2819 BlockInMaskPart, 2820 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2821 "interleaved.mask"); 2822 NewStoreInstr = Builder.CreateMaskedStore( 2823 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2824 } 2825 else 2826 NewStoreInstr = 2827 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2828 2829 Group->addMetadata(NewStoreInstr); 2830 } 2831 } 2832 2833 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2834 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2835 VPValue *StoredValue, VPValue *BlockInMask) { 2836 // Attempt to issue a wide load. 
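  // For illustration: a consecutive access (CM_Widen) reaching here becomes
  // one wide load or store per unroll part; CM_Widen_Reverse additionally
  // reverses the vector (and any mask), and CM_GatherScatter emits a masked
  // gather or scatter instead.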
2837 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2838 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2839 2840 assert((LI || SI) && "Invalid Load/Store instruction"); 2841 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2842 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2843 2844 LoopVectorizationCostModel::InstWidening Decision = 2845 Cost->getWideningDecision(Instr, VF); 2846 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2847 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2848 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2849 "CM decision is not to widen the memory instruction"); 2850 2851 Type *ScalarDataTy = getMemInstValueType(Instr); 2852 2853 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2854 const Align Alignment = getLoadStoreAlignment(Instr); 2855 2856 // Determine if the pointer operand of the access is either consecutive or 2857 // reverse consecutive. 2858 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2859 bool ConsecutiveStride = 2860 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2861 bool CreateGatherScatter = 2862 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2863 2864 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2865 // gather/scatter. Otherwise Decision should have been to Scalarize. 2866 assert((ConsecutiveStride || CreateGatherScatter) && 2867 "The instruction should be scalarized"); 2868 (void)ConsecutiveStride; 2869 2870 VectorParts BlockInMaskParts(UF); 2871 bool isMaskRequired = BlockInMask; 2872 if (isMaskRequired) 2873 for (unsigned Part = 0; Part < UF; ++Part) 2874 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2875 2876 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2877 // Calculate the pointer for the specific unroll-part. 2878 GetElementPtrInst *PartPtr = nullptr; 2879 2880 bool InBounds = false; 2881 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2882 InBounds = gep->isInBounds(); 2883 if (Reverse) { 2884 // If the address is consecutive but reversed, then the 2885 // wide store needs to start at the last vector element. 2886 // RunTimeVF = VScale * VF.getKnownMinValue() 2887 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2888 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2889 // NumElt = -Part * RunTimeVF 2890 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2891 // LastLane = 1 - RunTimeVF 2892 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2893 PartPtr = 2894 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2895 PartPtr->setIsInBounds(InBounds); 2896 PartPtr = cast<GetElementPtrInst>( 2897 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2898 PartPtr->setIsInBounds(InBounds); 2899 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
2900 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2901 } else { 2902 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2903 PartPtr = cast<GetElementPtrInst>( 2904 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2905 PartPtr->setIsInBounds(InBounds); 2906 } 2907 2908 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2909 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2910 }; 2911 2912 // Handle Stores: 2913 if (SI) { 2914 setDebugLocFromInst(Builder, SI); 2915 2916 for (unsigned Part = 0; Part < UF; ++Part) { 2917 Instruction *NewSI = nullptr; 2918 Value *StoredVal = State.get(StoredValue, Part); 2919 if (CreateGatherScatter) { 2920 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2921 Value *VectorGep = State.get(Addr, Part); 2922 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2923 MaskPart); 2924 } else { 2925 if (Reverse) { 2926 // If we store to reverse consecutive memory locations, then we need 2927 // to reverse the order of elements in the stored value. 2928 StoredVal = reverseVector(StoredVal); 2929 // We don't want to update the value in the map as it might be used in 2930 // another expression. So don't call resetVectorValue(StoredVal). 2931 } 2932 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2933 if (isMaskRequired) 2934 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2935 BlockInMaskParts[Part]); 2936 else 2937 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2938 } 2939 addMetadata(NewSI, SI); 2940 } 2941 return; 2942 } 2943 2944 // Handle loads. 2945 assert(LI && "Must have a load instruction"); 2946 setDebugLocFromInst(Builder, LI); 2947 for (unsigned Part = 0; Part < UF; ++Part) { 2948 Value *NewLI; 2949 if (CreateGatherScatter) { 2950 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2951 Value *VectorGep = State.get(Addr, Part); 2952 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2953 nullptr, "wide.masked.gather"); 2954 addMetadata(NewLI, LI); 2955 } else { 2956 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2957 if (isMaskRequired) 2958 NewLI = Builder.CreateMaskedLoad( 2959 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2960 "wide.masked.load"); 2961 else 2962 NewLI = 2963 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2964 2965 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2966 addMetadata(NewLI, LI); 2967 if (Reverse) 2968 NewLI = reverseVector(NewLI); 2969 } 2970 2971 State.set(Def, NewLI, Part); 2972 } 2973 } 2974 2975 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 2976 VPUser &User, 2977 const VPIteration &Instance, 2978 bool IfPredicateInstr, 2979 VPTransformState &State) { 2980 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2981 2982 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2983 // the first lane and part. 2984 if (isa<NoAliasScopeDeclInst>(Instr)) 2985 if (!Instance.isFirstIteration()) 2986 return; 2987 2988 setDebugLocFromInst(Builder, Instr); 2989 2990 // Does this instruction return a value ? 
2991 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2992 2993 Instruction *Cloned = Instr->clone(); 2994 if (!IsVoidRetTy) 2995 Cloned->setName(Instr->getName() + ".cloned"); 2996 2997 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 2998 Builder.GetInsertPoint()); 2999 // Replace the operands of the cloned instructions with their scalar 3000 // equivalents in the new loop. 3001 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 3002 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 3003 auto InputInstance = Instance; 3004 if (!Operand || !OrigLoop->contains(Operand) || 3005 (Cost->isUniformAfterVectorization(Operand, State.VF))) 3006 InputInstance.Lane = VPLane::getFirstLane(); 3007 auto *NewOp = State.get(User.getOperand(op), InputInstance); 3008 Cloned->setOperand(op, NewOp); 3009 } 3010 addNewMetadata(Cloned, Instr); 3011 3012 // Place the cloned scalar in the new loop. 3013 Builder.Insert(Cloned); 3014 3015 State.set(Def, Cloned, Instance); 3016 3017 // If we just cloned a new assumption, add it the assumption cache. 3018 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3019 AC->registerAssumption(II); 3020 3021 // End if-block. 3022 if (IfPredicateInstr) 3023 PredicatedInstructions.push_back(Cloned); 3024 } 3025 3026 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 3027 Value *End, Value *Step, 3028 Instruction *DL) { 3029 BasicBlock *Header = L->getHeader(); 3030 BasicBlock *Latch = L->getLoopLatch(); 3031 // As we're just creating this loop, it's possible no latch exists 3032 // yet. If so, use the header as this will be a single block loop. 3033 if (!Latch) 3034 Latch = Header; 3035 3036 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 3037 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3038 setDebugLocFromInst(Builder, OldInst); 3039 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 3040 3041 Builder.SetInsertPoint(Latch->getTerminator()); 3042 setDebugLocFromInst(Builder, OldInst); 3043 3044 // Create i+1 and fill the PHINode. 3045 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 3046 Induction->addIncoming(Start, L->getLoopPreheader()); 3047 Induction->addIncoming(Next, Latch); 3048 // Create the compare. 3049 Value *ICmp = Builder.CreateICmpEQ(Next, End); 3050 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3051 3052 // Now we have two terminators. Remove the old one from the block. 3053 Latch->getTerminator()->eraseFromParent(); 3054 3055 return Induction; 3056 } 3057 3058 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3059 if (TripCount) 3060 return TripCount; 3061 3062 assert(L && "Create Trip Count for null loop."); 3063 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3064 // Find the loop boundaries. 3065 ScalarEvolution *SE = PSE.getSE(); 3066 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3067 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3068 "Invalid loop count"); 3069 3070 Type *IdxTy = Legal->getWidestInductionType(); 3071 assert(IdxTy && "No type for induction"); 3072 3073 // The exit count might have the type of i64 while the phi is i32. This can 3074 // happen if we have an induction variable that is sign extended before the 3075 // compare. The only way that we get a backedge taken count is that the 3076 // induction variable was signed and as such will not overflow. In such a case 3077 // truncation is legal. 
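  // For illustration: if the widest induction type IdxTy is i32 but the
  // backedge-taken count was computed as an i64, it is truncated to i32 here;
  // a narrower count is zero-extended instead. The trip count expanded below
  // is then BackedgeTakenCount + 1 in that type.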
3078 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3079 IdxTy->getPrimitiveSizeInBits()) 3080 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3081 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3082 3083 // Get the total trip count from the count by adding 1. 3084 const SCEV *ExitCount = SE->getAddExpr( 3085 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3086 3087 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3088 3089 // Expand the trip count and place the new instructions in the preheader. 3090 // Notice that the pre-header does not change, only the loop body. 3091 SCEVExpander Exp(*SE, DL, "induction"); 3092 3093 // Count holds the overall loop count (N). 3094 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3095 L->getLoopPreheader()->getTerminator()); 3096 3097 if (TripCount->getType()->isPointerTy()) 3098 TripCount = 3099 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3100 L->getLoopPreheader()->getTerminator()); 3101 3102 return TripCount; 3103 } 3104 3105 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3106 if (VectorTripCount) 3107 return VectorTripCount; 3108 3109 Value *TC = getOrCreateTripCount(L); 3110 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3111 3112 Type *Ty = TC->getType(); 3113 // This is where we can make the step a runtime constant. 3114 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3115 3116 // If the tail is to be folded by masking, round the number of iterations N 3117 // up to a multiple of Step instead of rounding down. This is done by first 3118 // adding Step-1 and then rounding down. Note that it's ok if this addition 3119 // overflows: the vector induction variable will eventually wrap to zero given 3120 // that it starts at zero and its Step is a power of two; the loop will then 3121 // exit, with the last early-exit vector comparison also producing all-true. 3122 if (Cost->foldTailByMasking()) { 3123 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3124 "VF*UF must be a power of 2 when folding tail by masking"); 3125 assert(!VF.isScalable() && 3126 "Tail folding not yet supported for scalable vectors"); 3127 TC = Builder.CreateAdd( 3128 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3129 } 3130 3131 // Now we need to generate the expression for the part of the loop that the 3132 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3133 // iterations are not required for correctness, or N - Step, otherwise. Step 3134 // is equal to the vectorization factor (number of SIMD elements) times the 3135 // unroll factor (number of SIMD instructions). 3136 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3137 3138 // There are two cases where we need to ensure (at least) the last iteration 3139 // runs in the scalar remainder loop. Thus, if the step evenly divides 3140 // the trip count, we set the remainder to be equal to the step. If the step 3141 // does not evenly divide the trip count, no adjustment is necessary since 3142 // there will already be scalar iterations. Note that the minimum iterations 3143 // check ensures that N >= Step. The cases are: 3144 // 1) If there is a non-reversed interleaved group that may speculatively 3145 // access memory out-of-bounds. 3146 // 2) If any instruction may follow a conditionally taken exit. 
That is, if 3147 // the loop contains multiple exiting blocks, or a single exiting block 3148 // which is not the latch. 3149 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3150 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3151 R = Builder.CreateSelect(IsZero, Step, R); 3152 } 3153 3154 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3155 3156 return VectorTripCount; 3157 } 3158 3159 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3160 const DataLayout &DL) { 3161 // Verify that V is a vector type with same number of elements as DstVTy. 3162 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3163 unsigned VF = DstFVTy->getNumElements(); 3164 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3165 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3166 Type *SrcElemTy = SrcVecTy->getElementType(); 3167 Type *DstElemTy = DstFVTy->getElementType(); 3168 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3169 "Vector elements must have same size"); 3170 3171 // Do a direct cast if element types are castable. 3172 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3173 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3174 } 3175 // V cannot be directly casted to desired vector type. 3176 // May happen when V is a floating point vector but DstVTy is a vector of 3177 // pointers or vice-versa. Handle this using a two-step bitcast using an 3178 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3179 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3180 "Only one type should be a pointer type"); 3181 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3182 "Only one type should be a floating point type"); 3183 Type *IntTy = 3184 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3185 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3186 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3187 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3188 } 3189 3190 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3191 BasicBlock *Bypass) { 3192 Value *Count = getOrCreateTripCount(L); 3193 // Reuse existing vector loop preheader for TC checks. 3194 // Note that new preheader block is generated for vector loop. 3195 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3196 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3197 3198 // Generate code to check if the loop's trip count is less than VF * UF, or 3199 // equal to it in case a scalar epilogue is required; this implies that the 3200 // vector trip count is zero. This check also covers the case where adding one 3201 // to the backedge-taken count overflowed leading to an incorrect trip count 3202 // of zero. In this case we will also jump to the scalar loop. 3203 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3204 : ICmpInst::ICMP_ULT; 3205 3206 // If tail is to be folded, vector loop takes care of all iterations. 3207 Value *CheckMinIters = Builder.getFalse(); 3208 if (!Cost->foldTailByMasking()) { 3209 Value *Step = 3210 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3211 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3212 } 3213 // Create new preheader for vector loop. 
3214 LoopVectorPreHeader = 3215 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3216 "vector.ph"); 3217 3218 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3219 DT->getNode(Bypass)->getIDom()) && 3220 "TC check is expected to dominate Bypass"); 3221 3222 // Update dominator for Bypass & LoopExit. 3223 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3224 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3225 3226 ReplaceInstWithInst( 3227 TCCheckBlock->getTerminator(), 3228 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3229 LoopBypassBlocks.push_back(TCCheckBlock); 3230 } 3231 3232 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3233 3234 BasicBlock *const SCEVCheckBlock = 3235 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3236 if (!SCEVCheckBlock) 3237 return nullptr; 3238 3239 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3240 (OptForSizeBasedOnProfile && 3241 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3242 "Cannot SCEV check stride or overflow when optimizing for size"); 3243 3244 3245 // Update dominator only if this is first RT check. 3246 if (LoopBypassBlocks.empty()) { 3247 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3248 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3249 } 3250 3251 LoopBypassBlocks.push_back(SCEVCheckBlock); 3252 AddedSafetyChecks = true; 3253 return SCEVCheckBlock; 3254 } 3255 3256 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3257 BasicBlock *Bypass) { 3258 // VPlan-native path does not do any analysis for runtime checks currently. 3259 if (EnableVPlanNativePath) 3260 return nullptr; 3261 3262 BasicBlock *const MemCheckBlock = 3263 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3264 3265 // Check if we generated code that checks in runtime if arrays overlap. We put 3266 // the checks into a separate block to make the more common case of few 3267 // elements faster. 3268 if (!MemCheckBlock) 3269 return nullptr; 3270 3271 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3272 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3273 "Cannot emit memory checks when optimizing for size, unless forced " 3274 "to vectorize."); 3275 ORE->emit([&]() { 3276 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3277 L->getStartLoc(), L->getHeader()) 3278 << "Code-size may be reduced by not forcing " 3279 "vectorization, or by source-code modifications " 3280 "eliminating the need for runtime checks " 3281 "(e.g., adding 'restrict')."; 3282 }); 3283 } 3284 3285 LoopBypassBlocks.push_back(MemCheckBlock); 3286 3287 AddedSafetyChecks = true; 3288 3289 // We currently don't use LoopVersioning for the actual loop cloning but we 3290 // still use it to add the noalias metadata. 
  LVer = std::make_unique<LoopVersioning>(
      *Legal->getLAI(),
      Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
      DT, PSE.getSE());
  LVer->prepareNoAliasMetadata();
  return MemCheckBlock;
}

Value *InnerLoopVectorizer::emitTransformedIndex(
    IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
    const InductionDescriptor &ID) const {

  SCEVExpander Exp(*SE, DL, "induction");
  auto Step = ID.getStep();
  auto StartValue = ID.getStartValue();
  assert(Index->getType() == Step->getType() &&
         "Index type does not match StepValue type");

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isZero())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isZero())
        return X;
    return B.CreateAdd(X, Y);
  };

  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isOne())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isOne())
        return X;
    return B.CreateMul(X, Y);
  };

  // Get a suitable insert point for SCEV expansion. For blocks in the vector
  // loop, choose the end of the vector loop header (=LoopVectorBody), because
  // the DomTree is not kept up-to-date for additional blocks generated in the
  // vector loop. By using the header as insertion point, we guarantee that the
  // expanded instructions dominate all their uses.
  auto GetInsertPoint = [this, &B]() {
    BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
    if (InsertBB != LoopVectorBody &&
        LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
      return LoopVectorBody->getTerminator();
    return &*B.GetInsertPoint();
  };

  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction: {
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(
        Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction: {
    assert(isa<SCEVConstant>(Step) &&
           "Expected constant step for pointer induction");
    return B.CreateGEP(
        StartValue->getType()->getPointerElementType(), StartValue,
        CreateMul(Index,
                  Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    auto InductionBinOp = ID.getInductionBinOp();
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
    Value *MulExp = B.CreateFMul(StepValue, Index);
    return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                         "induction");
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}

Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopScalarBody = OrigLoop->getHeader();
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  LoopExitBlock = OrigLoop->getUniqueExitBlock();
  assert(LoopExitBlock && "Must have an exit block");
  assert(LoopVectorPreHeader && "Invalid loop structure");

  LoopMiddleBlock =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, Twine(Prefix) + "middle.block");
  LoopScalarPreHeader =
      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                 nullptr, Twine(Prefix) + "scalar.ph");

  // Set up branch from middle block to the exit and scalar preheader blocks.
  // completeLoopSkeleton will update the condition to use an iteration check,
  // if required to decide whether to execute the remainder.
  BranchInst *BrInst =
      BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);

  // We intentionally don't let SplitBlock update LoopInfo, since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
  LoopVectorBody =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 nullptr, nullptr, Twine(Prefix) + "vector.body");

  // Update dominator for loop exit.
3418 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3419 3420 // Create and register the new vector loop. 3421 Loop *Lp = LI->AllocateLoop(); 3422 Loop *ParentLoop = OrigLoop->getParentLoop(); 3423 3424 // Insert the new loop into the loop nest and register the new basic blocks 3425 // before calling any utilities such as SCEV that require valid LoopInfo. 3426 if (ParentLoop) { 3427 ParentLoop->addChildLoop(Lp); 3428 } else { 3429 LI->addTopLevelLoop(Lp); 3430 } 3431 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3432 return Lp; 3433 } 3434 3435 void InnerLoopVectorizer::createInductionResumeValues( 3436 Loop *L, Value *VectorTripCount, 3437 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3438 assert(VectorTripCount && L && "Expected valid arguments"); 3439 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3440 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3441 "Inconsistent information about additional bypass."); 3442 // We are going to resume the execution of the scalar loop. 3443 // Go over all of the induction variables that we found and fix the 3444 // PHIs that are left in the scalar version of the loop. 3445 // The starting values of PHI nodes depend on the counter of the last 3446 // iteration in the vectorized loop. 3447 // If we come from a bypass edge then we need to start from the original 3448 // start value. 3449 for (auto &InductionEntry : Legal->getInductionVars()) { 3450 PHINode *OrigPhi = InductionEntry.first; 3451 InductionDescriptor II = InductionEntry.second; 3452 3453 // Create phi nodes to merge from the backedge-taken check block. 3454 PHINode *BCResumeVal = 3455 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3456 LoopScalarPreHeader->getTerminator()); 3457 // Copy original phi DL over to the new one. 3458 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3459 Value *&EndValue = IVEndValues[OrigPhi]; 3460 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3461 if (OrigPhi == OldInduction) { 3462 // We know what the end value is. 3463 EndValue = VectorTripCount; 3464 } else { 3465 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3466 3467 // Fast-math-flags propagate from the original induction instruction. 3468 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3469 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3470 3471 Type *StepType = II.getStep()->getType(); 3472 Instruction::CastOps CastOp = 3473 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3474 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3475 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3476 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3477 EndValue->setName("ind.end"); 3478 3479 // Compute the end value for the additional bypass (if applicable). 3480 if (AdditionalBypass.first) { 3481 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3482 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3483 StepType, true); 3484 CRD = 3485 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3486 EndValueFromAdditionalBypass = 3487 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3488 EndValueFromAdditionalBypass->setName("ind.end"); 3489 } 3490 } 3491 // The new PHI merges the original incoming value, in case of a bypass, 3492 // or the value at the end of the vectorized loop. 
3493 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3494 3495 // Fix the scalar body counter (PHI node). 3496 // The old induction's phi node in the scalar body needs the truncated 3497 // value. 3498 for (BasicBlock *BB : LoopBypassBlocks) 3499 BCResumeVal->addIncoming(II.getStartValue(), BB); 3500 3501 if (AdditionalBypass.first) 3502 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3503 EndValueFromAdditionalBypass); 3504 3505 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3506 } 3507 } 3508 3509 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3510 MDNode *OrigLoopID) { 3511 assert(L && "Expected valid loop."); 3512 3513 // The trip counts should be cached by now. 3514 Value *Count = getOrCreateTripCount(L); 3515 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3516 3517 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3518 3519 // Add a check in the middle block to see if we have completed 3520 // all of the iterations in the first vector loop. 3521 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3522 // If tail is to be folded, we know we don't need to run the remainder. 3523 if (!Cost->foldTailByMasking()) { 3524 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3525 Count, VectorTripCount, "cmp.n", 3526 LoopMiddleBlock->getTerminator()); 3527 3528 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3529 // of the corresponding compare because they may have ended up with 3530 // different line numbers and we want to avoid awkward line stepping while 3531 // debugging. Eg. if the compare has got a line number inside the loop. 3532 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3533 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3534 } 3535 3536 // Get ready to start creating new instructions into the vectorized body. 3537 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3538 "Inconsistent vector loop preheader"); 3539 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3540 3541 Optional<MDNode *> VectorizedLoopID = 3542 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3543 LLVMLoopVectorizeFollowupVectorized}); 3544 if (VectorizedLoopID.hasValue()) { 3545 L->setLoopID(VectorizedLoopID.getValue()); 3546 3547 // Do not setAlreadyVectorized if loop attributes have been defined 3548 // explicitly. 3549 return LoopVectorPreHeader; 3550 } 3551 3552 // Keep all loop hints from the original loop on the vector loop (we'll 3553 // replace the vectorizer-specific hints below). 3554 if (MDNode *LID = OrigLoop->getLoopID()) 3555 L->setLoopID(LID); 3556 3557 LoopVectorizeHints Hints(L, true, *ORE); 3558 Hints.setAlreadyVectorized(); 3559 3560 #ifdef EXPENSIVE_CHECKS 3561 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3562 LI->verify(*DT); 3563 #endif 3564 3565 return LoopVectorPreHeader; 3566 } 3567 3568 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3569 /* 3570 In this function we generate a new loop. The new loop will contain 3571 the vectorized instructions while the old loop will continue to run the 3572 scalar remainder. 3573 3574 [ ] <-- loop iteration number check. 3575 / | 3576 / v 3577 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3578 | / | 3579 | / v 3580 || [ ] <-- vector pre header. 3581 |/ | 3582 | v 3583 | [ ] \ 3584 | [ ]_| <-- vector loop. 3585 | | 3586 | v 3587 | -[ ] <--- middle-block. 
3588 | / | 3589 | / v 3590 -|- >[ ] <--- new preheader. 3591 | | 3592 | v 3593 | [ ] \ 3594 | [ ]_| <-- old scalar loop to handle remainder. 3595 \ | 3596 \ v 3597 >[ ] <-- exit block. 3598 ... 3599 */ 3600 3601 // Get the metadata of the original loop before it gets modified. 3602 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3603 3604 // Create an empty vector loop, and prepare basic blocks for the runtime 3605 // checks. 3606 Loop *Lp = createVectorLoopSkeleton(""); 3607 3608 // Now, compare the new count to zero. If it is zero skip the vector loop and 3609 // jump to the scalar loop. This check also covers the case where the 3610 // backedge-taken count is uint##_max: adding one to it will overflow leading 3611 // to an incorrect trip count of zero. In this (rare) case we will also jump 3612 // to the scalar loop. 3613 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3614 3615 // Generate the code to check any assumptions that we've made for SCEV 3616 // expressions. 3617 emitSCEVChecks(Lp, LoopScalarPreHeader); 3618 3619 // Generate the code that checks in runtime if arrays overlap. We put the 3620 // checks into a separate block to make the more common case of few elements 3621 // faster. 3622 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3623 3624 // Some loops have a single integer induction variable, while other loops 3625 // don't. One example is c++ iterators that often have multiple pointer 3626 // induction variables. In the code below we also support a case where we 3627 // don't have a single induction variable. 3628 // 3629 // We try to obtain an induction variable from the original loop as hard 3630 // as possible. However if we don't find one that: 3631 // - is an integer 3632 // - counts from zero, stepping by one 3633 // - is the size of the widest induction variable type 3634 // then we create a new one. 3635 OldInduction = Legal->getPrimaryInduction(); 3636 Type *IdxTy = Legal->getWidestInductionType(); 3637 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3638 // The loop step is equal to the vectorization factor (num of SIMD elements) 3639 // times the unroll factor (num of SIMD instructions). 3640 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3641 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3642 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3643 Induction = 3644 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3645 getDebugLocFromInstOrOperands(OldInduction)); 3646 3647 // Emit phis for the new starting index of the scalar loop. 3648 createInductionResumeValues(Lp, CountRoundDown); 3649 3650 return completeLoopSkeleton(Lp, OrigLoopID); 3651 } 3652 3653 // Fix up external users of the induction variable. At this point, we are 3654 // in LCSSA form, with all external PHIs that use the IV having one input value, 3655 // coming from the remainder loop. We need those PHIs to also have a correct 3656 // value for the IV when arriving directly from the middle block. 3657 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3658 const InductionDescriptor &II, 3659 Value *CountRoundDown, Value *EndValue, 3660 BasicBlock *MiddleBlock) { 3661 // There are two kinds of external IV usages - those that use the value 3662 // computed in the last iteration (the PHI) and those that use the penultimate 3663 // value (the value that feeds into the phi from the loop latch). 3664 // We allow both, but they, obviously, have different values. 
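  // For illustration (hypothetical source):
  //   for (i = 0; i < n; ++i) { ... }
  //   use(i);
  // An LCSSA phi fed by the post-increment value gets EndValue (the counter
  // after the last vector iteration), whereas a phi fed by the IV phi itself
  // gets the penultimate value, recomputed below as Start + Step * (CRD - 1).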
3665 3666 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3667 3668 DenseMap<Value *, Value *> MissingVals; 3669 3670 // An external user of the last iteration's value should see the value that 3671 // the remainder loop uses to initialize its own IV. 3672 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3673 for (User *U : PostInc->users()) { 3674 Instruction *UI = cast<Instruction>(U); 3675 if (!OrigLoop->contains(UI)) { 3676 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3677 MissingVals[UI] = EndValue; 3678 } 3679 } 3680 3681 // An external user of the penultimate value need to see EndValue - Step. 3682 // The simplest way to get this is to recompute it from the constituent SCEVs, 3683 // that is Start + (Step * (CRD - 1)). 3684 for (User *U : OrigPhi->users()) { 3685 auto *UI = cast<Instruction>(U); 3686 if (!OrigLoop->contains(UI)) { 3687 const DataLayout &DL = 3688 OrigLoop->getHeader()->getModule()->getDataLayout(); 3689 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3690 3691 IRBuilder<> B(MiddleBlock->getTerminator()); 3692 3693 // Fast-math-flags propagate from the original induction instruction. 3694 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3695 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3696 3697 Value *CountMinusOne = B.CreateSub( 3698 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3699 Value *CMO = 3700 !II.getStep()->getType()->isIntegerTy() 3701 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3702 II.getStep()->getType()) 3703 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3704 CMO->setName("cast.cmo"); 3705 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3706 Escape->setName("ind.escape"); 3707 MissingVals[UI] = Escape; 3708 } 3709 } 3710 3711 for (auto &I : MissingVals) { 3712 PHINode *PHI = cast<PHINode>(I.first); 3713 // One corner case we have to handle is two IVs "chasing" each-other, 3714 // that is %IV2 = phi [...], [ %IV1, %latch ] 3715 // In this case, if IV1 has an external use, we need to avoid adding both 3716 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3717 // don't already have an incoming value for the middle block. 3718 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3719 PHI->addIncoming(I.second, MiddleBlock); 3720 } 3721 } 3722 3723 namespace { 3724 3725 struct CSEDenseMapInfo { 3726 static bool canHandle(const Instruction *I) { 3727 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3728 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3729 } 3730 3731 static inline Instruction *getEmptyKey() { 3732 return DenseMapInfo<Instruction *>::getEmptyKey(); 3733 } 3734 3735 static inline Instruction *getTombstoneKey() { 3736 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3737 } 3738 3739 static unsigned getHashValue(const Instruction *I) { 3740 assert(canHandle(I) && "Unknown instruction!"); 3741 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3742 I->value_op_end())); 3743 } 3744 3745 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3746 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3747 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3748 return LHS == RHS; 3749 return LHS->isIdenticalTo(RHS); 3750 } 3751 }; 3752 3753 } // end anonymous namespace 3754 3755 ///Perform cse of induction variable instructions. 
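/// For illustration: after widening and unrolling, different instructions may
/// end up emitting identical getelementptr or shufflevector computations; the
/// function below replaces such duplicates with the first equivalent
/// instruction.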
3756 static void cse(BasicBlock *BB) { 3757 // Perform simple cse. 3758 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3759 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3760 Instruction *In = &*I++; 3761 3762 if (!CSEDenseMapInfo::canHandle(In)) 3763 continue; 3764 3765 // Check if we can replace this instruction with any of the 3766 // visited instructions. 3767 if (Instruction *V = CSEMap.lookup(In)) { 3768 In->replaceAllUsesWith(V); 3769 In->eraseFromParent(); 3770 continue; 3771 } 3772 3773 CSEMap[In] = In; 3774 } 3775 } 3776 3777 InstructionCost 3778 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3779 bool &NeedToScalarize) const { 3780 Function *F = CI->getCalledFunction(); 3781 Type *ScalarRetTy = CI->getType(); 3782 SmallVector<Type *, 4> Tys, ScalarTys; 3783 for (auto &ArgOp : CI->arg_operands()) 3784 ScalarTys.push_back(ArgOp->getType()); 3785 3786 // Estimate cost of scalarized vector call. The source operands are assumed 3787 // to be vectors, so we need to extract individual elements from there, 3788 // execute VF scalar calls, and then gather the result into the vector return 3789 // value. 3790 InstructionCost ScalarCallCost = 3791 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3792 if (VF.isScalar()) 3793 return ScalarCallCost; 3794 3795 // Compute corresponding vector type for return value and arguments. 3796 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3797 for (Type *ScalarTy : ScalarTys) 3798 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3799 3800 // Compute costs of unpacking argument values for the scalar calls and 3801 // packing the return values to a vector. 3802 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3803 3804 InstructionCost Cost = 3805 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3806 3807 // If we can't emit a vector call for this function, then the currently found 3808 // cost is the cost we need to return. 3809 NeedToScalarize = true; 3810 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3811 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3812 3813 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3814 return Cost; 3815 3816 // If the corresponding vector cost is cheaper, return its cost. 
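  // For illustration (made-up numbers): with VF = 4, a scalar call cost of 10
  // and a scalarization overhead of 12 give Cost = 4 * 10 + 12 = 52; if a
  // vector variant of the callee exists and its call cost computed below is,
  // say, 20, that cheaper cost is returned and NeedToScalarize is cleared.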
3817 InstructionCost VectorCallCost = 3818 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3819 if (VectorCallCost < Cost) { 3820 NeedToScalarize = false; 3821 Cost = VectorCallCost; 3822 } 3823 return Cost; 3824 } 3825 3826 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3827 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3828 return Elt; 3829 return VectorType::get(Elt, VF); 3830 } 3831 3832 InstructionCost 3833 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3834 ElementCount VF) const { 3835 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3836 assert(ID && "Expected intrinsic call!"); 3837 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3838 FastMathFlags FMF; 3839 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3840 FMF = FPMO->getFastMathFlags(); 3841 3842 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); 3843 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3844 SmallVector<Type *> ParamTys; 3845 std::transform(FTy->param_begin(), FTy->param_end(), 3846 std::back_inserter(ParamTys), 3847 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3848 3849 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3850 dyn_cast<IntrinsicInst>(CI)); 3851 return TTI.getIntrinsicInstrCost(CostAttrs, 3852 TargetTransformInfo::TCK_RecipThroughput); 3853 } 3854 3855 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3856 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3857 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3858 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3859 } 3860 3861 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3862 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3863 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3864 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3865 } 3866 3867 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3868 // For every instruction `I` in MinBWs, truncate the operands, create a 3869 // truncated version of `I` and reextend its result. InstCombine runs 3870 // later and will remove any ext/trunc pairs. 3871 SmallPtrSet<Value *, 4> Erased; 3872 for (const auto &KV : Cost->getMinimalBitwidths()) { 3873 // If the value wasn't vectorized, we must maintain the original scalar 3874 // type. The absence of the value from State indicates that it 3875 // wasn't vectorized. 3876 VPValue *Def = State.Plan->getVPValue(KV.first); 3877 if (!State.hasAnyVectorValue(Def)) 3878 continue; 3879 for (unsigned Part = 0; Part < UF; ++Part) { 3880 Value *I = State.get(Def, Part); 3881 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3882 continue; 3883 Type *OriginalTy = I->getType(); 3884 Type *ScalarTruncatedTy = 3885 IntegerType::get(OriginalTy->getContext(), KV.second); 3886 auto *TruncatedTy = FixedVectorType::get( 3887 ScalarTruncatedTy, 3888 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3889 if (TruncatedTy == OriginalTy) 3890 continue; 3891 3892 IRBuilder<> B(cast<Instruction>(I)); 3893 auto ShrinkOperand = [&](Value *V) -> Value * { 3894 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3895 if (ZI->getSrcTy() == TruncatedTy) 3896 return ZI->getOperand(0); 3897 return B.CreateZExtOrTrunc(V, TruncatedTy); 3898 }; 3899 3900 // The actual instruction modification depends on the instruction type, 3901 // unfortunately. 
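      // For illustration: if MinBWs records that an i32 add only needs 8 bits,
      // the <4 x i32> add is re-created below as a <4 x i8> add on truncated
      // operands, and its result is zero-extended back to <4 x i32> afterwards.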
3902 Value *NewI = nullptr; 3903 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3904 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3905 ShrinkOperand(BO->getOperand(1))); 3906 3907 // Any wrapping introduced by shrinking this operation shouldn't be 3908 // considered undefined behavior. So, we can't unconditionally copy 3909 // arithmetic wrapping flags to NewI. 3910 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3911 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3912 NewI = 3913 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3914 ShrinkOperand(CI->getOperand(1))); 3915 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3916 NewI = B.CreateSelect(SI->getCondition(), 3917 ShrinkOperand(SI->getTrueValue()), 3918 ShrinkOperand(SI->getFalseValue())); 3919 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3920 switch (CI->getOpcode()) { 3921 default: 3922 llvm_unreachable("Unhandled cast!"); 3923 case Instruction::Trunc: 3924 NewI = ShrinkOperand(CI->getOperand(0)); 3925 break; 3926 case Instruction::SExt: 3927 NewI = B.CreateSExtOrTrunc( 3928 CI->getOperand(0), 3929 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3930 break; 3931 case Instruction::ZExt: 3932 NewI = B.CreateZExtOrTrunc( 3933 CI->getOperand(0), 3934 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3935 break; 3936 } 3937 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3938 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3939 ->getNumElements(); 3940 auto *O0 = B.CreateZExtOrTrunc( 3941 SI->getOperand(0), 3942 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3943 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3944 ->getNumElements(); 3945 auto *O1 = B.CreateZExtOrTrunc( 3946 SI->getOperand(1), 3947 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3948 3949 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3950 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3951 // Don't do anything with the operands, just extend the result. 3952 continue; 3953 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3954 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3955 ->getNumElements(); 3956 auto *O0 = B.CreateZExtOrTrunc( 3957 IE->getOperand(0), 3958 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3959 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3960 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3961 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3962 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3963 ->getNumElements(); 3964 auto *O0 = B.CreateZExtOrTrunc( 3965 EE->getOperand(0), 3966 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3967 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3968 } else { 3969 // If we don't know what to do, be conservative and don't do anything. 3970 continue; 3971 } 3972 3973 // Lastly, extend the result. 3974 NewI->takeName(cast<Instruction>(I)); 3975 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3976 I->replaceAllUsesWith(Res); 3977 cast<Instruction>(I)->eraseFromParent(); 3978 Erased.insert(I); 3979 State.reset(Def, Res, Part); 3980 } 3981 } 3982 3983 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3984 for (const auto &KV : Cost->getMinimalBitwidths()) { 3985 // If the value wasn't vectorized, we must maintain the original scalar 3986 // type. The absence of the value from State indicates that it 3987 // wasn't vectorized. 
    VPValue *Def = State.Plan->getVPValue(KV.first);
    if (!State.hasAnyVectorValue(Def))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = State.get(Def, Part);
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        State.reset(Def, NewI, Part);
      }
    }
  }
}

void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF.isVector())
    truncateToMinimalBitwidths(State);

  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (OrigPHIsToFix.size()) {
    assert(EnableVPlanNativePath &&
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
    fixNonInductionPHIs(State);
  }

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  fixCrossIterationPHIs(State);

  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);

  // Fix-up external users of the induction variables.
  for (auto &Entry : Legal->getInductionVars())
    fixupIVUsers(Entry.first, Entry.second,
                 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
                 IVEndValues[Entry.first], LoopMiddleBlock);

  fixLCSSAPHIs(State);
  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(LoopVectorBody);

  // Set/update profile weights for the vector and remainder loops as the
  // original loop iterations are now distributed among them. Note that the
  // original loop, represented by LoopScalarBody, becomes the remainder loop
  // after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly roughened result, but that should be OK since the
  // profile is not inherently precise anyway. Note also that a possible bypass
  // of the vector code caused by legality checks is ignored, optimistically
  // assigning all the weight to the vector loop.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
  setProfileInfoAfterUnrolling(
      LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
      LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
}

void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #2: We now need to fix the recurrences by adding incoming edges to
  // the currently empty PHI nodes. At this point every instruction in the
  // original loop is widened to a vector form so we can use them to construct
  // the incoming edges.
  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
    // Handle first-order recurrences and reductions that need to be fixed.
4065 if (Legal->isFirstOrderRecurrence(&Phi)) 4066 fixFirstOrderRecurrence(&Phi, State); 4067 else if (Legal->isReductionVariable(&Phi)) 4068 fixReduction(&Phi, State); 4069 } 4070 } 4071 4072 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi, 4073 VPTransformState &State) { 4074 // This is the second phase of vectorizing first-order recurrences. An 4075 // overview of the transformation is described below. Suppose we have the 4076 // following loop. 4077 // 4078 // for (int i = 0; i < n; ++i) 4079 // b[i] = a[i] - a[i - 1]; 4080 // 4081 // There is a first-order recurrence on "a". For this loop, the shorthand 4082 // scalar IR looks like: 4083 // 4084 // scalar.ph: 4085 // s_init = a[-1] 4086 // br scalar.body 4087 // 4088 // scalar.body: 4089 // i = phi [0, scalar.ph], [i+1, scalar.body] 4090 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4091 // s2 = a[i] 4092 // b[i] = s2 - s1 4093 // br cond, scalar.body, ... 4094 // 4095 // In this example, s1 is a recurrence because it's value depends on the 4096 // previous iteration. In the first phase of vectorization, we created a 4097 // temporary value for s1. We now complete the vectorization and produce the 4098 // shorthand vector IR shown below (for VF = 4, UF = 1). 4099 // 4100 // vector.ph: 4101 // v_init = vector(..., ..., ..., a[-1]) 4102 // br vector.body 4103 // 4104 // vector.body 4105 // i = phi [0, vector.ph], [i+4, vector.body] 4106 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4107 // v2 = a[i, i+1, i+2, i+3]; 4108 // v3 = vector(v1(3), v2(0, 1, 2)) 4109 // b[i, i+1, i+2, i+3] = v2 - v3 4110 // br cond, vector.body, middle.block 4111 // 4112 // middle.block: 4113 // x = v2(3) 4114 // br scalar.ph 4115 // 4116 // scalar.ph: 4117 // s_init = phi [x, middle.block], [a[-1], otherwise] 4118 // br scalar.body 4119 // 4120 // After execution completes the vector loop, we extract the next value of 4121 // the recurrence (x) to use as the initial value in the scalar loop. 4122 4123 // Get the original loop preheader and single loop latch. 4124 auto *Preheader = OrigLoop->getLoopPreheader(); 4125 auto *Latch = OrigLoop->getLoopLatch(); 4126 4127 // Get the initial and previous values of the scalar recurrence. 4128 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4129 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4130 4131 // Create a vector from the initial value. 4132 auto *VectorInit = ScalarInit; 4133 if (VF.isVector()) { 4134 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4135 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4136 VectorInit = Builder.CreateInsertElement( 4137 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 4138 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 4139 } 4140 4141 VPValue *PhiDef = State.Plan->getVPValue(Phi); 4142 VPValue *PreviousDef = State.Plan->getVPValue(Previous); 4143 // We constructed a temporary phi node in the first phase of vectorization. 4144 // This phi node will eventually be deleted. 4145 Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0))); 4146 4147 // Create a phi node for the new recurrence. The current value will either be 4148 // the initial value inserted into a vector or loop-varying vector value. 4149 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4150 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4151 4152 // Get the vectorized previous value of the last part UF - 1. 
It appears last 4153 // among all unrolled iterations, due to the order of their construction. 4154 Value *PreviousLastPart = State.get(PreviousDef, UF - 1); 4155 4156 // Find and set the insertion point after the previous value if it is an 4157 // instruction. 4158 BasicBlock::iterator InsertPt; 4159 // Note that the previous value may have been constant-folded so it is not 4160 // guaranteed to be an instruction in the vector loop. 4161 // FIXME: Loop invariant values do not form recurrences. We should deal with 4162 // them earlier. 4163 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4164 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4165 else { 4166 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4167 if (isa<PHINode>(PreviousLastPart)) 4168 // If the previous value is a phi node, we should insert after all the phi 4169 // nodes in the block containing the PHI to avoid breaking basic block 4170 // verification. Note that the basic block may be different to 4171 // LoopVectorBody, in case we predicate the loop. 4172 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4173 else 4174 InsertPt = ++PreviousInst->getIterator(); 4175 } 4176 Builder.SetInsertPoint(&*InsertPt); 4177 4178 // We will construct a vector for the recurrence by combining the values for 4179 // the current and previous iterations. This is the required shuffle mask. 4180 assert(!VF.isScalable()); 4181 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 4182 ShuffleMask[0] = VF.getKnownMinValue() - 1; 4183 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 4184 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 4185 4186 // The vector from which to take the initial value for the current iteration 4187 // (actual or unrolled). Initially, this is the vector phi node. 4188 Value *Incoming = VecPhi; 4189 4190 // Shuffle the current and previous vector and update the vector parts. 4191 for (unsigned Part = 0; Part < UF; ++Part) { 4192 Value *PreviousPart = State.get(PreviousDef, Part); 4193 Value *PhiPart = State.get(PhiDef, Part); 4194 auto *Shuffle = 4195 VF.isVector() 4196 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 4197 : Incoming; 4198 PhiPart->replaceAllUsesWith(Shuffle); 4199 cast<Instruction>(PhiPart)->eraseFromParent(); 4200 State.reset(PhiDef, Shuffle, Part); 4201 Incoming = PreviousPart; 4202 } 4203 4204 // Fix the latch value of the new recurrence in the vector loop. 4205 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4206 4207 // Extract the last vector element in the middle block. This will be the 4208 // initial value for the recurrence when jumping to the scalar loop. 4209 auto *ExtractForScalar = Incoming; 4210 if (VF.isVector()) { 4211 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4212 ExtractForScalar = Builder.CreateExtractElement( 4213 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 4214 "vector.recur.extract"); 4215 } 4216 // Extract the second last element in the middle block if the 4217 // Phi is used outside the loop. We need to extract the phi itself 4218 // and not the last element (the phi update in the current iteration). This 4219 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4220 // when the scalar loop is not run at all. 
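//
// As a worked illustration (editorial sketch, reusing the a[i] - a[i - 1]
// example from the comment above with VF = 4, UF = 1 and a trip count that is
// a multiple of 4): after the final vector iteration, Incoming holds
// <a[n-4], a[n-3], a[n-2], a[n-1]>. The value of the scalar phi s1 in the last
// original iteration (i = n - 1) is a[n-2], i.e. lane VF - 2, which is what an
// LCSSA user of the phi outside the loop must observe; the value needed to
// restart the recurrence in the scalar loop is a[n-1], i.e. lane VF - 1
// (ExtractForScalar above).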
4221 Value *ExtractForPhiUsedOutsideLoop = nullptr;
4222 if (VF.isVector())
4223 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4224 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4225 "vector.recur.extract.for.phi");
4226 // When the loop is unrolled without vectorizing, initialize
4227 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value
4228 // of `Incoming`. This is analogous to the vectorized case above: extracting the
4229 // second last element when VF > 1.
4230 else if (UF > 1)
4231 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4232
4233 // Fix the initial value of the original recurrence in the scalar loop.
4234 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4235 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4236 for (auto *BB : predecessors(LoopScalarPreHeader)) {
4237 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4238 Start->addIncoming(Incoming, BB);
4239 }
4240
4241 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4242 Phi->setName("scalar.recur");
4243
4244 // Finally, fix users of the recurrence outside the loop. The users will need
4245 // either the last value of the scalar recurrence or the last value of the
4246 // vector recurrence we extracted in the middle block. Since the loop is in
4247 // LCSSA form, we just need to find all the phi nodes for the original scalar
4248 // recurrence in the exit block, and then add an edge for the middle block.
4249 // Note that LCSSA does not imply single entry when the original scalar loop
4250 // had multiple exiting edges (as we always run the last iteration in the
4251 // scalar epilogue); in that case, the exiting path through middle will be
4252 // dynamically dead and the value picked for the phi doesn't matter.
4253 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4254 if (any_of(LCSSAPhi.incoming_values(),
4255 [Phi](Value *V) { return V == Phi; }))
4256 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4257 }
4258
4259 static bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
4260 return EnableStrictReductions && RdxDesc.isOrdered();
4261 }
4262
4263 void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) {
4264 // Get its reduction variable descriptor.
4265 assert(Legal->isReductionVariable(Phi) &&
4266 "Unable to find the reduction variable");
4267 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4268
4269 RecurKind RK = RdxDesc.getRecurrenceKind();
4270 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4271 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4272 setDebugLocFromInst(Builder, ReductionStartValue);
4273 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4274
4275 VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
4276 // This is the vector-clone of the value that leaves the loop.
4277 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4278
4279 // Wrap flags are in general invalid after vectorization, clear them.
4280 clearReductionWrapFlags(RdxDesc, State);
4281
4282 // Fix the vector-loop phi.
4283
4284 // Reductions do not have to start at zero. They can start with
4285 // any loop invariant values.
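//
// For example (illustrative only, VF = 4 shown): for
//   int r = 42;
//   for (int i = 0; i < n; ++i) r += a[i];
// widenPHIInstruction seeded the vector phi of the add reduction in the
// preheader with <42, 0, 0, 0> for part 0 (start value in lane 0, identity
// elsewhere) and with the identity splat for the remaining parts; here we only
// hook up the loop-carried (backedge) operand of each part.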
4286 BasicBlock *Latch = OrigLoop->getLoopLatch(); 4287 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 4288 4289 for (unsigned Part = 0; Part < UF; ++Part) { 4290 Value *VecRdxPhi = State.get(State.Plan->getVPValue(Phi), Part); 4291 Value *Val = State.get(State.Plan->getVPValue(LoopVal), Part); 4292 if (IsInLoopReductionPhi && useOrderedReductions(RdxDesc) && 4293 State.VF.isVector()) 4294 Val = State.get(State.Plan->getVPValue(LoopVal), UF - 1); 4295 cast<PHINode>(VecRdxPhi) 4296 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4297 } 4298 4299 // Before each round, move the insertion point right between 4300 // the PHIs and the values we are going to write. 4301 // This allows us to write both PHINodes and the extractelement 4302 // instructions. 4303 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4304 4305 setDebugLocFromInst(Builder, LoopExitInst); 4306 4307 Type *PhiTy = Phi->getType(); 4308 // If tail is folded by masking, the vector value to leave the loop should be 4309 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4310 // instead of the former. For an inloop reduction the reduction will already 4311 // be predicated, and does not need to be handled here. 4312 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4313 for (unsigned Part = 0; Part < UF; ++Part) { 4314 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4315 Value *Sel = nullptr; 4316 for (User *U : VecLoopExitInst->users()) { 4317 if (isa<SelectInst>(U)) { 4318 assert(!Sel && "Reduction exit feeding two selects"); 4319 Sel = U; 4320 } else 4321 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4322 } 4323 assert(Sel && "Reduction exit feeds no select"); 4324 State.reset(LoopExitInstDef, Sel, Part); 4325 4326 // If the target can create a predicated operator for the reduction at no 4327 // extra cost in the loop (for example a predicated vadd), it can be 4328 // cheaper for the select to remain in the loop than be sunk out of it, 4329 // and so use the select value for the phi instead of the old 4330 // LoopExitValue. 4331 if (PreferPredicatedReductionSelect || 4332 TTI->preferPredicatedReductionSelect( 4333 RdxDesc.getOpcode(), PhiTy, 4334 TargetTransformInfo::ReductionFlags())) { 4335 auto *VecRdxPhi = 4336 cast<PHINode>(State.get(State.Plan->getVPValue(Phi), Part)); 4337 VecRdxPhi->setIncomingValueForBlock( 4338 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4339 } 4340 } 4341 } 4342 4343 // If the vector reduction can be performed in a smaller type, we truncate 4344 // then extend the loop exit value to enable InstCombine to evaluate the 4345 // entire expression in the smaller type. 4346 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4347 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4348 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4349 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4350 Builder.SetInsertPoint( 4351 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4352 VectorParts RdxParts(UF); 4353 for (unsigned Part = 0; Part < UF; ++Part) { 4354 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4355 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4356 Value *Extnd = RdxDesc.isSigned() ? 
Builder.CreateSExt(Trunc, VecTy)
4357 : Builder.CreateZExt(Trunc, VecTy);
4358 for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4359 UI != RdxParts[Part]->user_end();)
4360 if (*UI != Trunc) {
4361 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4362 RdxParts[Part] = Extnd;
4363 } else {
4364 ++UI;
4365 }
4366 }
4367 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4368 for (unsigned Part = 0; Part < UF; ++Part) {
4369 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4370 State.reset(LoopExitInstDef, RdxParts[Part], Part);
4371 }
4372 }
4373
4374 // Reduce all of the unrolled parts into a single vector.
4375 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4376 unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4377
4378 // The middle block terminator has already been assigned a DebugLoc here (the
4379 // OrigLoop's single latch terminator). We want the whole middle block to
4380 // appear to execute on this line because: (a) it is all compiler generated,
4381 // (b) these instructions are always executed after evaluating the latch
4382 // conditional branch, and (c) other passes may add new predecessors which
4383 // terminate on this line. This is the easiest way to ensure we don't
4384 // accidentally cause an extra step back into the loop while debugging.
4385 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4386 if (IsInLoopReductionPhi && useOrderedReductions(RdxDesc))
4387 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4388 else {
4389 // Floating-point operations should have some FMF to enable the reduction.
4390 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4391 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4392 for (unsigned Part = 1; Part < UF; ++Part) {
4393 Value *RdxPart = State.get(LoopExitInstDef, Part);
4394 if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4395 ReducedPartRdx = Builder.CreateBinOp(
4396 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4397 } else {
4398 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4399 }
4400 }
4401 }
4402
4403 // Create the reduction after the loop. Note that inloop reductions create the
4404 // target reduction in the loop using a Reduction recipe.
4405 if (VF.isVector() && !IsInLoopReductionPhi) {
4406 ReducedPartRdx =
4407 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4408 // If the reduction can be performed in a smaller type, we need to extend
4409 // the reduction to the wider type before we branch to the original loop.
4410 if (PhiTy != RdxDesc.getRecurrenceType())
4411 ReducedPartRdx = RdxDesc.isSigned()
4412 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4413 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4414 }
4415
4416 // Create a phi node that merges control-flow from the backedge-taken check
4417 // block and the middle block.
4418 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4419 LoopScalarPreHeader->getTerminator());
4420 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4421 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4422 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4423
4424 // Now, we need to fix the users of the reduction variable
4425 // inside and outside of the scalar remainder loop.
4426
4427 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4428 // in the exit blocks. See comment on analogous loop in
4429 // fixFirstOrderRecurrence for a more complete explanation of the logic.
4430 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4431 if (any_of(LCSSAPhi.incoming_values(), 4432 [LoopExitInst](Value *V) { return V == LoopExitInst; })) 4433 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4434 4435 // Fix the scalar loop reduction variable with the incoming reduction sum 4436 // from the vector body and from the backedge value. 4437 int IncomingEdgeBlockIdx = 4438 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4439 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4440 // Pick the other block. 4441 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4442 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4443 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4444 } 4445 4446 void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 4447 VPTransformState &State) { 4448 RecurKind RK = RdxDesc.getRecurrenceKind(); 4449 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4450 return; 4451 4452 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4453 assert(LoopExitInstr && "null loop exit instruction"); 4454 SmallVector<Instruction *, 8> Worklist; 4455 SmallPtrSet<Instruction *, 8> Visited; 4456 Worklist.push_back(LoopExitInstr); 4457 Visited.insert(LoopExitInstr); 4458 4459 while (!Worklist.empty()) { 4460 Instruction *Cur = Worklist.pop_back_val(); 4461 if (isa<OverflowingBinaryOperator>(Cur)) 4462 for (unsigned Part = 0; Part < UF; ++Part) { 4463 Value *V = State.get(State.Plan->getVPValue(Cur), Part); 4464 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4465 } 4466 4467 for (User *U : Cur->users()) { 4468 Instruction *UI = cast<Instruction>(U); 4469 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4470 Visited.insert(UI).second) 4471 Worklist.push_back(UI); 4472 } 4473 } 4474 } 4475 4476 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4477 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4478 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4479 // Some phis were already hand updated by the reduction and recurrence 4480 // code above, leave them alone. 4481 continue; 4482 4483 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4484 // Non-instruction incoming values will have only one value. 4485 4486 VPLane Lane = VPLane::getFirstLane(); 4487 if (isa<Instruction>(IncomingValue) && 4488 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4489 VF)) 4490 Lane = VPLane::getLastLaneForVF(VF); 4491 4492 // Can be a loop invariant incoming value or the last scalar value to be 4493 // extracted from the vectorized loop. 4494 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4495 Value *lastIncomingValue = 4496 OrigLoop->isLoopInvariant(IncomingValue) 4497 ? IncomingValue 4498 : State.get(State.Plan->getVPValue(IncomingValue), 4499 VPIteration(UF - 1, Lane)); 4500 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4501 } 4502 } 4503 4504 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4505 // The basic block and loop containing the predicated instruction. 4506 auto *PredBB = PredInst->getParent(); 4507 auto *VectorLoop = LI->getLoopFor(PredBB); 4508 4509 // Initialize a worklist with the operands of the predicated instruction. 4510 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4511 4512 // Holds instructions that we need to analyze again. An instruction may be 4513 // reanalyzed if we don't yet know if we can sink it or not. 
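//
// Illustrative example (hypothetical names): if the predicated block holds a
// scalarized store whose address is
//   %gep = getelementptr i32, i32* %base, i64 %idx
// and %gep has no users outside that block, the first pass sinks %gep into the
// predicated block; a later pass may then also sink the instruction defining
// %idx once all of its remaining uses are inside the block.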
4514 SmallVector<Instruction *, 8> InstsToReanalyze; 4515 4516 // Returns true if a given use occurs in the predicated block. Phi nodes use 4517 // their operands in their corresponding predecessor blocks. 4518 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4519 auto *I = cast<Instruction>(U.getUser()); 4520 BasicBlock *BB = I->getParent(); 4521 if (auto *Phi = dyn_cast<PHINode>(I)) 4522 BB = Phi->getIncomingBlock( 4523 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4524 return BB == PredBB; 4525 }; 4526 4527 // Iteratively sink the scalarized operands of the predicated instruction 4528 // into the block we created for it. When an instruction is sunk, it's 4529 // operands are then added to the worklist. The algorithm ends after one pass 4530 // through the worklist doesn't sink a single instruction. 4531 bool Changed; 4532 do { 4533 // Add the instructions that need to be reanalyzed to the worklist, and 4534 // reset the changed indicator. 4535 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4536 InstsToReanalyze.clear(); 4537 Changed = false; 4538 4539 while (!Worklist.empty()) { 4540 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4541 4542 // We can't sink an instruction if it is a phi node, is already in the 4543 // predicated block, is not in the loop, or may have side effects. 4544 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4545 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4546 continue; 4547 4548 // It's legal to sink the instruction if all its uses occur in the 4549 // predicated block. Otherwise, there's nothing to do yet, and we may 4550 // need to reanalyze the instruction. 4551 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4552 InstsToReanalyze.push_back(I); 4553 continue; 4554 } 4555 4556 // Move the instruction to the beginning of the predicated block, and add 4557 // it's operands to the worklist. 4558 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4559 Worklist.insert(I->op_begin(), I->op_end()); 4560 4561 // The sinking may have enabled other instructions to be sunk, so we will 4562 // need to iterate. 4563 Changed = true; 4564 } 4565 } while (Changed); 4566 } 4567 4568 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4569 for (PHINode *OrigPhi : OrigPHIsToFix) { 4570 VPWidenPHIRecipe *VPPhi = 4571 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4572 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4573 // Make sure the builder has a valid insert point. 4574 Builder.SetInsertPoint(NewPhi); 4575 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4576 VPValue *Inc = VPPhi->getIncomingValue(i); 4577 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4578 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4579 } 4580 } 4581 } 4582 4583 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4584 VPUser &Operands, unsigned UF, 4585 ElementCount VF, bool IsPtrLoopInvariant, 4586 SmallBitVector &IsIndexLoopInvariant, 4587 VPTransformState &State) { 4588 // Construct a vector GEP by widening the operands of the scalar GEP as 4589 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4590 // results in a vector of pointers when at least one operand of the GEP 4591 // is vector-typed. Thus, to keep the representation compact, we only use 4592 // vector-typed operands for loop-varying values. 
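//
// For example (illustrative shorthand): a GEP with a loop-varying index,
//   %gep = getelementptr inbounds i32, i32* %base, i64 %iv
// keeps %base scalar and takes the widened value of %iv as a vector index,
// producing one <VF x i32*> result per unroll part. If every operand is
// loop-invariant instead, the scalar GEP is cloned and broadcast (first branch
// below).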
4593 4594 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4595 // If we are vectorizing, but the GEP has only loop-invariant operands, 4596 // the GEP we build (by only using vector-typed operands for 4597 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4598 // produce a vector of pointers, we need to either arbitrarily pick an 4599 // operand to broadcast, or broadcast a clone of the original GEP. 4600 // Here, we broadcast a clone of the original. 4601 // 4602 // TODO: If at some point we decide to scalarize instructions having 4603 // loop-invariant operands, this special case will no longer be 4604 // required. We would add the scalarization decision to 4605 // collectLoopScalars() and teach getVectorValue() to broadcast 4606 // the lane-zero scalar value. 4607 auto *Clone = Builder.Insert(GEP->clone()); 4608 for (unsigned Part = 0; Part < UF; ++Part) { 4609 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4610 State.set(VPDef, EntryPart, Part); 4611 addMetadata(EntryPart, GEP); 4612 } 4613 } else { 4614 // If the GEP has at least one loop-varying operand, we are sure to 4615 // produce a vector of pointers. But if we are only unrolling, we want 4616 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4617 // produce with the code below will be scalar (if VF == 1) or vector 4618 // (otherwise). Note that for the unroll-only case, we still maintain 4619 // values in the vector mapping with initVector, as we do for other 4620 // instructions. 4621 for (unsigned Part = 0; Part < UF; ++Part) { 4622 // The pointer operand of the new GEP. If it's loop-invariant, we 4623 // won't broadcast it. 4624 auto *Ptr = IsPtrLoopInvariant 4625 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4626 : State.get(Operands.getOperand(0), Part); 4627 4628 // Collect all the indices for the new GEP. If any index is 4629 // loop-invariant, we won't broadcast it. 4630 SmallVector<Value *, 4> Indices; 4631 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4632 VPValue *Operand = Operands.getOperand(I); 4633 if (IsIndexLoopInvariant[I - 1]) 4634 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4635 else 4636 Indices.push_back(State.get(Operand, Part)); 4637 } 4638 4639 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4640 // but it should be a vector, otherwise. 4641 auto *NewGEP = 4642 GEP->isInBounds() 4643 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4644 Indices) 4645 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4646 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4647 "NewGEP is not a pointer vector"); 4648 State.set(VPDef, NewGEP, Part); 4649 addMetadata(NewGEP, GEP); 4650 } 4651 } 4652 } 4653 4654 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4655 RecurrenceDescriptor *RdxDesc, 4656 VPWidenPHIRecipe *PhiR, 4657 VPTransformState &State) { 4658 PHINode *P = cast<PHINode>(PN); 4659 if (EnableVPlanNativePath) { 4660 // Currently we enter here in the VPlan-native path for non-induction 4661 // PHIs where all control flow is uniform. We simply widen these PHIs. 4662 // Create a vector phi with no operands - the vector phi operands will be 4663 // set at the end of vector code generation. 4664 Type *VecTy = (State.VF.isScalar()) 4665 ? 
PN->getType()
4666 : VectorType::get(PN->getType(), State.VF);
4667 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4668 State.set(PhiR, VecPhi, 0);
4669 OrigPHIsToFix.push_back(P);
4670
4671 return;
4672 }
4673
4674 assert(PN->getParent() == OrigLoop->getHeader() &&
4675 "Non-header phis should have been handled elsewhere");
4676
4677 VPValue *StartVPV = PhiR->getStartValue();
4678 Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr;
4679 // In order to support recurrences we need to be able to vectorize Phi nodes.
4680 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4681 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4682 // this value when we vectorize all of the instructions that use the PHI.
4683 if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
4684 Value *Iden = nullptr;
4685 bool ScalarPHI =
4686 (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4687 Type *VecTy =
4688 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
4689
4690 if (RdxDesc) {
4691 assert(Legal->isReductionVariable(P) && StartV &&
4692 "RdxDesc should only be set for reduction variables; in that case "
4693 "a StartV is also required");
4694 RecurKind RK = RdxDesc->getRecurrenceKind();
4695 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
4696 // MinMax reductions have the start value as their identity.
4697 if (ScalarPHI) {
4698 Iden = StartV;
4699 } else {
4700 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4701 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4702 StartV = Iden =
4703 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
4704 }
4705 } else {
4706 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
4707 RK, VecTy->getScalarType(), RdxDesc->getFastMathFlags());
4708 Iden = IdenC;
4709
4710 if (!ScalarPHI) {
4711 Iden = ConstantVector::getSplat(State.VF, IdenC);
4712 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4713 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4714 Constant *Zero = Builder.getInt32(0);
4715 StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
4716 }
4717 }
4718 }
4719
4720 for (unsigned Part = 0; Part < State.UF; ++Part) {
4721 // This is phase one of vectorizing PHIs.
4722 Value *EntryPart = PHINode::Create(
4723 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4724 State.set(PhiR, EntryPart, Part);
4725 if (StartV) {
4726 // Make sure to add the reduction start value only to the
4727 // first unroll part.
4728 Value *StartVal = (Part == 0) ? StartV : Iden;
4729 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
4730 }
4731 }
4732 return;
4733 }
4734
4735 assert(!Legal->isReductionVariable(P) &&
4736 "reductions should be handled above");
4737
4738 setDebugLocFromInst(Builder, P);
4739
4740 // This PHINode must be an induction variable.
4741 // Make sure that we know about it.
4742 assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4743
4744 InductionDescriptor II = Legal->getInductionVars().lookup(P);
4745 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4746
4747 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4748 // which can be found from the original scalar operations.
4749 switch (II.getKind()) { 4750 case InductionDescriptor::IK_NoInduction: 4751 llvm_unreachable("Unknown induction"); 4752 case InductionDescriptor::IK_IntInduction: 4753 case InductionDescriptor::IK_FpInduction: 4754 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4755 case InductionDescriptor::IK_PtrInduction: { 4756 // Handle the pointer induction variable case. 4757 assert(P->getType()->isPointerTy() && "Unexpected type."); 4758 4759 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4760 // This is the normalized GEP that starts counting at zero. 4761 Value *PtrInd = 4762 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4763 // Determine the number of scalars we need to generate for each unroll 4764 // iteration. If the instruction is uniform, we only need to generate the 4765 // first lane. Otherwise, we generate all VF values. 4766 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4767 assert((IsUniform || !VF.isScalable()) && 4768 "Currently unsupported for scalable vectors"); 4769 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 4770 4771 Value *RuntimeVF = getRuntimeVF(Builder, PtrInd->getType(), VF); 4772 for (unsigned Part = 0; Part < UF; ++Part) { 4773 Value *PartStart = Builder.CreateMul( 4774 RuntimeVF, ConstantInt::get(PtrInd->getType(), Part)); 4775 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4776 Value *Idx = Builder.CreateAdd( 4777 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4778 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4779 Value *SclrGep = 4780 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4781 SclrGep->setName("next.gep"); 4782 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4783 } 4784 } 4785 return; 4786 } 4787 assert(isa<SCEVConstant>(II.getStep()) && 4788 "Induction step not a SCEV constant!"); 4789 Type *PhiType = II.getStep()->getType(); 4790 4791 // Build a pointer phi 4792 Value *ScalarStartValue = II.getStartValue(); 4793 Type *ScStValueType = ScalarStartValue->getType(); 4794 PHINode *NewPointerPhi = 4795 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4796 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4797 4798 // A pointer induction, performed by using a gep 4799 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4800 Instruction *InductionLoc = LoopLatch->getTerminator(); 4801 const SCEV *ScalarStep = II.getStep(); 4802 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4803 Value *ScalarStepValue = 4804 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4805 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4806 Value *NumUnrolledElems = 4807 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4808 Value *InductionGEP = GetElementPtrInst::Create( 4809 ScStValueType->getPointerElementType(), NewPointerPhi, 4810 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4811 InductionLoc); 4812 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4813 4814 // Create UF many actual address geps that use the pointer 4815 // phi as base and a vectorized version of the step value 4816 // (<step*0, ..., step*N>) as offset. 
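// Illustrative example: with a fixed VF = 4, UF = 2 and pointer step S,
// NewPointerPhi advances by 8 * S elements per vector iteration, part 0 uses
// the offset vector <0, 1, 2, 3> * S and part 1 uses <4, 5, 6, 7> * S.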
4817 for (unsigned Part = 0; Part < State.UF; ++Part) { 4818 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4819 Value *StartOffsetScalar = 4820 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4821 Value *StartOffset = 4822 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4823 // Create a vector of consecutive numbers from zero to VF. 4824 StartOffset = 4825 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4826 4827 Value *GEP = Builder.CreateGEP( 4828 ScStValueType->getPointerElementType(), NewPointerPhi, 4829 Builder.CreateMul( 4830 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4831 "vector.gep")); 4832 State.set(PhiR, GEP, Part); 4833 } 4834 } 4835 } 4836 } 4837 4838 /// A helper function for checking whether an integer division-related 4839 /// instruction may divide by zero (in which case it must be predicated if 4840 /// executed conditionally in the scalar code). 4841 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4842 /// Non-zero divisors that are non compile-time constants will not be 4843 /// converted into multiplication, so we will still end up scalarizing 4844 /// the division, but can do so w/o predication. 4845 static bool mayDivideByZero(Instruction &I) { 4846 assert((I.getOpcode() == Instruction::UDiv || 4847 I.getOpcode() == Instruction::SDiv || 4848 I.getOpcode() == Instruction::URem || 4849 I.getOpcode() == Instruction::SRem) && 4850 "Unexpected instruction"); 4851 Value *Divisor = I.getOperand(1); 4852 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4853 return !CInt || CInt->isZero(); 4854 } 4855 4856 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4857 VPUser &User, 4858 VPTransformState &State) { 4859 switch (I.getOpcode()) { 4860 case Instruction::Call: 4861 case Instruction::Br: 4862 case Instruction::PHI: 4863 case Instruction::GetElementPtr: 4864 case Instruction::Select: 4865 llvm_unreachable("This instruction is handled by a different recipe."); 4866 case Instruction::UDiv: 4867 case Instruction::SDiv: 4868 case Instruction::SRem: 4869 case Instruction::URem: 4870 case Instruction::Add: 4871 case Instruction::FAdd: 4872 case Instruction::Sub: 4873 case Instruction::FSub: 4874 case Instruction::FNeg: 4875 case Instruction::Mul: 4876 case Instruction::FMul: 4877 case Instruction::FDiv: 4878 case Instruction::FRem: 4879 case Instruction::Shl: 4880 case Instruction::LShr: 4881 case Instruction::AShr: 4882 case Instruction::And: 4883 case Instruction::Or: 4884 case Instruction::Xor: { 4885 // Just widen unops and binops. 4886 setDebugLocFromInst(Builder, &I); 4887 4888 for (unsigned Part = 0; Part < UF; ++Part) { 4889 SmallVector<Value *, 2> Ops; 4890 for (VPValue *VPOp : User.operands()) 4891 Ops.push_back(State.get(VPOp, Part)); 4892 4893 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4894 4895 if (auto *VecOp = dyn_cast<Instruction>(V)) 4896 VecOp->copyIRFlags(&I); 4897 4898 // Use this vector value for all users of the original instruction. 4899 State.set(Def, V, Part); 4900 addMetadata(V, &I); 4901 } 4902 4903 break; 4904 } 4905 case Instruction::ICmp: 4906 case Instruction::FCmp: { 4907 // Widen compares. Generate vector compares. 
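// For example (illustrative): a scalar 'icmp slt i32 %a, %b' is widened to
// 'icmp slt <VF x i32> %va, %vb' for each unroll part, yielding a <VF x i1>
// mask instead of a single i1.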
4908 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4909 auto *Cmp = cast<CmpInst>(&I); 4910 setDebugLocFromInst(Builder, Cmp); 4911 for (unsigned Part = 0; Part < UF; ++Part) { 4912 Value *A = State.get(User.getOperand(0), Part); 4913 Value *B = State.get(User.getOperand(1), Part); 4914 Value *C = nullptr; 4915 if (FCmp) { 4916 // Propagate fast math flags. 4917 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4918 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4919 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4920 } else { 4921 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4922 } 4923 State.set(Def, C, Part); 4924 addMetadata(C, &I); 4925 } 4926 4927 break; 4928 } 4929 4930 case Instruction::ZExt: 4931 case Instruction::SExt: 4932 case Instruction::FPToUI: 4933 case Instruction::FPToSI: 4934 case Instruction::FPExt: 4935 case Instruction::PtrToInt: 4936 case Instruction::IntToPtr: 4937 case Instruction::SIToFP: 4938 case Instruction::UIToFP: 4939 case Instruction::Trunc: 4940 case Instruction::FPTrunc: 4941 case Instruction::BitCast: { 4942 auto *CI = cast<CastInst>(&I); 4943 setDebugLocFromInst(Builder, CI); 4944 4945 /// Vectorize casts. 4946 Type *DestTy = 4947 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4948 4949 for (unsigned Part = 0; Part < UF; ++Part) { 4950 Value *A = State.get(User.getOperand(0), Part); 4951 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4952 State.set(Def, Cast, Part); 4953 addMetadata(Cast, &I); 4954 } 4955 break; 4956 } 4957 default: 4958 // This instruction is not vectorized by simple widening. 4959 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4960 llvm_unreachable("Unhandled instruction!"); 4961 } // end of switch. 4962 } 4963 4964 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4965 VPUser &ArgOperands, 4966 VPTransformState &State) { 4967 assert(!isa<DbgInfoIntrinsic>(I) && 4968 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4969 setDebugLocFromInst(Builder, &I); 4970 4971 Module *M = I.getParent()->getParent()->getParent(); 4972 auto *CI = cast<CallInst>(&I); 4973 4974 SmallVector<Type *, 4> Tys; 4975 for (Value *ArgOperand : CI->arg_operands()) 4976 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4977 4978 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4979 4980 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4981 // version of the instruction. 4982 // Is it beneficial to perform intrinsic call compared to lib call? 4983 bool NeedToScalarize = false; 4984 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4985 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4986 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4987 assert((UseVectorIntrinsic || !NeedToScalarize) && 4988 "Instruction should be scalarized elsewhere."); 4989 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4990 "Either the intrinsic cost or vector call cost must be valid"); 4991 4992 for (unsigned Part = 0; Part < UF; ++Part) { 4993 SmallVector<Value *, 4> Args; 4994 for (auto &I : enumerate(ArgOperands.operands())) { 4995 // Some intrinsics have a scalar argument - don't replace it with a 4996 // vector. 
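// (E.g. the i32 exponent operand of llvm.powi stays scalar even when the call
// itself is widened - illustrative example.)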
4997 Value *Arg; 4998 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4999 Arg = State.get(I.value(), Part); 5000 else 5001 Arg = State.get(I.value(), VPIteration(0, 0)); 5002 Args.push_back(Arg); 5003 } 5004 5005 Function *VectorF; 5006 if (UseVectorIntrinsic) { 5007 // Use vector version of the intrinsic. 5008 Type *TysForDecl[] = {CI->getType()}; 5009 if (VF.isVector()) 5010 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 5011 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 5012 assert(VectorF && "Can't retrieve vector intrinsic."); 5013 } else { 5014 // Use vector version of the function call. 5015 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 5016 #ifndef NDEBUG 5017 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 5018 "Can't create vector function."); 5019 #endif 5020 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 5021 } 5022 SmallVector<OperandBundleDef, 1> OpBundles; 5023 CI->getOperandBundlesAsDefs(OpBundles); 5024 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 5025 5026 if (isa<FPMathOperator>(V)) 5027 V->copyFastMathFlags(CI); 5028 5029 State.set(Def, V, Part); 5030 addMetadata(V, &I); 5031 } 5032 } 5033 5034 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 5035 VPUser &Operands, 5036 bool InvariantCond, 5037 VPTransformState &State) { 5038 setDebugLocFromInst(Builder, &I); 5039 5040 // The condition can be loop invariant but still defined inside the 5041 // loop. This means that we can't just use the original 'cond' value. 5042 // We have to take the 'vectorized' value and pick the first lane. 5043 // Instcombine will make this a no-op. 5044 auto *InvarCond = InvariantCond 5045 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5046 : nullptr; 5047 5048 for (unsigned Part = 0; Part < UF; ++Part) { 5049 Value *Cond = 5050 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5051 Value *Op0 = State.get(Operands.getOperand(1), Part); 5052 Value *Op1 = State.get(Operands.getOperand(2), Part); 5053 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5054 State.set(VPDef, Sel, Part); 5055 addMetadata(Sel, &I); 5056 } 5057 } 5058 5059 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5060 // We should not collect Scalars more than once per VF. Right now, this 5061 // function is called from collectUniformsAndScalars(), which already does 5062 // this check. Collecting Scalars for VF=1 does not make any sense. 5063 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5064 "This function should not be visited twice for the same VF"); 5065 5066 SmallSetVector<Instruction *, 8> Worklist; 5067 5068 // These sets are used to seed the analysis with pointers used by memory 5069 // accesses that will remain scalar. 5070 SmallSetVector<Instruction *, 8> ScalarPtrs; 5071 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5072 auto *Latch = TheLoop->getLoopLatch(); 5073 5074 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5075 // The pointer operands of loads and stores will be scalar as long as the 5076 // memory access is not a gather or scatter operation. The value operand of a 5077 // store will remain scalar if the store is scalarized. 
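//
// For example (illustrative): for a store 'a[i] = x', the pointer operand is
// used as a scalar as long as the store is not turned into a scatter, whereas
// the value operand x is only used as a scalar if the store itself is
// scalarized.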
5078 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5079 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5080 assert(WideningDecision != CM_Unknown &&
5081 "Widening decision should be ready at this moment");
5082 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5083 if (Ptr == Store->getValueOperand())
5084 return WideningDecision == CM_Scalarize;
5085 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
5086 "Ptr is neither a value or pointer operand");
5087 return WideningDecision != CM_GatherScatter;
5088 };
5089
5090 // A helper that returns true if the given value is a bitcast or
5091 // getelementptr instruction contained in the loop.
5092 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5093 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5094 isa<GetElementPtrInst>(V)) &&
5095 !TheLoop->isLoopInvariant(V);
5096 };
5097
5098 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
5099 if (!isa<PHINode>(Ptr) ||
5100 !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
5101 return false;
5102 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
5103 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
5104 return false;
5105 return isScalarUse(MemAccess, Ptr);
5106 };
5107
5108 // A helper that evaluates a memory access's use of a pointer. If the
5109 // pointer is actually the pointer induction of a loop, it is inserted
5110 // into Worklist. If the use will be a scalar use, and the
5111 // pointer is only used by memory accesses, we place the pointer in
5112 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
5113 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5114 if (isScalarPtrInduction(MemAccess, Ptr)) {
5115 Worklist.insert(cast<Instruction>(Ptr));
5116 Instruction *Update = cast<Instruction>(
5117 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5118 Worklist.insert(Update);
5119 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5120 << "\n");
5121 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
5122 << "\n");
5123 return;
5124 }
5125 // We only care about bitcast and getelementptr instructions contained in
5126 // the loop.
5127 if (!isLoopVaryingBitCastOrGEP(Ptr))
5128 return;
5129
5130 // If the pointer has already been identified as scalar (e.g., if it was
5131 // also identified as uniform), there's nothing to do.
5132 auto *I = cast<Instruction>(Ptr);
5133 if (Worklist.count(I))
5134 return;
5135
5136 // If the use of the pointer will be a scalar use, and all users of the
5137 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5138 // place the pointer in PossibleNonScalarPtrs.
5139 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5140 return isa<LoadInst>(U) || isa<StoreInst>(U);
5141 }))
5142 ScalarPtrs.insert(I);
5143 else
5144 PossibleNonScalarPtrs.insert(I);
5145 };
5146
5147 // We seed the scalars analysis with two classes of instructions: (1)
5148 // instructions marked uniform-after-vectorization and (2) bitcast,
5149 // getelementptr and (pointer) phi instructions used by memory accesses
5150 // requiring a scalar use.
5151 //
5152 // (1) Add to the worklist all instructions that have been identified as
5153 // uniform-after-vectorization.
5154 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5155 5156 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5157 // memory accesses requiring a scalar use. The pointer operands of loads and 5158 // stores will be scalar as long as the memory accesses is not a gather or 5159 // scatter operation. The value operand of a store will remain scalar if the 5160 // store is scalarized. 5161 for (auto *BB : TheLoop->blocks()) 5162 for (auto &I : *BB) { 5163 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5164 evaluatePtrUse(Load, Load->getPointerOperand()); 5165 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5166 evaluatePtrUse(Store, Store->getPointerOperand()); 5167 evaluatePtrUse(Store, Store->getValueOperand()); 5168 } 5169 } 5170 for (auto *I : ScalarPtrs) 5171 if (!PossibleNonScalarPtrs.count(I)) { 5172 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5173 Worklist.insert(I); 5174 } 5175 5176 // Insert the forced scalars. 5177 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5178 // induction variable when the PHI user is scalarized. 5179 auto ForcedScalar = ForcedScalars.find(VF); 5180 if (ForcedScalar != ForcedScalars.end()) 5181 for (auto *I : ForcedScalar->second) 5182 Worklist.insert(I); 5183 5184 // Expand the worklist by looking through any bitcasts and getelementptr 5185 // instructions we've already identified as scalar. This is similar to the 5186 // expansion step in collectLoopUniforms(); however, here we're only 5187 // expanding to include additional bitcasts and getelementptr instructions. 5188 unsigned Idx = 0; 5189 while (Idx != Worklist.size()) { 5190 Instruction *Dst = Worklist[Idx++]; 5191 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5192 continue; 5193 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5194 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5195 auto *J = cast<Instruction>(U); 5196 return !TheLoop->contains(J) || Worklist.count(J) || 5197 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5198 isScalarUse(J, Src)); 5199 })) { 5200 Worklist.insert(Src); 5201 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5202 } 5203 } 5204 5205 // An induction variable will remain scalar if all users of the induction 5206 // variable and induction variable update remain scalar. 5207 for (auto &Induction : Legal->getInductionVars()) { 5208 auto *Ind = Induction.first; 5209 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5210 5211 // If tail-folding is applied, the primary induction variable will be used 5212 // to feed a vector compare. 5213 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5214 continue; 5215 5216 // Determine if all users of the induction variable are scalar after 5217 // vectorization. 5218 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5219 auto *I = cast<Instruction>(U); 5220 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5221 }); 5222 if (!ScalarInd) 5223 continue; 5224 5225 // Determine if all users of the induction variable update instruction are 5226 // scalar after vectorization. 5227 auto ScalarIndUpdate = 5228 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5229 auto *I = cast<Instruction>(U); 5230 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5231 }); 5232 if (!ScalarIndUpdate) 5233 continue; 5234 5235 // The induction variable and its update instruction will remain scalar. 
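// (Illustrative example: in 'a[i] = b[i]' with both accesses widened to
//  consecutive vector loads/stores, i is only used by address computations
//  that remain scalar and by its own update, so i and i + 1 stay scalar; if
//  instead the value of i were stored by a widened store, a vector induction
//  would be required and i would not be added here.)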
5236 Worklist.insert(Ind); 5237 Worklist.insert(IndUpdate); 5238 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5239 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5240 << "\n"); 5241 } 5242 5243 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5244 } 5245 5246 bool LoopVectorizationCostModel::isScalarWithPredication( 5247 Instruction *I, ElementCount VF) const { 5248 if (!blockNeedsPredication(I->getParent())) 5249 return false; 5250 switch(I->getOpcode()) { 5251 default: 5252 break; 5253 case Instruction::Load: 5254 case Instruction::Store: { 5255 if (!Legal->isMaskRequired(I)) 5256 return false; 5257 auto *Ptr = getLoadStorePointerOperand(I); 5258 auto *Ty = getMemInstValueType(I); 5259 // We have already decided how to vectorize this instruction, get that 5260 // result. 5261 if (VF.isVector()) { 5262 InstWidening WideningDecision = getWideningDecision(I, VF); 5263 assert(WideningDecision != CM_Unknown && 5264 "Widening decision should be ready at this moment"); 5265 return WideningDecision == CM_Scalarize; 5266 } 5267 const Align Alignment = getLoadStoreAlignment(I); 5268 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5269 isLegalMaskedGather(Ty, Alignment)) 5270 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5271 isLegalMaskedScatter(Ty, Alignment)); 5272 } 5273 case Instruction::UDiv: 5274 case Instruction::SDiv: 5275 case Instruction::SRem: 5276 case Instruction::URem: 5277 return mayDivideByZero(*I); 5278 } 5279 return false; 5280 } 5281 5282 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5283 Instruction *I, ElementCount VF) { 5284 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5285 assert(getWideningDecision(I, VF) == CM_Unknown && 5286 "Decision should not be set yet."); 5287 auto *Group = getInterleavedAccessGroup(I); 5288 assert(Group && "Must have a group."); 5289 5290 // If the instruction's allocated size doesn't equal it's type size, it 5291 // requires padding and will be scalarized. 5292 auto &DL = I->getModule()->getDataLayout(); 5293 auto *ScalarTy = getMemInstValueType(I); 5294 if (hasIrregularType(ScalarTy, DL)) 5295 return false; 5296 5297 // Check if masking is required. 5298 // A Group may need masking for one of two reasons: it resides in a block that 5299 // needs predication, or it was decided to use masking to deal with gaps. 5300 bool PredicatedAccessRequiresMasking = 5301 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5302 bool AccessWithGapsRequiresMasking = 5303 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5304 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5305 return true; 5306 5307 // If masked interleaving is required, we expect that the user/target had 5308 // enabled it, because otherwise it either wouldn't have been created or 5309 // it should have been invalidated by the CostModel. 5310 assert(useMaskedInterleavedAccesses(TTI) && 5311 "Masked interleave-groups for predicated accesses are not enabled."); 5312 5313 auto *Ty = getMemInstValueType(I); 5314 const Align Alignment = getLoadStoreAlignment(I); 5315 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5316 : TTI.isLegalMaskedStore(Ty, Alignment); 5317 } 5318 5319 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5320 Instruction *I, ElementCount VF) { 5321 // Get and ensure we have a valid memory instruction. 
5322 LoadInst *LI = dyn_cast<LoadInst>(I);
5323 StoreInst *SI = dyn_cast<StoreInst>(I);
5324 assert((LI || SI) && "Invalid memory instruction");
5325
5326 auto *Ptr = getLoadStorePointerOperand(I);
5327
5328 // In order to be widened, the pointer should be consecutive, first of all.
5329 if (!Legal->isConsecutivePtr(Ptr))
5330 return false;
5331
5332 // If the instruction is a store located in a predicated block, it will be
5333 // scalarized.
5334 if (isScalarWithPredication(I))
5335 return false;
5336
5337 // If the instruction's allocated size doesn't equal its type size, it
5338 // requires padding and will be scalarized.
5339 auto &DL = I->getModule()->getDataLayout();
5340 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5341 if (hasIrregularType(ScalarTy, DL))
5342 return false;
5343
5344 return true;
5345 }
5346
5347 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5348 // We should not collect Uniforms more than once per VF. Right now,
5349 // this function is called from collectUniformsAndScalars(), which
5350 // already does this check. Collecting Uniforms for VF=1 does not make any
5351 // sense.
5352
5353 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5354 "This function should not be visited twice for the same VF");
5355
5356 // Start with an empty entry for this VF: even if we find no uniform value,
5357 // the entry ensures we will not analyze this VF again; Uniforms.count(VF)
5358 // will return 1.
5359 Uniforms[VF].clear();
5360
5361 // We now know that the loop is vectorizable!
5362 // Collect instructions inside the loop that will remain uniform after
5363 // vectorization.
5364
5365 // Global values, params and instructions outside of the current loop are out
5366 // of scope.
5367 auto isOutOfScope = [&](Value *V) -> bool {
5368 Instruction *I = dyn_cast<Instruction>(V);
5369 return (!I || !TheLoop->contains(I));
5370 };
5371
5372 SetVector<Instruction *> Worklist;
5373 BasicBlock *Latch = TheLoop->getLoopLatch();
5374
5375 // Instructions that are scalar with predication must not be considered
5376 // uniform after vectorization, because that would create an erroneous
5377 // replicating region where only a single instance out of VF should be formed.
5378 // TODO: optimize such seldom cases if found important, see PR40816.
5379 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5380 if (isOutOfScope(I)) {
5381 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5382 << *I << "\n");
5383 return;
5384 }
5385 if (isScalarWithPredication(I, VF)) {
5386 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5387 << *I << "\n");
5388 return;
5389 }
5390 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5391 Worklist.insert(I);
5392 };
5393
5394 // Start with the conditional branch. If the branch condition is an
5395 // instruction contained in the loop that is only used by the branch, it is
5396 // uniform.
5397 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5398 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5399 addToWorklistIfAllowed(Cmp);
5400
5401 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5402 InstWidening WideningDecision = getWideningDecision(I, VF);
5403 assert(WideningDecision != CM_Unknown &&
5404 "Widening decision should be ready at this moment");
5405
5406 // A uniform memory op is itself uniform. We exclude uniform stores
5407 // here as they demand the last lane, not the first one.
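// (Illustrative example: a load from a loop-invariant address, 'v = *p',
//  reads the same location on every iteration, so lane 0 suffices and it may
//  be treated as uniform; a store '*p = x' must keep the value of the final
//  iteration, i.e. the last lane, and is therefore excluded here.)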
5407 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5408 assert(WideningDecision == CM_Scalarize); 5409 return true; 5410 } 5411 5412 return (WideningDecision == CM_Widen || 5413 WideningDecision == CM_Widen_Reverse || 5414 WideningDecision == CM_Interleave); 5415 }; 5416 5417 5418 // Returns true if Ptr is the pointer operand of a memory access instruction 5419 // I, and I is known to not require scalarization. 5420 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5421 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5422 }; 5423 5424 // Holds a list of values which are known to have at least one uniform use. 5425 // Note that there may be other uses which aren't uniform. A "uniform use" 5426 // here is something which only demands lane 0 of the unrolled iterations; 5427 // it does not imply that all lanes produce the same value (e.g. this is not 5428 // the usual meaning of uniform) 5429 SetVector<Value *> HasUniformUse; 5430 5431 // Scan the loop for instructions which are either a) known to have only 5432 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5433 for (auto *BB : TheLoop->blocks()) 5434 for (auto &I : *BB) { 5435 // If there's no pointer operand, there's nothing to do. 5436 auto *Ptr = getLoadStorePointerOperand(&I); 5437 if (!Ptr) 5438 continue; 5439 5440 // A uniform memory op is itself uniform. We exclude uniform stores 5441 // here as they demand the last lane, not the first one. 5442 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5443 addToWorklistIfAllowed(&I); 5444 5445 if (isUniformDecision(&I, VF)) { 5446 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5447 HasUniformUse.insert(Ptr); 5448 } 5449 } 5450 5451 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5452 // demanding) users. Since loops are assumed to be in LCSSA form, this 5453 // disallows uses outside the loop as well. 5454 for (auto *V : HasUniformUse) { 5455 if (isOutOfScope(V)) 5456 continue; 5457 auto *I = cast<Instruction>(V); 5458 auto UsersAreMemAccesses = 5459 llvm::all_of(I->users(), [&](User *U) -> bool { 5460 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5461 }); 5462 if (UsersAreMemAccesses) 5463 addToWorklistIfAllowed(I); 5464 } 5465 5466 // Expand Worklist in topological order: whenever a new instruction 5467 // is added , its users should be already inside Worklist. It ensures 5468 // a uniform instruction will only be used by uniform instructions. 5469 unsigned idx = 0; 5470 while (idx != Worklist.size()) { 5471 Instruction *I = Worklist[idx++]; 5472 5473 for (auto OV : I->operand_values()) { 5474 // isOutOfScope operands cannot be uniform instructions. 5475 if (isOutOfScope(OV)) 5476 continue; 5477 // First order recurrence Phi's should typically be considered 5478 // non-uniform. 5479 auto *OP = dyn_cast<PHINode>(OV); 5480 if (OP && Legal->isFirstOrderRecurrence(OP)) 5481 continue; 5482 // If all the users of the operand are uniform, then add the 5483 // operand into the uniform worklist. 5484 auto *OI = cast<Instruction>(OV); 5485 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5486 auto *J = cast<Instruction>(U); 5487 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5488 })) 5489 addToWorklistIfAllowed(OI); 5490 } 5491 } 5492 5493 // For an instruction to be added into Worklist above, all its users inside 5494 // the loop should also be in Worklist. 
However, this condition cannot be 5495 // true for phi nodes that form a cyclic dependence. We must process phi 5496 // nodes separately. An induction variable will remain uniform if all users 5497 // of the induction variable and induction variable update remain uniform. 5498 // The code below handles both pointer and non-pointer induction variables. 5499 for (auto &Induction : Legal->getInductionVars()) { 5500 auto *Ind = Induction.first; 5501 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5502 5503 // Determine if all users of the induction variable are uniform after 5504 // vectorization. 5505 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5506 auto *I = cast<Instruction>(U); 5507 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5508 isVectorizedMemAccessUse(I, Ind); 5509 }); 5510 if (!UniformInd) 5511 continue; 5512 5513 // Determine if all users of the induction variable update instruction are 5514 // uniform after vectorization. 5515 auto UniformIndUpdate = 5516 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5517 auto *I = cast<Instruction>(U); 5518 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5519 isVectorizedMemAccessUse(I, IndUpdate); 5520 }); 5521 if (!UniformIndUpdate) 5522 continue; 5523 5524 // The induction variable and its update instruction will remain uniform. 5525 addToWorklistIfAllowed(Ind); 5526 addToWorklistIfAllowed(IndUpdate); 5527 } 5528 5529 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5530 } 5531 5532 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5533 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5534 5535 if (Legal->getRuntimePointerChecking()->Need) { 5536 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5537 "runtime pointer checks needed. Enable vectorization of this " 5538 "loop with '#pragma clang loop vectorize(enable)' when " 5539 "compiling with -Os/-Oz", 5540 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5541 return true; 5542 } 5543 5544 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5545 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5546 "runtime SCEV checks needed. Enable vectorization of this " 5547 "loop with '#pragma clang loop vectorize(enable)' when " 5548 "compiling with -Os/-Oz", 5549 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5550 return true; 5551 } 5552 5553 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5554 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5555 reportVectorizationFailure("Runtime stride check for small trip count", 5556 "runtime stride == 1 checks needed. Enable vectorization of " 5557 "this loop without such check by compiling with -Os/-Oz", 5558 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5559 return true; 5560 } 5561 5562 return false; 5563 } 5564 5565 Optional<ElementCount> 5566 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5567 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5568 // TODO: It may by useful to do since it's still likely to be dynamically 5569 // uniform if the target can skip. 5570 reportVectorizationFailure( 5571 "Not inserting runtime ptr check for divergent target", 5572 "runtime pointer checks needed. 
Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return None;
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return None;
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(TC, UserVF);
  case CM_ScalarEpilogueNotAllowedUsePredicate:
    LLVM_FALLTHROUGH;
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimizing
    // for size.
    if (runtimeChecksRequired())
      return None;

    break;
  }

  // The only loops we can vectorize without a scalar epilogue are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration. This will
  // require a lane mask which varies through the vector loop body. (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fall back to vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF);
    }
    return None;
  }

  // Now try tail folding.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
  assert(!MaxVF.isScalable() &&
         "Scalable vectors do not yet support tail folding");
  assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
         "MaxVF must be a power of 2");
  unsigned MaxVFtimesIC =
      UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we chose.
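  // Illustrative sketch of the check below (the numbers are assumptions, not
  // taken from any particular target): with a constant trip count of 1024 and
  // MaxVF * UserIC == 16, the exit count is an exact multiple (1024 % 16 == 0)
  // and no tail remains; with a trip count of 1000 the remainder is 8, so the
  // tail must be handled by folding or by a scalar epilogue.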
5652 ScalarEvolution *SE = PSE.getSE(); 5653 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5654 const SCEV *ExitCount = SE->getAddExpr( 5655 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5656 const SCEV *Rem = SE->getURemExpr( 5657 SE->applyLoopGuards(ExitCount, TheLoop), 5658 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5659 if (Rem->isZero()) { 5660 // Accept MaxVF if we do not have a tail. 5661 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5662 return MaxVF; 5663 } 5664 5665 // If we don't know the precise trip count, or if the trip count that we 5666 // found modulo the vectorization factor is not zero, try to fold the tail 5667 // by masking. 5668 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5669 if (Legal->prepareToFoldTailByMasking()) { 5670 FoldTailByMasking = true; 5671 return MaxVF; 5672 } 5673 5674 // If there was a tail-folding hint/switch, but we can't fold the tail by 5675 // masking, fallback to a vectorization with a scalar epilogue. 5676 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5677 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5678 "scalar epilogue instead.\n"); 5679 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5680 return MaxVF; 5681 } 5682 5683 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5684 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5685 return None; 5686 } 5687 5688 if (TC == 0) { 5689 reportVectorizationFailure( 5690 "Unable to calculate the loop count due to complex control flow", 5691 "unable to calculate the loop count due to complex control flow", 5692 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5693 return None; 5694 } 5695 5696 reportVectorizationFailure( 5697 "Cannot optimize for size and vectorize at the same time.", 5698 "cannot optimize for size and vectorize at the same time. " 5699 "Enable vectorization of this loop with '#pragma clang loop " 5700 "vectorize(enable)' when compiling with -Os/-Oz", 5701 "NoTailLoopWithOptForSize", ORE, TheLoop); 5702 return None; 5703 } 5704 5705 ElementCount 5706 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5707 ElementCount UserVF) { 5708 bool IgnoreScalableUserVF = UserVF.isScalable() && 5709 !TTI.supportsScalableVectors() && 5710 !ForceTargetSupportsScalableVectors; 5711 if (IgnoreScalableUserVF) { 5712 LLVM_DEBUG( 5713 dbgs() << "LV: Ignoring VF=" << UserVF 5714 << " because target does not support scalable vectors.\n"); 5715 ORE->emit([&]() { 5716 return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", 5717 TheLoop->getStartLoc(), 5718 TheLoop->getHeader()) 5719 << "Ignoring VF=" << ore::NV("UserVF", UserVF) 5720 << " because target does not support scalable vectors."; 5721 }); 5722 } 5723 5724 // Beyond this point two scenarios are handled. If UserVF isn't specified 5725 // then a suitable VF is chosen. If UserVF is specified and there are 5726 // dependencies, check if it's legal. However, if a UserVF is specified and 5727 // there are no dependencies, then there's nothing to do. 5728 if (UserVF.isNonZero() && !IgnoreScalableUserVF) { 5729 if (!canVectorizeReductions(UserVF)) { 5730 reportVectorizationFailure( 5731 "LV: Scalable vectorization not supported for the reduction " 5732 "operations found in this loop. 
Using fixed-width "
          "vectorization instead.",
          "Scalable vectorization not supported for the reduction operations "
          "found in this loop. Using fixed-width vectorization instead.",
          "ScalableVFUnfeasible", ORE, TheLoop);
      return computeFeasibleMaxVF(
          ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
    }

    if (Legal->isSafeForAnyVectorWidth())
      return UserVF;
  }

  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
  unsigned WidestRegister =
      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
          .getFixedSize();

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();

  // If the user vectorization factor is legally unsafe, clamp it to a safe
  // value. Otherwise, return as is.
  if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
    unsigned MaxSafeElements =
        PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
    ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);

    if (UserVF.isScalable()) {
      Optional<unsigned> MaxVScale = TTI.getMaxVScale();

      // Scale VF by vscale before checking if it's safe.
      MaxSafeVF = ElementCount::getScalable(
          MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);

      if (MaxSafeVF.isZero()) {
        // The dependence distance is too small to use scalable vectors;
        // fall back on fixed-width vectors.
        LLVM_DEBUG(
            dbgs()
            << "LV: Max legal vector width too small, scalable vectorization "
               "unfeasible. Using fixed-width vectorization instead.\n");
        ORE->emit([&]() {
          return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
                                            TheLoop->getStartLoc(),
                                            TheLoop->getHeader())
                 << "Max legal vector width too small, scalable vectorization "
                 << "unfeasible. Using fixed-width vectorization instead.";
        });
        return computeFeasibleMaxVF(
            ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
      }
    }

    LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");

    if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
      return UserVF;

    LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                      << " is unsafe, clamping to max safe VF=" << MaxSafeVF
                      << ".\n");
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                        TheLoop->getStartLoc(),
                                        TheLoop->getHeader())
             << "User-specified vectorization factor "
             << ore::NV("UserVectorizationFactor", UserVF)
             << " is unsafe, clamping to maximum safe vectorization factor "
             << ore::NV("VectorizationFactor", MaxSafeVF);
    });
    return MaxSafeVF;
  }

  WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that neither WidestRegister nor WidestType is guaranteed to be a
  // power of 2.
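  // Worked example of the computation below (register widths here are
  // illustrative assumptions, not target data): with a 128-bit widest register
  // and a 32-bit widest type, MaxVectorSize = PowerOf2Floor(128 / 32) = 4.
  // If the safe dependence distance clamps WidestRegister to 96 bits, then
  // MaxVectorSize = PowerOf2Floor(96 / 32) = PowerOf2Floor(3) = 2.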
5815 auto MaxVectorSize = 5816 ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType)); 5817 5818 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5819 << " / " << WidestType << " bits.\n"); 5820 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5821 << WidestRegister << " bits.\n"); 5822 5823 assert(MaxVectorSize.getFixedValue() <= WidestRegister && 5824 "Did not expect to pack so many elements" 5825 " into one vector!"); 5826 if (MaxVectorSize.getFixedValue() == 0) { 5827 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5828 return ElementCount::getFixed(1); 5829 } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() && 5830 isPowerOf2_32(ConstTripCount)) { 5831 // We need to clamp the VF to be the ConstTripCount. There is no point in 5832 // choosing a higher viable VF as done in the loop below. 5833 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5834 << ConstTripCount << "\n"); 5835 return ElementCount::getFixed(ConstTripCount); 5836 } 5837 5838 ElementCount MaxVF = MaxVectorSize; 5839 if (TTI.shouldMaximizeVectorBandwidth() || 5840 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5841 // Collect all viable vectorization factors larger than the default MaxVF 5842 // (i.e. MaxVectorSize). 5843 SmallVector<ElementCount, 8> VFs; 5844 auto MaxVectorSizeMaxBW = 5845 ElementCount::getFixed(WidestRegister / SmallestType); 5846 for (ElementCount VS = MaxVectorSize * 2; 5847 ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2) 5848 VFs.push_back(VS); 5849 5850 // For each VF calculate its register usage. 5851 auto RUs = calculateRegisterUsage(VFs); 5852 5853 // Select the largest VF which doesn't require more registers than existing 5854 // ones. 5855 for (int i = RUs.size() - 1; i >= 0; --i) { 5856 bool Selected = true; 5857 for (auto &pair : RUs[i].MaxLocalUsers) { 5858 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5859 if (pair.second > TargetNumRegisters) 5860 Selected = false; 5861 } 5862 if (Selected) { 5863 MaxVF = VFs[i]; 5864 break; 5865 } 5866 } 5867 if (ElementCount MinVF = 5868 TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) { 5869 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5870 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5871 << ") with target's minimum: " << MinVF << '\n'); 5872 MaxVF = MinVF; 5873 } 5874 } 5875 } 5876 return MaxVF; 5877 } 5878 5879 VectorizationFactor 5880 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 5881 // FIXME: This can be fixed for scalable vectors later, because at this stage 5882 // the LoopVectorizer will only consider vectorizing a loop with scalable 5883 // vectors when the loop has a hint to enable vectorization for a given VF. 5884 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 5885 5886 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5887 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5888 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5889 5890 auto Width = ElementCount::getFixed(1); 5891 const float ScalarCost = *ExpectedCost.getValue(); 5892 float Cost = ScalarCost; 5893 5894 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5895 if (ForceVectorization && MaxVF.isVector()) { 5896 // Ignore scalar width, because the user explicitly wants vectorization. 
5897 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5898 // evaluation. 5899 Cost = std::numeric_limits<float>::max(); 5900 } 5901 5902 for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF); 5903 i *= 2) { 5904 // Notice that the vector loop needs to be executed less times, so 5905 // we need to divide the cost of the vector loops by the width of 5906 // the vector elements. 5907 VectorizationCostTy C = expectedCost(i); 5908 assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); 5909 float VectorCost = *C.first.getValue() / (float)i.getFixedValue(); 5910 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5911 << " costs: " << (int)VectorCost << ".\n"); 5912 if (!C.second && !ForceVectorization) { 5913 LLVM_DEBUG( 5914 dbgs() << "LV: Not considering vector loop of width " << i 5915 << " because it will not generate any vector instructions.\n"); 5916 continue; 5917 } 5918 5919 // If profitable add it to ProfitableVF list. 5920 if (VectorCost < ScalarCost) { 5921 ProfitableVFs.push_back(VectorizationFactor( 5922 {i, (unsigned)VectorCost})); 5923 } 5924 5925 if (VectorCost < Cost) { 5926 Cost = VectorCost; 5927 Width = i; 5928 } 5929 } 5930 5931 if (!EnableCondStoresVectorization && NumPredStores) { 5932 reportVectorizationFailure("There are conditional stores.", 5933 "store that is conditionally executed prevents vectorization", 5934 "ConditionalStore", ORE, TheLoop); 5935 Width = ElementCount::getFixed(1); 5936 Cost = ScalarCost; 5937 } 5938 5939 LLVM_DEBUG(if (ForceVectorization && !Width.isScalar() && Cost >= ScalarCost) dbgs() 5940 << "LV: Vectorization seems to be not beneficial, " 5941 << "but was forced by a user.\n"); 5942 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5943 VectorizationFactor Factor = {Width, 5944 (unsigned)(Width.getKnownMinValue() * Cost)}; 5945 return Factor; 5946 } 5947 5948 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5949 const Loop &L, ElementCount VF) const { 5950 // Cross iteration phis such as reductions need special handling and are 5951 // currently unsupported. 5952 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5953 return Legal->isFirstOrderRecurrence(&Phi) || 5954 Legal->isReductionVariable(&Phi); 5955 })) 5956 return false; 5957 5958 // Phis with uses outside of the loop require special handling and are 5959 // currently unsupported. 5960 for (auto &Entry : Legal->getInductionVars()) { 5961 // Look for uses of the value of the induction at the last iteration. 5962 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5963 for (User *U : PostInc->users()) 5964 if (!L.contains(cast<Instruction>(U))) 5965 return false; 5966 // Look for uses of penultimate value of the induction. 5967 for (User *U : Entry.first->users()) 5968 if (!L.contains(cast<Instruction>(U))) 5969 return false; 5970 } 5971 5972 // Induction variables that are widened require special handling that is 5973 // currently not supported. 
5974 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5975 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5976 this->isProfitableToScalarize(Entry.first, VF)); 5977 })) 5978 return false; 5979 5980 return true; 5981 } 5982 5983 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5984 const ElementCount VF) const { 5985 // FIXME: We need a much better cost-model to take different parameters such 5986 // as register pressure, code size increase and cost of extra branches into 5987 // account. For now we apply a very crude heuristic and only consider loops 5988 // with vectorization factors larger than a certain value. 5989 // We also consider epilogue vectorization unprofitable for targets that don't 5990 // consider interleaving beneficial (eg. MVE). 5991 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5992 return false; 5993 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5994 return true; 5995 return false; 5996 } 5997 5998 VectorizationFactor 5999 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 6000 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 6001 VectorizationFactor Result = VectorizationFactor::Disabled(); 6002 if (!EnableEpilogueVectorization) { 6003 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 6004 return Result; 6005 } 6006 6007 if (!isScalarEpilogueAllowed()) { 6008 LLVM_DEBUG( 6009 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 6010 "allowed.\n";); 6011 return Result; 6012 } 6013 6014 // FIXME: This can be fixed for scalable vectors later, because at this stage 6015 // the LoopVectorizer will only consider vectorizing a loop with scalable 6016 // vectors when the loop has a hint to enable vectorization for a given VF. 6017 if (MainLoopVF.isScalable()) { 6018 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 6019 "yet supported.\n"); 6020 return Result; 6021 } 6022 6023 // Not really a cost consideration, but check for unsupported cases here to 6024 // simplify the logic. 
6025 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 6026 LLVM_DEBUG( 6027 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 6028 "not a supported candidate.\n";); 6029 return Result; 6030 } 6031 6032 if (EpilogueVectorizationForceVF > 1) { 6033 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 6034 if (LVP.hasPlanWithVFs( 6035 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 6036 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 6037 else { 6038 LLVM_DEBUG( 6039 dbgs() 6040 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 6041 return Result; 6042 } 6043 } 6044 6045 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6046 TheLoop->getHeader()->getParent()->hasMinSize()) { 6047 LLVM_DEBUG( 6048 dbgs() 6049 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6050 return Result; 6051 } 6052 6053 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6054 return Result; 6055 6056 for (auto &NextVF : ProfitableVFs) 6057 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6058 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && 6059 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6060 Result = NextVF; 6061 6062 if (Result != VectorizationFactor::Disabled()) 6063 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6064 << Result.Width.getFixedValue() << "\n";); 6065 return Result; 6066 } 6067 6068 std::pair<unsigned, unsigned> 6069 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6070 unsigned MinWidth = -1U; 6071 unsigned MaxWidth = 8; 6072 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6073 6074 // For each block. 6075 for (BasicBlock *BB : TheLoop->blocks()) { 6076 // For each instruction in the loop. 6077 for (Instruction &I : BB->instructionsWithoutDebug()) { 6078 Type *T = I.getType(); 6079 6080 // Skip ignored values. 6081 if (ValuesToIgnore.count(&I)) 6082 continue; 6083 6084 // Only examine Loads, Stores and PHINodes. 6085 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6086 continue; 6087 6088 // Examine PHI nodes that are reduction variables. Update the type to 6089 // account for the recurrence type. 6090 if (auto *PN = dyn_cast<PHINode>(&I)) { 6091 if (!Legal->isReductionVariable(PN)) 6092 continue; 6093 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 6094 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 6095 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6096 RdxDesc.getRecurrenceType(), 6097 TargetTransformInfo::ReductionFlags())) 6098 continue; 6099 T = RdxDesc.getRecurrenceType(); 6100 } 6101 6102 // Examine the stored values. 6103 if (auto *ST = dyn_cast<StoreInst>(&I)) 6104 T = ST->getValueOperand()->getType(); 6105 6106 // Ignore loaded pointer types and stored pointer types that are not 6107 // vectorizable. 6108 // 6109 // FIXME: The check here attempts to predict whether a load or store will 6110 // be vectorized. We only know this for certain after a VF has 6111 // been selected. Here, we assume that if an access can be 6112 // vectorized, it will be. We should also look at extending this 6113 // optimization to non-pointer types. 
6114 // 6115 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6116 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6117 continue; 6118 6119 MinWidth = std::min(MinWidth, 6120 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6121 MaxWidth = std::max(MaxWidth, 6122 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6123 } 6124 } 6125 6126 return {MinWidth, MaxWidth}; 6127 } 6128 6129 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6130 unsigned LoopCost) { 6131 // -- The interleave heuristics -- 6132 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6133 // There are many micro-architectural considerations that we can't predict 6134 // at this level. For example, frontend pressure (on decode or fetch) due to 6135 // code size, or the number and capabilities of the execution ports. 6136 // 6137 // We use the following heuristics to select the interleave count: 6138 // 1. If the code has reductions, then we interleave to break the cross 6139 // iteration dependency. 6140 // 2. If the loop is really small, then we interleave to reduce the loop 6141 // overhead. 6142 // 3. We don't interleave if we think that we will spill registers to memory 6143 // due to the increased register pressure. 6144 6145 if (!isScalarEpilogueAllowed()) 6146 return 1; 6147 6148 // We used the distance for the interleave count. 6149 if (Legal->getMaxSafeDepDistBytes() != -1U) 6150 return 1; 6151 6152 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6153 const bool HasReductions = !Legal->getReductionVars().empty(); 6154 // Do not interleave loops with a relatively small known or estimated trip 6155 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6156 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6157 // because with the above conditions interleaving can expose ILP and break 6158 // cross iteration dependences for reductions. 6159 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6160 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6161 return 1; 6162 6163 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6164 // We divide by these constants so assume that we have at least one 6165 // instruction that uses at least one register. 6166 for (auto& pair : R.MaxLocalUsers) { 6167 pair.second = std::max(pair.second, 1U); 6168 } 6169 6170 // We calculate the interleave count using the following formula. 6171 // Subtract the number of loop invariants from the number of available 6172 // registers. These registers are used by all of the interleaved instances. 6173 // Next, divide the remaining registers by the number of registers that is 6174 // required by the loop, in order to estimate how many parallel instances 6175 // fit without causing spills. All of this is rounded down if necessary to be 6176 // a power of two. We want power of two interleave count to simplify any 6177 // addressing operations or alignment considerations. 6178 // We also want power of two interleave counts to ensure that the induction 6179 // variable of the vector loop wraps to zero, when tail is folded by masking; 6180 // this currently happens when OptForSize, in which case IC is set to 1 above. 
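  // Illustrative example of the formula below, with made-up register counts:
  // given 16 available registers, 1 loop-invariant register and a maximum
  // local usage of 4 registers, IC = PowerOf2Floor((16 - 1) / 4) = 2. With
  // EnableIndVarRegisterHeur the induction variable is not counted, giving
  // PowerOf2Floor((16 - 1 - 1) / 3) = 4.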
6181 unsigned IC = UINT_MAX; 6182 6183 for (auto& pair : R.MaxLocalUsers) { 6184 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6185 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6186 << " registers of " 6187 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6188 if (VF.isScalar()) { 6189 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6190 TargetNumRegisters = ForceTargetNumScalarRegs; 6191 } else { 6192 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6193 TargetNumRegisters = ForceTargetNumVectorRegs; 6194 } 6195 unsigned MaxLocalUsers = pair.second; 6196 unsigned LoopInvariantRegs = 0; 6197 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6198 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6199 6200 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6201 // Don't count the induction variable as interleaved. 6202 if (EnableIndVarRegisterHeur) { 6203 TmpIC = 6204 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6205 std::max(1U, (MaxLocalUsers - 1))); 6206 } 6207 6208 IC = std::min(IC, TmpIC); 6209 } 6210 6211 // Clamp the interleave ranges to reasonable counts. 6212 unsigned MaxInterleaveCount = 6213 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6214 6215 // Check if the user has overridden the max. 6216 if (VF.isScalar()) { 6217 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6218 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6219 } else { 6220 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6221 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6222 } 6223 6224 // If trip count is known or estimated compile time constant, limit the 6225 // interleave count to be less than the trip count divided by VF, provided it 6226 // is at least 1. 6227 // 6228 // For scalable vectors we can't know if interleaving is beneficial. It may 6229 // not be beneficial for small loops if none of the lanes in the second vector 6230 // iterations is enabled. However, for larger loops, there is likely to be a 6231 // similar benefit as for fixed-width vectors. For now, we choose to leave 6232 // the InterleaveCount as if vscale is '1', although if some information about 6233 // the vector is known (e.g. min vector size), we can make a better decision. 6234 if (BestKnownTC) { 6235 MaxInterleaveCount = 6236 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6237 // Make sure MaxInterleaveCount is greater than 0. 6238 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6239 } 6240 6241 assert(MaxInterleaveCount > 0 && 6242 "Maximum interleave count must be greater than 0"); 6243 6244 // Clamp the calculated IC to be between the 1 and the max interleave count 6245 // that the target and trip count allows. 6246 if (IC > MaxInterleaveCount) 6247 IC = MaxInterleaveCount; 6248 else 6249 // Make sure IC is greater than 0. 6250 IC = std::max(1u, IC); 6251 6252 assert(IC > 0 && "Interleave count must be greater than 0."); 6253 6254 // If we did not calculate the cost for VF (because the user selected the VF) 6255 // then we calculate the cost of VF here. 6256 if (LoopCost == 0) { 6257 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6258 LoopCost = *expectedCost(VF).first.getValue(); 6259 } 6260 6261 assert(LoopCost && "Non-zero loop cost expected"); 6262 6263 // Interleave if we vectorized this loop and there is a reduction that could 6264 // benefit from interleaving. 
6265 if (VF.isVector() && HasReductions) { 6266 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6267 return IC; 6268 } 6269 6270 // Note that if we've already vectorized the loop we will have done the 6271 // runtime check and so interleaving won't require further checks. 6272 bool InterleavingRequiresRuntimePointerCheck = 6273 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6274 6275 // We want to interleave small loops in order to reduce the loop overhead and 6276 // potentially expose ILP opportunities. 6277 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6278 << "LV: IC is " << IC << '\n' 6279 << "LV: VF is " << VF << '\n'); 6280 const bool AggressivelyInterleaveReductions = 6281 TTI.enableAggressiveInterleaving(HasReductions); 6282 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6283 // We assume that the cost overhead is 1 and we use the cost model 6284 // to estimate the cost of the loop and interleave until the cost of the 6285 // loop overhead is about 5% of the cost of the loop. 6286 unsigned SmallIC = 6287 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6288 6289 // Interleave until store/load ports (estimated by max interleave count) are 6290 // saturated. 6291 unsigned NumStores = Legal->getNumStores(); 6292 unsigned NumLoads = Legal->getNumLoads(); 6293 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6294 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6295 6296 // If we have a scalar reduction (vector reductions are already dealt with 6297 // by this point), we can increase the critical path length if the loop 6298 // we're interleaving is inside another loop. Limit, by default to 2, so the 6299 // critical path only gets increased by one reduction operation. 6300 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6301 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6302 SmallIC = std::min(SmallIC, F); 6303 StoresIC = std::min(StoresIC, F); 6304 LoadsIC = std::min(LoadsIC, F); 6305 } 6306 6307 if (EnableLoadStoreRuntimeInterleave && 6308 std::max(StoresIC, LoadsIC) > SmallIC) { 6309 LLVM_DEBUG( 6310 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6311 return std::max(StoresIC, LoadsIC); 6312 } 6313 6314 // If there are scalar reductions and TTI has enabled aggressive 6315 // interleaving for reductions, we will interleave to expose ILP. 6316 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6317 AggressivelyInterleaveReductions) { 6318 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6319 // Interleave no less than SmallIC but not as aggressive as the normal IC 6320 // to satisfy the rare situation when resources are too limited. 6321 return std::max(IC / 2, SmallIC); 6322 } else { 6323 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6324 return SmallIC; 6325 } 6326 } 6327 6328 // Interleave if this is a large loop (small loops are already dealt with by 6329 // this point) that could benefit from interleaving. 6330 if (AggressivelyInterleaveReductions) { 6331 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6332 return IC; 6333 } 6334 6335 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6336 return 1; 6337 } 6338 6339 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6340 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6341 // This function calculates the register usage by measuring the highest number 6342 // of values that are alive at a single location. 
Obviously, this is a very
  // rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi-map that
  // holds the list of intervals that *end* at a specific location. This
  // multi-map allows us to perform a linear search. We scan the instructions
  // linearly and record each time that a new interval starts, by placing it in
  // a set. If we find this value in the multi-map then we remove it from the
  // set. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take up
  // more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps an instruction index to the instruction.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instructions that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Interval.first);

  SmallPtrSet<Instruction *, 8> OpenIntervals;
  SmallVector<RegisterUsage, 8> RUs(VFs.size());
  SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  // A lambda that gets the register usage for the given type and VF.
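  // As a sketch (assuming a target whose vector registers are 128 bits wide),
  // an i64 element type at VF=4 forms a <4 x i64> value, which such a target
  // would report as occupying 2 registers; token types and other invalid
  // vector element types are counted as 0.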
6418 const auto &TTICapture = TTI; 6419 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6420 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6421 return 0U; 6422 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6423 }; 6424 6425 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6426 Instruction *I = IdxToInstr[i]; 6427 6428 // Remove all of the instructions that end at this location. 6429 InstrList &List = TransposeEnds[i]; 6430 for (Instruction *ToRemove : List) 6431 OpenIntervals.erase(ToRemove); 6432 6433 // Ignore instructions that are never used within the loop. 6434 if (!Ends.count(I)) 6435 continue; 6436 6437 // Skip ignored values. 6438 if (ValuesToIgnore.count(I)) 6439 continue; 6440 6441 // For each VF find the maximum usage of registers. 6442 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6443 // Count the number of live intervals. 6444 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6445 6446 if (VFs[j].isScalar()) { 6447 for (auto Inst : OpenIntervals) { 6448 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6449 if (RegUsage.find(ClassID) == RegUsage.end()) 6450 RegUsage[ClassID] = 1; 6451 else 6452 RegUsage[ClassID] += 1; 6453 } 6454 } else { 6455 collectUniformsAndScalars(VFs[j]); 6456 for (auto Inst : OpenIntervals) { 6457 // Skip ignored values for VF > 1. 6458 if (VecValuesToIgnore.count(Inst)) 6459 continue; 6460 if (isScalarAfterVectorization(Inst, VFs[j])) { 6461 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6462 if (RegUsage.find(ClassID) == RegUsage.end()) 6463 RegUsage[ClassID] = 1; 6464 else 6465 RegUsage[ClassID] += 1; 6466 } else { 6467 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6468 if (RegUsage.find(ClassID) == RegUsage.end()) 6469 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6470 else 6471 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6472 } 6473 } 6474 } 6475 6476 for (auto& pair : RegUsage) { 6477 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6478 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6479 else 6480 MaxUsages[j][pair.first] = pair.second; 6481 } 6482 } 6483 6484 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6485 << OpenIntervals.size() << '\n'); 6486 6487 // Add the current instruction to the list of open intervals. 6488 OpenIntervals.insert(I); 6489 } 6490 6491 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6492 SmallMapVector<unsigned, unsigned, 4> Invariant; 6493 6494 for (auto Inst : LoopInvariants) { 6495 unsigned Usage = 6496 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6497 unsigned ClassID = 6498 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6499 if (Invariant.find(ClassID) == Invariant.end()) 6500 Invariant[ClassID] = Usage; 6501 else 6502 Invariant[ClassID] += Usage; 6503 } 6504 6505 LLVM_DEBUG({ 6506 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6507 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6508 << " item\n"; 6509 for (const auto &pair : MaxUsages[i]) { 6510 dbgs() << "LV(REG): RegisterClass: " 6511 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6512 << " registers\n"; 6513 } 6514 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6515 << " item\n"; 6516 for (const auto &pair : Invariant) { 6517 dbgs() << "LV(REG): RegisterClass: " 6518 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6519 << " registers\n"; 6520 } 6521 }); 6522 6523 RU.LoopInvariantRegs = Invariant; 6524 RU.MaxLocalUsers = MaxUsages[i]; 6525 RUs[i] = RU; 6526 } 6527 6528 return RUs; 6529 } 6530 6531 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6532 // TODO: Cost model for emulated masked load/store is completely 6533 // broken. This hack guides the cost model to use an artificially 6534 // high enough value to practically disable vectorization with such 6535 // operations, except where previously deployed legality hack allowed 6536 // using very low cost values. This is to avoid regressions coming simply 6537 // from moving "masked load/store" check from legality to cost model. 6538 // Masked Load/Gather emulation was previously never allowed. 6539 // Limited number of Masked Store/Scatter emulation was allowed. 6540 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 6541 return isa<LoadInst>(I) || 6542 (isa<StoreInst>(I) && 6543 NumPredStores > NumberOfStoresToPredicate); 6544 } 6545 6546 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6547 // If we aren't vectorizing the loop, or if we've already collected the 6548 // instructions to scalarize, there's nothing to do. Collection may already 6549 // have occurred if we have a user-selected VF and are now computing the 6550 // expected cost for interleaving. 6551 if (VF.isScalar() || VF.isZero() || 6552 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6553 return; 6554 6555 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6556 // not profitable to scalarize any instructions, the presence of VF in the 6557 // map will indicate that we've analyzed it already. 6558 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6559 6560 // Find all the instructions that are scalar with predication in the loop and 6561 // determine if it would be better to not if-convert the blocks they are in. 6562 // If so, we also record the instructions to scalarize. 6563 for (BasicBlock *BB : TheLoop->blocks()) { 6564 if (!blockNeedsPredication(BB)) 6565 continue; 6566 for (Instruction &I : *BB) 6567 if (isScalarWithPredication(&I)) { 6568 ScalarCostsTy ScalarCosts; 6569 // Do not apply discount logic if hacked cost is needed 6570 // for emulated masked memrefs. 6571 if (!useEmulatedMaskMemRefHack(&I) && 6572 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6573 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6574 // Remember that BB will remain after vectorization. 
6575 PredicatedBBsAfterVectorization.insert(BB); 6576 } 6577 } 6578 } 6579 6580 int LoopVectorizationCostModel::computePredInstDiscount( 6581 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6582 assert(!isUniformAfterVectorization(PredInst, VF) && 6583 "Instruction marked uniform-after-vectorization will be predicated"); 6584 6585 // Initialize the discount to zero, meaning that the scalar version and the 6586 // vector version cost the same. 6587 InstructionCost Discount = 0; 6588 6589 // Holds instructions to analyze. The instructions we visit are mapped in 6590 // ScalarCosts. Those instructions are the ones that would be scalarized if 6591 // we find that the scalar version costs less. 6592 SmallVector<Instruction *, 8> Worklist; 6593 6594 // Returns true if the given instruction can be scalarized. 6595 auto canBeScalarized = [&](Instruction *I) -> bool { 6596 // We only attempt to scalarize instructions forming a single-use chain 6597 // from the original predicated block that would otherwise be vectorized. 6598 // Although not strictly necessary, we give up on instructions we know will 6599 // already be scalar to avoid traversing chains that are unlikely to be 6600 // beneficial. 6601 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6602 isScalarAfterVectorization(I, VF)) 6603 return false; 6604 6605 // If the instruction is scalar with predication, it will be analyzed 6606 // separately. We ignore it within the context of PredInst. 6607 if (isScalarWithPredication(I)) 6608 return false; 6609 6610 // If any of the instruction's operands are uniform after vectorization, 6611 // the instruction cannot be scalarized. This prevents, for example, a 6612 // masked load from being scalarized. 6613 // 6614 // We assume we will only emit a value for lane zero of an instruction 6615 // marked uniform after vectorization, rather than VF identical values. 6616 // Thus, if we scalarize an instruction that uses a uniform, we would 6617 // create uses of values corresponding to the lanes we aren't emitting code 6618 // for. This behavior can be changed by allowing getScalarValue to clone 6619 // the lane zero values for uniforms rather than asserting. 6620 for (Use &U : I->operands()) 6621 if (auto *J = dyn_cast<Instruction>(U.get())) 6622 if (isUniformAfterVectorization(J, VF)) 6623 return false; 6624 6625 // Otherwise, we can scalarize the instruction. 6626 return true; 6627 }; 6628 6629 // Compute the expected cost discount from scalarizing the entire expression 6630 // feeding the predicated instruction. We currently only consider expressions 6631 // that are single-use instruction chains. 6632 Worklist.push_back(PredInst); 6633 while (!Worklist.empty()) { 6634 Instruction *I = Worklist.pop_back_val(); 6635 6636 // If we've already analyzed the instruction, there's nothing to do. 6637 if (ScalarCosts.find(I) != ScalarCosts.end()) 6638 continue; 6639 6640 // Compute the cost of the vector instruction. Note that this cost already 6641 // includes the scalarization overhead of the predicated instruction. 6642 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6643 6644 // Compute the cost of the scalarized instruction. This cost is the cost of 6645 // the instruction as if it wasn't if-converted and instead remained in the 6646 // predicated block. We will scale this cost by block probability after 6647 // computing the scalarization overhead. 
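  // For instance (an illustrative sketch, not target data): at VF=4, an
  // instruction whose scalar cost is 1 starts from a scalarized cost of
  // 4 * 1 = 4; insertelement/phi overheads are then added before the whole
  // sum is divided by the reciprocal block probability.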
6648 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6649 InstructionCost ScalarCost = 6650 VF.getKnownMinValue() * 6651 getInstructionCost(I, ElementCount::getFixed(1)).first; 6652 6653 // Compute the scalarization overhead of needed insertelement instructions 6654 // and phi nodes. 6655 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6656 ScalarCost += TTI.getScalarizationOverhead( 6657 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6658 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6659 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6660 ScalarCost += 6661 VF.getKnownMinValue() * 6662 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6663 } 6664 6665 // Compute the scalarization overhead of needed extractelement 6666 // instructions. For each of the instruction's operands, if the operand can 6667 // be scalarized, add it to the worklist; otherwise, account for the 6668 // overhead. 6669 for (Use &U : I->operands()) 6670 if (auto *J = dyn_cast<Instruction>(U.get())) { 6671 assert(VectorType::isValidElementType(J->getType()) && 6672 "Instruction has non-scalar type"); 6673 if (canBeScalarized(J)) 6674 Worklist.push_back(J); 6675 else if (needsExtract(J, VF)) { 6676 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6677 ScalarCost += TTI.getScalarizationOverhead( 6678 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6679 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6680 } 6681 } 6682 6683 // Scale the total scalar cost by block probability. 6684 ScalarCost /= getReciprocalPredBlockProb(); 6685 6686 // Compute the discount. A non-negative discount means the vector version 6687 // of the instruction costs more, and scalarizing would be beneficial. 6688 Discount += VectorCost - ScalarCost; 6689 ScalarCosts[I] = ScalarCost; 6690 } 6691 6692 return *Discount.getValue(); 6693 } 6694 6695 LoopVectorizationCostModel::VectorizationCostTy 6696 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6697 VectorizationCostTy Cost; 6698 6699 // For each block. 6700 for (BasicBlock *BB : TheLoop->blocks()) { 6701 VectorizationCostTy BlockCost; 6702 6703 // For each instruction in the old loop. 6704 for (Instruction &I : BB->instructionsWithoutDebug()) { 6705 // Skip ignored values. 6706 if (ValuesToIgnore.count(&I) || 6707 (VF.isVector() && VecValuesToIgnore.count(&I))) 6708 continue; 6709 6710 VectorizationCostTy C = getInstructionCost(&I, VF); 6711 6712 // Check if we should override the cost. 6713 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6714 C.first = InstructionCost(ForceTargetInstructionCost); 6715 6716 BlockCost.first += C.first; 6717 BlockCost.second |= C.second; 6718 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6719 << " for VF " << VF << " For instruction: " << I 6720 << '\n'); 6721 } 6722 6723 // If we are vectorizing a predicated block, it will have been 6724 // if-converted. This means that the block's instructions (aside from 6725 // stores and instructions that may divide by zero) will now be 6726 // unconditionally executed. For the scalar case, we may not always execute 6727 // the predicated block, if it is an if-else block. Thus, scale the block's 6728 // cost by the probability of executing it. blockNeedsPredication from 6729 // Legal is used so as to not include all blocks in tail folded loops. 
6730 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6731 BlockCost.first /= getReciprocalPredBlockProb(); 6732 6733 Cost.first += BlockCost.first; 6734 Cost.second |= BlockCost.second; 6735 } 6736 6737 return Cost; 6738 } 6739 6740 /// Gets Address Access SCEV after verifying that the access pattern 6741 /// is loop invariant except the induction variable dependence. 6742 /// 6743 /// This SCEV can be sent to the Target in order to estimate the address 6744 /// calculation cost. 6745 static const SCEV *getAddressAccessSCEV( 6746 Value *Ptr, 6747 LoopVectorizationLegality *Legal, 6748 PredicatedScalarEvolution &PSE, 6749 const Loop *TheLoop) { 6750 6751 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6752 if (!Gep) 6753 return nullptr; 6754 6755 // We are looking for a gep with all loop invariant indices except for one 6756 // which should be an induction variable. 6757 auto SE = PSE.getSE(); 6758 unsigned NumOperands = Gep->getNumOperands(); 6759 for (unsigned i = 1; i < NumOperands; ++i) { 6760 Value *Opd = Gep->getOperand(i); 6761 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6762 !Legal->isInductionVariable(Opd)) 6763 return nullptr; 6764 } 6765 6766 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6767 return PSE.getSCEV(Ptr); 6768 } 6769 6770 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6771 return Legal->hasStride(I->getOperand(0)) || 6772 Legal->hasStride(I->getOperand(1)); 6773 } 6774 6775 InstructionCost 6776 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6777 ElementCount VF) { 6778 assert(VF.isVector() && 6779 "Scalarization cost of instruction implies vectorization."); 6780 if (VF.isScalable()) 6781 return InstructionCost::getInvalid(); 6782 6783 Type *ValTy = getMemInstValueType(I); 6784 auto SE = PSE.getSE(); 6785 6786 unsigned AS = getLoadStoreAddressSpace(I); 6787 Value *Ptr = getLoadStorePointerOperand(I); 6788 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6789 6790 // Figure out whether the access is strided and get the stride value 6791 // if it's known in compile time 6792 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6793 6794 // Get the cost of the scalar memory instruction and address computation. 6795 InstructionCost Cost = 6796 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6797 6798 // Don't pass *I here, since it is scalar but will actually be part of a 6799 // vectorized loop where the user of it is a vectorized instruction. 6800 const Align Alignment = getLoadStoreAlignment(I); 6801 Cost += VF.getKnownMinValue() * 6802 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6803 AS, TTI::TCK_RecipThroughput); 6804 6805 // Get the overhead of the extractelement and insertelement instructions 6806 // we might create due to scalarization. 6807 Cost += getScalarizationOverhead(I, VF); 6808 6809 // If we have a predicated load/store, it will need extra i1 extracts and 6810 // conditional branches, but may not be executed for each vector lane. Scale 6811 // the cost by the probability of executing the predicated block. 
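  // As a rough illustration (assuming the default reciprocal block
  // probability of 2, i.e. the block is expected to execute on about half of
  // the iterations): a scalarized cost of 8 is scaled down to 4 before the
  // i1 extract and branch costs are added back in.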
6812 if (isPredicatedInst(I)) { 6813 Cost /= getReciprocalPredBlockProb(); 6814 6815 // Add the cost of an i1 extract and a branch 6816 auto *Vec_i1Ty = 6817 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6818 Cost += TTI.getScalarizationOverhead( 6819 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6820 /*Insert=*/false, /*Extract=*/true); 6821 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6822 6823 if (useEmulatedMaskMemRefHack(I)) 6824 // Artificially setting to a high enough value to practically disable 6825 // vectorization with such operations. 6826 Cost = 3000000; 6827 } 6828 6829 return Cost; 6830 } 6831 6832 InstructionCost 6833 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6834 ElementCount VF) { 6835 Type *ValTy = getMemInstValueType(I); 6836 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6837 Value *Ptr = getLoadStorePointerOperand(I); 6838 unsigned AS = getLoadStoreAddressSpace(I); 6839 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6840 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6841 6842 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6843 "Stride should be 1 or -1 for consecutive memory access"); 6844 const Align Alignment = getLoadStoreAlignment(I); 6845 InstructionCost Cost = 0; 6846 if (Legal->isMaskRequired(I)) 6847 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6848 CostKind); 6849 else 6850 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6851 CostKind, I); 6852 6853 bool Reverse = ConsecutiveStride < 0; 6854 if (Reverse) 6855 Cost += 6856 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6857 return Cost; 6858 } 6859 6860 InstructionCost 6861 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6862 ElementCount VF) { 6863 assert(Legal->isUniformMemOp(*I)); 6864 6865 Type *ValTy = getMemInstValueType(I); 6866 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6867 const Align Alignment = getLoadStoreAlignment(I); 6868 unsigned AS = getLoadStoreAddressSpace(I); 6869 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6870 if (isa<LoadInst>(I)) { 6871 return TTI.getAddressComputationCost(ValTy) + 6872 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6873 CostKind) + 6874 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6875 } 6876 StoreInst *SI = cast<StoreInst>(I); 6877 6878 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6879 return TTI.getAddressComputationCost(ValTy) + 6880 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6881 CostKind) + 6882 (isLoopInvariantStoreValue 6883 ? 
0 6884 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6885 VF.getKnownMinValue() - 1)); 6886 } 6887 6888 InstructionCost 6889 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6890 ElementCount VF) { 6891 Type *ValTy = getMemInstValueType(I); 6892 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6893 const Align Alignment = getLoadStoreAlignment(I); 6894 const Value *Ptr = getLoadStorePointerOperand(I); 6895 6896 return TTI.getAddressComputationCost(VectorTy) + 6897 TTI.getGatherScatterOpCost( 6898 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6899 TargetTransformInfo::TCK_RecipThroughput, I); 6900 } 6901 6902 InstructionCost 6903 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6904 ElementCount VF) { 6905 // TODO: Once we have support for interleaving with scalable vectors 6906 // we can calculate the cost properly here. 6907 if (VF.isScalable()) 6908 return InstructionCost::getInvalid(); 6909 6910 Type *ValTy = getMemInstValueType(I); 6911 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6912 unsigned AS = getLoadStoreAddressSpace(I); 6913 6914 auto Group = getInterleavedAccessGroup(I); 6915 assert(Group && "Fail to get an interleaved access group."); 6916 6917 unsigned InterleaveFactor = Group->getFactor(); 6918 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6919 6920 // Holds the indices of existing members in an interleaved load group. 6921 // An interleaved store group doesn't need this as it doesn't allow gaps. 6922 SmallVector<unsigned, 4> Indices; 6923 if (isa<LoadInst>(I)) { 6924 for (unsigned i = 0; i < InterleaveFactor; i++) 6925 if (Group->getMember(i)) 6926 Indices.push_back(i); 6927 } 6928 6929 // Calculate the cost of the whole interleaved group. 6930 bool UseMaskForGaps = 6931 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6932 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6933 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6934 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6935 6936 if (Group->isReverse()) { 6937 // TODO: Add support for reversed masked interleaved access. 6938 assert(!Legal->isMaskRequired(I) && 6939 "Reverse masked interleaved access not supported."); 6940 Cost += 6941 Group->getNumMembers() * 6942 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6943 } 6944 return Cost; 6945 } 6946 6947 InstructionCost LoopVectorizationCostModel::getReductionPatternCost( 6948 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6949 // Early exit for no inloop reductions 6950 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6951 return InstructionCost::getInvalid(); 6952 auto *VectorTy = cast<VectorType>(Ty); 6953 6954 // We are looking for a pattern of, and finding the minimal acceptable cost: 6955 // reduce(mul(ext(A), ext(B))) or 6956 // reduce(mul(A, B)) or 6957 // reduce(ext(A)) or 6958 // reduce(A). 6959 // The basic idea is that we walk down the tree to do that, finding the root 6960 // reduction instruction in InLoopReductionImmediateChains. From there we find 6961 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6962 // of the components. If the reduction cost is lower then we return it for the 6963 // reduction instruction and 0 for the other instructions in the pattern. If 6964 // it is not we return an invalid cost specifying the orignal cost method 6965 // should be used. 
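  // As a purely illustrative example, a chain such as
  //   %a.ext = sext i8 %a to i32
  //   %b.ext = sext i8 %b to i32
  //   %mul   = mul i32 %a.ext, %b.ext
  //   %sum   = add i32 %phi, %mul
  // matches reduce(mul(ext(A), ext(B))) and can be costed as a single
  // extended multiply-accumulate reduction if the target reports that as
  // cheaper than the sum of the individual instruction costs.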
6966 Instruction *RetI = I; 6967 if ((RetI->getOpcode() == Instruction::SExt || 6968 RetI->getOpcode() == Instruction::ZExt)) { 6969 if (!RetI->hasOneUser()) 6970 return InstructionCost::getInvalid(); 6971 RetI = RetI->user_back(); 6972 } 6973 if (RetI->getOpcode() == Instruction::Mul && 6974 RetI->user_back()->getOpcode() == Instruction::Add) { 6975 if (!RetI->hasOneUser()) 6976 return InstructionCost::getInvalid(); 6977 RetI = RetI->user_back(); 6978 } 6979 6980 // Test if the found instruction is a reduction, and if not return an invalid 6981 // cost specifying the parent to use the original cost modelling. 6982 if (!InLoopReductionImmediateChains.count(RetI)) 6983 return InstructionCost::getInvalid(); 6984 6985 // Find the reduction this chain is a part of and calculate the basic cost of 6986 // the reduction on its own. 6987 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6988 Instruction *ReductionPhi = LastChain; 6989 while (!isa<PHINode>(ReductionPhi)) 6990 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6991 6992 RecurrenceDescriptor RdxDesc = 6993 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 6994 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6995 RdxDesc.getOpcode(), VectorTy, false, CostKind); 6996 6997 // Get the operand that was not the reduction chain and match it to one of the 6998 // patterns, returning the better cost if it is found. 6999 Instruction *RedOp = RetI->getOperand(1) == LastChain 7000 ? dyn_cast<Instruction>(RetI->getOperand(0)) 7001 : dyn_cast<Instruction>(RetI->getOperand(1)); 7002 7003 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 7004 7005 if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && 7006 !TheLoop->isLoopInvariant(RedOp)) { 7007 bool IsUnsigned = isa<ZExtInst>(RedOp); 7008 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7009 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7010 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7011 CostKind); 7012 7013 InstructionCost ExtCost = 7014 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7015 TTI::CastContextHint::None, CostKind, RedOp); 7016 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7017 return I == RetI ? *RedCost.getValue() : 0; 7018 } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { 7019 Instruction *Mul = RedOp; 7020 Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); 7021 Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); 7022 if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && 7023 Op0->getOpcode() == Op1->getOpcode() && 7024 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7025 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7026 bool IsUnsigned = isa<ZExtInst>(Op0); 7027 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7028 // reduce(mul(ext, ext)) 7029 InstructionCost ExtCost = 7030 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 7031 TTI::CastContextHint::None, CostKind, Op0); 7032 InstructionCost MulCost = 7033 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7034 7035 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7036 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7037 CostKind); 7038 7039 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 7040 return I == RetI ? 
*RedCost.getValue() : 0; 7041 } else { 7042 InstructionCost MulCost = 7043 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7044 7045 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7046 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7047 CostKind); 7048 7049 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7050 return I == RetI ? *RedCost.getValue() : 0; 7051 } 7052 } 7053 7054 return I == RetI ? BaseCost : InstructionCost::getInvalid(); 7055 } 7056 7057 InstructionCost 7058 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7059 ElementCount VF) { 7060 // Calculate scalar cost only. Vectorization cost should be ready at this 7061 // moment. 7062 if (VF.isScalar()) { 7063 Type *ValTy = getMemInstValueType(I); 7064 const Align Alignment = getLoadStoreAlignment(I); 7065 unsigned AS = getLoadStoreAddressSpace(I); 7066 7067 return TTI.getAddressComputationCost(ValTy) + 7068 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7069 TTI::TCK_RecipThroughput, I); 7070 } 7071 return getWideningCost(I, VF); 7072 } 7073 7074 LoopVectorizationCostModel::VectorizationCostTy 7075 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7076 ElementCount VF) { 7077 // If we know that this instruction will remain uniform, check the cost of 7078 // the scalar version. 7079 if (isUniformAfterVectorization(I, VF)) 7080 VF = ElementCount::getFixed(1); 7081 7082 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7083 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7084 7085 // Forced scalars do not have any scalarization overhead. 7086 auto ForcedScalar = ForcedScalars.find(VF); 7087 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7088 auto InstSet = ForcedScalar->second; 7089 if (InstSet.count(I)) 7090 return VectorizationCostTy( 7091 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7092 VF.getKnownMinValue()), 7093 false); 7094 } 7095 7096 Type *VectorTy; 7097 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7098 7099 bool TypeNotScalarized = 7100 VF.isVector() && VectorTy->isVectorTy() && 7101 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7102 return VectorizationCostTy(C, TypeNotScalarized); 7103 } 7104 7105 InstructionCost 7106 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7107 ElementCount VF) const { 7108 7109 if (VF.isScalable()) 7110 return InstructionCost::getInvalid(); 7111 7112 if (VF.isScalar()) 7113 return 0; 7114 7115 InstructionCost Cost = 0; 7116 Type *RetTy = ToVectorTy(I->getType(), VF); 7117 if (!RetTy->isVoidTy() && 7118 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7119 Cost += TTI.getScalarizationOverhead( 7120 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7121 true, false); 7122 7123 // Some targets keep addresses scalar. 7124 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7125 return Cost; 7126 7127 // Some targets support efficient element stores. 7128 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7129 return Cost; 7130 7131 // Collect operands to consider. 7132 CallInst *CI = dyn_cast<CallInst>(I); 7133 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7134 7135 // Skip operands that do not require extraction/scalarization and do not incur 7136 // any overhead. 
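  // Each remaining operand may need one extractelement per lane to feed the
  // scalarized instruction; the TTI hook below estimates the cost of those
  // extracts.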
7137 SmallVector<Type *> Tys; 7138 for (auto *V : filterExtractingOperands(Ops, VF)) 7139 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7140 return Cost + TTI.getOperandsScalarizationOverhead( 7141 filterExtractingOperands(Ops, VF), Tys); 7142 } 7143 7144 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7145 if (VF.isScalar()) 7146 return; 7147 NumPredStores = 0; 7148 for (BasicBlock *BB : TheLoop->blocks()) { 7149 // For each instruction in the old loop. 7150 for (Instruction &I : *BB) { 7151 Value *Ptr = getLoadStorePointerOperand(&I); 7152 if (!Ptr) 7153 continue; 7154 7155 // TODO: We should generate better code and update the cost model for 7156 // predicated uniform stores. Today they are treated as any other 7157 // predicated store (see added test cases in 7158 // invariant-store-vectorization.ll). 7159 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7160 NumPredStores++; 7161 7162 if (Legal->isUniformMemOp(I)) { 7163 // TODO: Avoid replicating loads and stores instead of 7164 // relying on instcombine to remove them. 7165 // Load: Scalar load + broadcast 7166 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7167 InstructionCost Cost = getUniformMemOpCost(&I, VF); 7168 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7169 continue; 7170 } 7171 7172 // We assume that widening is the best solution when possible. 7173 if (memoryInstructionCanBeWidened(&I, VF)) { 7174 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7175 int ConsecutiveStride = 7176 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7177 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7178 "Expected consecutive stride."); 7179 InstWidening Decision = 7180 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7181 setWideningDecision(&I, VF, Decision, Cost); 7182 continue; 7183 } 7184 7185 // Choose between Interleaving, Gather/Scatter or Scalarization. 7186 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7187 unsigned NumAccesses = 1; 7188 if (isAccessInterleaved(&I)) { 7189 auto Group = getInterleavedAccessGroup(&I); 7190 assert(Group && "Fail to get an interleaved access group."); 7191 7192 // Make one decision for the whole group. 7193 if (getWideningDecision(&I, VF) != CM_Unknown) 7194 continue; 7195 7196 NumAccesses = Group->getNumMembers(); 7197 if (interleavedAccessCanBeWidened(&I, VF)) 7198 InterleaveCost = getInterleaveGroupCost(&I, VF); 7199 } 7200 7201 InstructionCost GatherScatterCost = 7202 isLegalGatherOrScatter(&I) 7203 ? getGatherScatterCost(&I, VF) * NumAccesses 7204 : InstructionCost::getInvalid(); 7205 7206 InstructionCost ScalarizationCost = 7207 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7208 7209 // Choose better solution for the current VF, 7210 // write down this decision and use it during vectorization. 7211 InstructionCost Cost; 7212 InstWidening Decision; 7213 if (InterleaveCost <= GatherScatterCost && 7214 InterleaveCost < ScalarizationCost) { 7215 Decision = CM_Interleave; 7216 Cost = InterleaveCost; 7217 } else if (GatherScatterCost < ScalarizationCost) { 7218 Decision = CM_GatherScatter; 7219 Cost = GatherScatterCost; 7220 } else { 7221 assert(!VF.isScalable() && 7222 "We cannot yet scalarise for scalable vectors"); 7223 Decision = CM_Scalarize; 7224 Cost = ScalarizationCost; 7225 } 7226 // If the instructions belongs to an interleave group, the whole group 7227 // receives the same decision. 
The whole group receives the cost, but 7228 // the cost will actually be assigned to one instruction. 7229 if (auto Group = getInterleavedAccessGroup(&I)) 7230 setWideningDecision(Group, VF, Decision, Cost); 7231 else 7232 setWideningDecision(&I, VF, Decision, Cost); 7233 } 7234 } 7235 7236 // Make sure that any load of address and any other address computation 7237 // remains scalar unless there is gather/scatter support. This avoids 7238 // inevitable extracts into address registers, and also has the benefit of 7239 // activating LSR more, since that pass can't optimize vectorized 7240 // addresses. 7241 if (TTI.prefersVectorizedAddressing()) 7242 return; 7243 7244 // Start with all scalar pointer uses. 7245 SmallPtrSet<Instruction *, 8> AddrDefs; 7246 for (BasicBlock *BB : TheLoop->blocks()) 7247 for (Instruction &I : *BB) { 7248 Instruction *PtrDef = 7249 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7250 if (PtrDef && TheLoop->contains(PtrDef) && 7251 getWideningDecision(&I, VF) != CM_GatherScatter) 7252 AddrDefs.insert(PtrDef); 7253 } 7254 7255 // Add all instructions used to generate the addresses. 7256 SmallVector<Instruction *, 4> Worklist; 7257 append_range(Worklist, AddrDefs); 7258 while (!Worklist.empty()) { 7259 Instruction *I = Worklist.pop_back_val(); 7260 for (auto &Op : I->operands()) 7261 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7262 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7263 AddrDefs.insert(InstOp).second) 7264 Worklist.push_back(InstOp); 7265 } 7266 7267 for (auto *I : AddrDefs) { 7268 if (isa<LoadInst>(I)) { 7269 // Setting the desired widening decision should ideally be handled in 7270 // by cost functions, but since this involves the task of finding out 7271 // if the loaded register is involved in an address computation, it is 7272 // instead changed here when we know this is the case. 7273 InstWidening Decision = getWideningDecision(I, VF); 7274 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7275 // Scalarize a widened load of address. 7276 setWideningDecision( 7277 I, VF, CM_Scalarize, 7278 (VF.getKnownMinValue() * 7279 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7280 else if (auto Group = getInterleavedAccessGroup(I)) { 7281 // Scalarize an interleave group of address loads. 7282 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7283 if (Instruction *Member = Group->getMember(I)) 7284 setWideningDecision( 7285 Member, VF, CM_Scalarize, 7286 (VF.getKnownMinValue() * 7287 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7288 } 7289 } 7290 } else 7291 // Make sure I gets scalarized and a cost estimate without 7292 // scalarization overhead. 7293 ForcedScalars[VF].insert(I); 7294 } 7295 } 7296 7297 InstructionCost 7298 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7299 Type *&VectorTy) { 7300 Type *RetTy = I->getType(); 7301 if (canTruncateToMinimalBitwidth(I, VF)) 7302 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7303 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 7304 auto SE = PSE.getSE(); 7305 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7306 7307 // TODO: We need to estimate the cost of intrinsic calls. 7308 switch (I->getOpcode()) { 7309 case Instruction::GetElementPtr: 7310 // We mark this instruction as zero-cost because the cost of GEPs in 7311 // vectorized code depends on whether the corresponding memory instruction 7312 // is scalarized or not. 
Therefore, we handle GEPs with the memory 7313 // instruction cost. 7314 return 0; 7315 case Instruction::Br: { 7316 // In cases of scalarized and predicated instructions, there will be VF 7317 // predicated blocks in the vectorized loop. Each branch around these 7318 // blocks requires also an extract of its vector compare i1 element. 7319 bool ScalarPredicatedBB = false; 7320 BranchInst *BI = cast<BranchInst>(I); 7321 if (VF.isVector() && BI->isConditional() && 7322 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7323 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7324 ScalarPredicatedBB = true; 7325 7326 if (ScalarPredicatedBB) { 7327 // Return cost for branches around scalarized and predicated blocks. 7328 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7329 auto *Vec_i1Ty = 7330 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7331 return (TTI.getScalarizationOverhead( 7332 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7333 false, true) + 7334 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7335 VF.getKnownMinValue())); 7336 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7337 // The back-edge branch will remain, as will all scalar branches. 7338 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7339 else 7340 // This branch will be eliminated by if-conversion. 7341 return 0; 7342 // Note: We currently assume zero cost for an unconditional branch inside 7343 // a predicated block since it will become a fall-through, although we 7344 // may decide in the future to call TTI for all branches. 7345 } 7346 case Instruction::PHI: { 7347 auto *Phi = cast<PHINode>(I); 7348 7349 // First-order recurrences are replaced by vector shuffles inside the loop. 7350 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7351 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7352 return TTI.getShuffleCost( 7353 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7354 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7355 7356 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7357 // converted into select instructions. We require N - 1 selects per phi 7358 // node, where N is the number of incoming values. 7359 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7360 return (Phi->getNumIncomingValues() - 1) * 7361 TTI.getCmpSelInstrCost( 7362 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7363 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7364 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7365 7366 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7367 } 7368 case Instruction::UDiv: 7369 case Instruction::SDiv: 7370 case Instruction::URem: 7371 case Instruction::SRem: 7372 // If we have a predicated instruction, it may not be executed for each 7373 // vector lane. Get the scalarization cost and scale this amount by the 7374 // probability of executing the predicated block. If the instruction is not 7375 // predicated, we fall through to the next case. 7376 if (VF.isVector() && isScalarWithPredication(I)) { 7377 InstructionCost Cost = 0; 7378 7379 // These instructions have a non-void type, so account for the phi nodes 7380 // that we will create. This cost is likely to be zero. The phi node 7381 // cost, if any, should be scaled by the block probability because it 7382 // models a copy at the end of each predicated block. 
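      // Altogether the estimate below is roughly
      //   (VF * (phi cost + scalar op cost) + insert/extract overhead)
      //     / getReciprocalPredBlockProb(),
      // i.e. the per-lane scalar cost scaled by how often the predicated
      // block is expected to execute.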
7383 Cost += VF.getKnownMinValue() * 7384 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7385 7386 // The cost of the non-predicated instruction. 7387 Cost += VF.getKnownMinValue() * 7388 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7389 7390 // The cost of insertelement and extractelement instructions needed for 7391 // scalarization. 7392 Cost += getScalarizationOverhead(I, VF); 7393 7394 // Scale the cost by the probability of executing the predicated blocks. 7395 // This assumes the predicated block for each vector lane is equally 7396 // likely. 7397 return Cost / getReciprocalPredBlockProb(); 7398 } 7399 LLVM_FALLTHROUGH; 7400 case Instruction::Add: 7401 case Instruction::FAdd: 7402 case Instruction::Sub: 7403 case Instruction::FSub: 7404 case Instruction::Mul: 7405 case Instruction::FMul: 7406 case Instruction::FDiv: 7407 case Instruction::FRem: 7408 case Instruction::Shl: 7409 case Instruction::LShr: 7410 case Instruction::AShr: 7411 case Instruction::And: 7412 case Instruction::Or: 7413 case Instruction::Xor: { 7414 // Since we will replace the stride by 1 the multiplication should go away. 7415 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7416 return 0; 7417 7418 // Detect reduction patterns 7419 InstructionCost RedCost; 7420 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7421 .isValid()) 7422 return RedCost; 7423 7424 // Certain instructions can be cheaper to vectorize if they have a constant 7425 // second vector operand. One example of this are shifts on x86. 7426 Value *Op2 = I->getOperand(1); 7427 TargetTransformInfo::OperandValueProperties Op2VP; 7428 TargetTransformInfo::OperandValueKind Op2VK = 7429 TTI.getOperandInfo(Op2, Op2VP); 7430 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7431 Op2VK = TargetTransformInfo::OK_UniformValue; 7432 7433 SmallVector<const Value *, 4> Operands(I->operand_values()); 7434 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7435 return N * TTI.getArithmeticInstrCost( 7436 I->getOpcode(), VectorTy, CostKind, 7437 TargetTransformInfo::OK_AnyValue, 7438 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7439 } 7440 case Instruction::FNeg: { 7441 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7442 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7443 return N * TTI.getArithmeticInstrCost( 7444 I->getOpcode(), VectorTy, CostKind, 7445 TargetTransformInfo::OK_AnyValue, 7446 TargetTransformInfo::OK_AnyValue, 7447 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7448 I->getOperand(0), I); 7449 } 7450 case Instruction::Select: { 7451 SelectInst *SI = cast<SelectInst>(I); 7452 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7453 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7454 7455 const Value *Op0, *Op1; 7456 using namespace llvm::PatternMatch; 7457 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7458 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7459 // select x, y, false --> x & y 7460 // select x, true, y --> x | y 7461 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7462 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7463 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7464 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7465 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7466 Op1->getType()->getScalarSizeInBits() == 1); 7467 7468 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7469 return TTI.getArithmeticInstrCost( 7470 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy, 7471 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7472 } 7473 7474 Type *CondTy = SI->getCondition()->getType(); 7475 if (!ScalarCond) 7476 CondTy = VectorType::get(CondTy, VF); 7477 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7478 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7479 } 7480 case Instruction::ICmp: 7481 case Instruction::FCmp: { 7482 Type *ValTy = I->getOperand(0)->getType(); 7483 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7484 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7485 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7486 VectorTy = ToVectorTy(ValTy, VF); 7487 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7488 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7489 } 7490 case Instruction::Store: 7491 case Instruction::Load: { 7492 ElementCount Width = VF; 7493 if (Width.isVector()) { 7494 InstWidening Decision = getWideningDecision(I, Width); 7495 assert(Decision != CM_Unknown && 7496 "CM decision should be taken at this point"); 7497 if (Decision == CM_Scalarize) 7498 Width = ElementCount::getFixed(1); 7499 } 7500 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7501 return getMemoryInstructionCost(I, VF); 7502 } 7503 case Instruction::ZExt: 7504 case Instruction::SExt: 7505 case Instruction::FPToUI: 7506 case Instruction::FPToSI: 7507 case Instruction::FPExt: 7508 case Instruction::PtrToInt: 7509 case Instruction::IntToPtr: 7510 case Instruction::SIToFP: 7511 case Instruction::UIToFP: 7512 case Instruction::Trunc: 7513 case Instruction::FPTrunc: 7514 case Instruction::BitCast: { 7515 // Computes the CastContextHint from a Load/Store instruction. 
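    // The hint mirrors the widening decision already taken for that memory
    // instruction (normal, masked, reversed, gather/scatter or interleaved),
    // so the target can price the cast as it will actually be emitted.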
7516 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7517 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7518 "Expected a load or a store!"); 7519 7520 if (VF.isScalar() || !TheLoop->contains(I)) 7521 return TTI::CastContextHint::Normal; 7522 7523 switch (getWideningDecision(I, VF)) { 7524 case LoopVectorizationCostModel::CM_GatherScatter: 7525 return TTI::CastContextHint::GatherScatter; 7526 case LoopVectorizationCostModel::CM_Interleave: 7527 return TTI::CastContextHint::Interleave; 7528 case LoopVectorizationCostModel::CM_Scalarize: 7529 case LoopVectorizationCostModel::CM_Widen: 7530 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7531 : TTI::CastContextHint::Normal; 7532 case LoopVectorizationCostModel::CM_Widen_Reverse: 7533 return TTI::CastContextHint::Reversed; 7534 case LoopVectorizationCostModel::CM_Unknown: 7535 llvm_unreachable("Instr did not go through cost modelling?"); 7536 } 7537 7538 llvm_unreachable("Unhandled case!"); 7539 }; 7540 7541 unsigned Opcode = I->getOpcode(); 7542 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7543 // For Trunc, the context is the only user, which must be a StoreInst. 7544 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7545 if (I->hasOneUse()) 7546 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7547 CCH = ComputeCCH(Store); 7548 } 7549 // For Z/Sext, the context is the operand, which must be a LoadInst. 7550 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7551 Opcode == Instruction::FPExt) { 7552 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7553 CCH = ComputeCCH(Load); 7554 } 7555 7556 // We optimize the truncation of induction variables having constant 7557 // integer steps. The cost of these truncations is the same as the scalar 7558 // operation. 7559 if (isOptimizableIVTruncate(I, VF)) { 7560 auto *Trunc = cast<TruncInst>(I); 7561 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7562 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7563 } 7564 7565 // Detect reduction patterns 7566 InstructionCost RedCost; 7567 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7568 .isValid()) 7569 return RedCost; 7570 7571 Type *SrcScalarTy = I->getOperand(0)->getType(); 7572 Type *SrcVecTy = 7573 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7574 if (canTruncateToMinimalBitwidth(I, VF)) { 7575 // This cast is going to be shrunk. This may remove the cast or it might 7576 // turn it into slightly different cast. For example, if MinBW == 16, 7577 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7578 // 7579 // Calculate the modified src and dest types. 
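      // The cast cost computed below then uses these adjusted types, so it
      // reflects the narrower cast (if any) that will actually be emitted.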
7580 Type *MinVecTy = VectorTy; 7581 if (Opcode == Instruction::Trunc) { 7582 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7583 VectorTy = 7584 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7585 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7586 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7587 VectorTy = 7588 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7589 } 7590 } 7591 7592 unsigned N; 7593 if (isScalarAfterVectorization(I, VF)) { 7594 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7595 N = VF.getKnownMinValue(); 7596 } else 7597 N = 1; 7598 return N * 7599 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7600 } 7601 case Instruction::Call: { 7602 bool NeedToScalarize; 7603 CallInst *CI = cast<CallInst>(I); 7604 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7605 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7606 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7607 return std::min(CallCost, IntrinsicCost); 7608 } 7609 return CallCost; 7610 } 7611 case Instruction::ExtractValue: 7612 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7613 default: 7614 // The cost of executing VF copies of the scalar instruction. This opcode 7615 // is unknown. Assume that it is the same as 'mul'. 7616 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7617 Instruction::Mul, VectorTy, CostKind) + 7618 getScalarizationOverhead(I, VF); 7619 } // end of switch. 7620 } 7621 7622 char LoopVectorize::ID = 0; 7623 7624 static const char lv_name[] = "Loop Vectorization"; 7625 7626 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7627 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7628 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7629 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7630 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7631 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7632 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7633 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7634 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7635 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7636 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7637 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7638 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7639 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7640 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7641 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7642 7643 namespace llvm { 7644 7645 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7646 7647 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7648 bool VectorizeOnlyWhenForced) { 7649 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7650 } 7651 7652 } // end namespace llvm 7653 7654 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7655 // Check if the pointer operand of a load or store instruction is 7656 // consecutive. 7657 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7658 return Legal->isConsecutivePtr(Ptr); 7659 return false; 7660 } 7661 7662 void LoopVectorizationCostModel::collectValuesToIgnore() { 7663 // Ignore ephemeral values. 7664 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7665 7666 // Ignore type-promoting instructions we identified during reduction 7667 // detection. 
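  // These casts are recognized as part of the reduction pattern and are not
  // expected to survive vectorization as separate instructions, so they are
  // excluded from the vector cost.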
7668 for (auto &Reduction : Legal->getReductionVars()) { 7669 RecurrenceDescriptor &RedDes = Reduction.second; 7670 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7671 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7672 } 7673 // Ignore type-casting instructions we identified during induction 7674 // detection. 7675 for (auto &Induction : Legal->getInductionVars()) { 7676 InductionDescriptor &IndDes = Induction.second; 7677 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7678 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7679 } 7680 } 7681 7682 void LoopVectorizationCostModel::collectInLoopReductions() { 7683 for (auto &Reduction : Legal->getReductionVars()) { 7684 PHINode *Phi = Reduction.first; 7685 RecurrenceDescriptor &RdxDesc = Reduction.second; 7686 7687 // We don't collect reductions that are type promoted (yet). 7688 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7689 continue; 7690 7691 // If the target would prefer this reduction to happen "in-loop", then we 7692 // want to record it as such. 7693 unsigned Opcode = RdxDesc.getOpcode(); 7694 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7695 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7696 TargetTransformInfo::ReductionFlags())) 7697 continue; 7698 7699 // Check that we can correctly put the reductions into the loop, by 7700 // finding the chain of operations that leads from the phi to the loop 7701 // exit value. 7702 SmallVector<Instruction *, 4> ReductionOperations = 7703 RdxDesc.getReductionOpChain(Phi, TheLoop); 7704 bool InLoop = !ReductionOperations.empty(); 7705 if (InLoop) { 7706 InLoopReductionChains[Phi] = ReductionOperations; 7707 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7708 Instruction *LastChain = Phi; 7709 for (auto *I : ReductionOperations) { 7710 InLoopReductionImmediateChains[I] = LastChain; 7711 LastChain = I; 7712 } 7713 } 7714 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7715 << " reduction for phi: " << *Phi << "\n"); 7716 } 7717 } 7718 7719 // TODO: we could return a pair of values that specify the max VF and 7720 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7721 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7722 // doesn't have a cost model that can choose which plan to execute if 7723 // more than one is generated. 7724 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7725 LoopVectorizationCostModel &CM) { 7726 unsigned WidestType; 7727 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7728 return WidestVectorRegBits / WidestType; 7729 } 7730 7731 VectorizationFactor 7732 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7733 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7734 ElementCount VF = UserVF; 7735 // Outer loop handling: They may require CFG and instruction level 7736 // transformations before even evaluating whether vectorization is profitable. 7737 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7738 // the vectorization pipeline. 7739 if (!OrigLoop->isInnermost()) { 7740 // If the user doesn't provide a vectorization factor, determine a 7741 // reasonable one. 
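    // determineVPlanVF simply divides the widest fixed-width vector register
    // size by the widest scalar type used in the loop, e.g. 256-bit registers
    // with a widest type of i32 give VF = 8.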
    if (UserVF.isZero()) {
      VF = ElementCount::getFixed(determineVPlanVF(
          TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedSize(),
          CM));
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(4);
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
  if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
    return None;

  // Invalidate interleave groups if all blocks of the loop will be predicated.
  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  ElementCount MaxVF = MaybeMaxVF.getValue();
  assert(MaxVF.isNonZero() && "MaxVF is zero.");

  bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF);
  if (!UserVF.isZero() &&
      (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) {
    // FIXME: MaxVF is temporarily used in place of UserVF for illegal scalable
    // VFs here; this should be reverted to only use legal UserVFs once the
    // loop below supports scalable VFs.
    ElementCount VF = UserVFIsLegal ? UserVF : MaxVF;
    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
                      << " VF " << VF << ".\n");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    CM.selectUserVectorizationFactor(VF);
    CM.collectInLoopReductions();
    buildVPlansWithVPRecipes(VF, VF);
    LLVM_DEBUG(printPlans(dbgs()));
    return {{VF, 0}};
  }

  assert(!MaxVF.isScalable() &&
         "Scalable vectors not yet supported beyond this point");

  for (ElementCount VF = ElementCount::getFixed(1);
       ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
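    // The candidate VFs are the powers of two from 1 up to MaxVF; the per-VF
    // information cached here is consumed later when selecting the final
    // vectorization factor.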
7826 CM.collectUniformsAndScalars(VF); 7827 7828 // Collect the instructions (and their associated costs) that will be more 7829 // profitable to scalarize. 7830 if (VF.isVector()) 7831 CM.collectInstsToScalarize(VF); 7832 } 7833 7834 CM.collectInLoopReductions(); 7835 7836 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7837 LLVM_DEBUG(printPlans(dbgs())); 7838 if (MaxVF.isScalar()) 7839 return VectorizationFactor::Disabled(); 7840 7841 // Select the optimal vectorization factor. 7842 auto SelectedVF = CM.selectVectorizationFactor(MaxVF); 7843 7844 // Check if it is profitable to vectorize with runtime checks. 7845 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7846 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 7847 bool PragmaThresholdReached = 7848 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7849 bool ThresholdReached = 7850 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 7851 if ((ThresholdReached && !Hints.allowReordering()) || 7852 PragmaThresholdReached) { 7853 ORE->emit([&]() { 7854 return OptimizationRemarkAnalysisAliasing( 7855 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 7856 OrigLoop->getHeader()) 7857 << "loop not vectorized: cannot prove it is safe to reorder " 7858 "memory operations"; 7859 }); 7860 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 7861 Hints.emitRemarkWithHints(); 7862 return VectorizationFactor::Disabled(); 7863 } 7864 } 7865 return SelectedVF; 7866 } 7867 7868 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7869 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7870 << '\n'); 7871 BestVF = VF; 7872 BestUF = UF; 7873 7874 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7875 return !Plan->hasVF(VF); 7876 }); 7877 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7878 } 7879 7880 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7881 DominatorTree *DT) { 7882 // Perform the actual loop transformation. 7883 7884 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7885 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7886 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7887 7888 VPTransformState State{ 7889 *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()}; 7890 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7891 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7892 State.CanonicalIV = ILV.Induction; 7893 7894 ILV.printDebugTracesAtStart(); 7895 7896 //===------------------------------------------------===// 7897 // 7898 // Notice: any optimization or new instruction that go 7899 // into the code below should also be implemented in 7900 // the cost-model. 7901 // 7902 //===------------------------------------------------===// 7903 7904 // 2. Copy and widen instructions from the old loop into the new loop. 7905 VPlans.front()->execute(&State); 7906 7907 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7908 // predication, updating analyses. 
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
  for (const auto &Plan : VPlans)
    if (PrintVPlansInDotFormat)
      Plan->printDOT(O);
    else
      Plan->print(O);
}
#endif

void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {

  // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if they are only used by the
  // terminator.
  SmallVector<BasicBlock*> ExitingBlocks;
  OrigLoop->getExitingBlocks(ExitingBlocks);
  for (auto *BB : ExitingBlocks) {
    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
    if (!Cmp || !Cmp->hasOneUse())
      continue;

    // TODO: we should introduce a getUniqueExitingBlocks on Loop
    if (!DeadInstructions.insert(Cmp).second)
      continue;

    // The operands of the icmp are often dead truncs, used by IndUpdate.
    // TODO: can recurse through operands in general
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
    }
  }

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  auto *Latch = OrigLoop->getLoopLatch();
  for (auto &Induction : Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
    if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
      continue;

    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
        }))
      DeadInstructions.insert(IndUpdate);

    // We also record as "Dead" the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime
    // test guarding the vectorized loop, the value of the phi, and the casted
    // value of the phi, are the same. The last instruction in this casting chain
    // will get its scalar/vector/widened def from the scalar/vector/widened def
    // of the respective phi node. Any other casts in the induction def-use chain
    // have no other uses outside the phi update chain, and will be ignored.
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    DeadInstructions.insert(Casts.begin(), Casts.end());
  }
}

Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
                                        Instruction::BinaryOps BinOp) {
  // When unrolling and the VF is 1, we only need to add a simple scalar.
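  // E.g., with a step of 1, successive StartIdx values 0, 1, 2 and 3 produce
  // the scalars Val, Val + 1, Val + 2 and Val + 3.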
7988 Type *Ty = Val->getType(); 7989 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7990 7991 if (Ty->isFloatingPointTy()) { 7992 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7993 7994 // Floating-point operations inherit FMF via the builder's flags. 7995 Value *MulOp = Builder.CreateFMul(C, Step); 7996 return Builder.CreateBinOp(BinOp, Val, MulOp); 7997 } 7998 Constant *C = ConstantInt::get(Ty, StartIdx); 7999 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 8000 } 8001 8002 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 8003 SmallVector<Metadata *, 4> MDs; 8004 // Reserve first location for self reference to the LoopID metadata node. 8005 MDs.push_back(nullptr); 8006 bool IsUnrollMetadata = false; 8007 MDNode *LoopID = L->getLoopID(); 8008 if (LoopID) { 8009 // First find existing loop unrolling disable metadata. 8010 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8011 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8012 if (MD) { 8013 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8014 IsUnrollMetadata = 8015 S && S->getString().startswith("llvm.loop.unroll.disable"); 8016 } 8017 MDs.push_back(LoopID->getOperand(i)); 8018 } 8019 } 8020 8021 if (!IsUnrollMetadata) { 8022 // Add runtime unroll disable metadata. 8023 LLVMContext &Context = L->getHeader()->getContext(); 8024 SmallVector<Metadata *, 1> DisableOperands; 8025 DisableOperands.push_back( 8026 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8027 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8028 MDs.push_back(DisableNode); 8029 MDNode *NewLoopID = MDNode::get(Context, MDs); 8030 // Set operand 0 to refer to the loop id itself. 8031 NewLoopID->replaceOperandWith(0, NewLoopID); 8032 L->setLoopID(NewLoopID); 8033 } 8034 } 8035 8036 //===--------------------------------------------------------------------===// 8037 // EpilogueVectorizerMainLoop 8038 //===--------------------------------------------------------------------===// 8039 8040 /// This function is partially responsible for generating the control flow 8041 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8042 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8043 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8044 Loop *Lp = createVectorLoopSkeleton(""); 8045 8046 // Generate the code to check the minimum iteration count of the vector 8047 // epilogue (see below). 8048 EPI.EpilogueIterationCountCheck = 8049 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8050 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8051 8052 // Generate the code to check any assumptions that we've made for SCEV 8053 // expressions. 8054 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8055 8056 // Generate the code that checks at runtime if arrays overlap. We put the 8057 // checks into a separate block to make the more common case of few elements 8058 // faster. 8059 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8060 8061 // Generate the iteration count check for the main loop, *after* the check 8062 // for the epilogue loop, so that the path-length is shorter for the case 8063 // that goes directly through the vector epilogue. The longer-path length for 8064 // the main loop is compensated for, by the gain from vectorizing the larger 8065 // trip count. Note: the branch will get updated later on when we vectorize 8066 // the epilogue. 
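  // The checks are thus emitted in the order: epilogue iteration count check,
  // SCEV checks, memory runtime checks, and finally the main loop iteration
  // count check created here.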
8067 EPI.MainLoopIterationCountCheck = 8068 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8069 8070 // Generate the induction variable. 8071 OldInduction = Legal->getPrimaryInduction(); 8072 Type *IdxTy = Legal->getWidestInductionType(); 8073 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8074 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8075 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8076 EPI.VectorTripCount = CountRoundDown; 8077 Induction = 8078 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8079 getDebugLocFromInstOrOperands(OldInduction)); 8080 8081 // Skip induction resume value creation here because they will be created in 8082 // the second pass. If we created them here, they wouldn't be used anyway, 8083 // because the vplan in the second pass still contains the inductions from the 8084 // original loop. 8085 8086 return completeLoopSkeleton(Lp, OrigLoopID); 8087 } 8088 8089 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8090 LLVM_DEBUG({ 8091 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8092 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8093 << ", Main Loop UF:" << EPI.MainLoopUF 8094 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8095 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8096 }); 8097 } 8098 8099 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8100 DEBUG_WITH_TYPE(VerboseDebug, { 8101 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8102 }); 8103 } 8104 8105 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8106 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8107 assert(L && "Expected valid Loop."); 8108 assert(Bypass && "Expected valid bypass basic block."); 8109 unsigned VFactor = 8110 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 8111 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8112 Value *Count = getOrCreateTripCount(L); 8113 // Reuse existing vector loop preheader for TC checks. 8114 // Note that new preheader block is generated for vector loop. 8115 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8116 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8117 8118 // Generate code to check if the loop's trip count is less than VF * UF of the 8119 // main vector loop. 8120 auto P = 8121 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8122 8123 Value *CheckMinIters = Builder.CreateICmp( 8124 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 8125 "min.iters.check"); 8126 8127 if (!ForEpilogue) 8128 TCCheckBlock->setName("vector.main.loop.iter.check"); 8129 8130 // Create new preheader for vector loop. 8131 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8132 DT, LI, nullptr, "vector.ph"); 8133 8134 if (ForEpilogue) { 8135 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8136 DT->getNode(Bypass)->getIDom()) && 8137 "TC check is expected to dominate Bypass"); 8138 8139 // Update dominator for Bypass & LoopExit. 8140 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8141 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8142 8143 LoopBypassBlocks.push_back(TCCheckBlock); 8144 8145 // Save the trip count so we don't have to regenerate it in the 8146 // vec.epilog.iter.check. This is safe to do because the trip count 8147 // generated here dominates the vector epilog iter check. 
8148 EPI.TripCount = Count; 8149 } 8150 8151 ReplaceInstWithInst( 8152 TCCheckBlock->getTerminator(), 8153 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8154 8155 return TCCheckBlock; 8156 } 8157 8158 //===--------------------------------------------------------------------===// 8159 // EpilogueVectorizerEpilogueLoop 8160 //===--------------------------------------------------------------------===// 8161 8162 /// This function is partially responsible for generating the control flow 8163 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8164 BasicBlock * 8165 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8166 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8167 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8168 8169 // Now, compare the remaining count and if there aren't enough iterations to 8170 // execute the vectorized epilogue skip to the scalar part. 8171 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8172 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8173 LoopVectorPreHeader = 8174 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8175 LI, nullptr, "vec.epilog.ph"); 8176 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8177 VecEpilogueIterationCountCheck); 8178 8179 // Adjust the control flow taking the state info from the main loop 8180 // vectorization into account. 8181 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8182 "expected this to be saved from the previous pass."); 8183 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8184 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8185 8186 DT->changeImmediateDominator(LoopVectorPreHeader, 8187 EPI.MainLoopIterationCountCheck); 8188 8189 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8190 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8191 8192 if (EPI.SCEVSafetyCheck) 8193 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8194 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8195 if (EPI.MemSafetyCheck) 8196 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8197 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8198 8199 DT->changeImmediateDominator( 8200 VecEpilogueIterationCountCheck, 8201 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8202 8203 DT->changeImmediateDominator(LoopScalarPreHeader, 8204 EPI.EpilogueIterationCountCheck); 8205 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 8206 8207 // Keep track of bypass blocks, as they feed start values to the induction 8208 // phis in the scalar loop preheader. 8209 if (EPI.SCEVSafetyCheck) 8210 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8211 if (EPI.MemSafetyCheck) 8212 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8213 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8214 8215 // Generate a resume induction for the vector epilogue and put it in the 8216 // vector epilogue preheader 8217 Type *IdxTy = Legal->getWidestInductionType(); 8218 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8219 LoopVectorPreHeader->getFirstNonPHI()); 8220 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8221 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8222 EPI.MainLoopIterationCountCheck); 8223 8224 // Generate the induction variable. 
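  // Its start value is EPResumeVal: the number of iterations already executed
  // by the main vector loop, or zero when that loop was bypassed.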
  OldInduction = Legal->getPrimaryInduction();
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
  Value *StartIdx = EPResumeVal;
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to the iteration
  // count check, then the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues(Lp, CountRoundDown,
                              {VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  AddRuntimeUnrollDisableMetaData(Lp);
  return completeLoopSkeleton(Lp, OrigLoopID);
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of
  // the vector epilogue loop.
  auto P =
      Cost->requiresScalarEpilogue() ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8266 8267 Value *CheckMinIters = Builder.CreateICmp( 8268 P, Count, 8269 ConstantInt::get(Count->getType(), 8270 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8271 "min.epilog.iters.check"); 8272 8273 ReplaceInstWithInst( 8274 Insert->getTerminator(), 8275 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8276 8277 LoopBypassBlocks.push_back(Insert); 8278 return Insert; 8279 } 8280 8281 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8282 LLVM_DEBUG({ 8283 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8284 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8285 << ", Main Loop UF:" << EPI.MainLoopUF 8286 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8287 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8288 }); 8289 } 8290 8291 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8292 DEBUG_WITH_TYPE(VerboseDebug, { 8293 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8294 }); 8295 } 8296 8297 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8298 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8299 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8300 bool PredicateAtRangeStart = Predicate(Range.Start); 8301 8302 for (ElementCount TmpVF = Range.Start * 2; 8303 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8304 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8305 Range.End = TmpVF; 8306 break; 8307 } 8308 8309 return PredicateAtRangeStart; 8310 } 8311 8312 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8313 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8314 /// of VF's starting at a given VF and extending it as much as possible. Each 8315 /// vectorization decision can potentially shorten this sub-range during 8316 /// buildVPlan(). 8317 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8318 ElementCount MaxVF) { 8319 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8320 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8321 VFRange SubRange = {VF, MaxVFPlusOne}; 8322 VPlans.push_back(buildVPlan(SubRange)); 8323 VF = SubRange.End; 8324 } 8325 } 8326 8327 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8328 VPlanPtr &Plan) { 8329 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8330 8331 // Look for cached value. 8332 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8333 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8334 if (ECEntryIt != EdgeMaskCache.end()) 8335 return ECEntryIt->second; 8336 8337 VPValue *SrcMask = createBlockInMask(Src, Plan); 8338 8339 // The terminator has to be a branch inst! 8340 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8341 assert(BI && "Unexpected terminator found"); 8342 8343 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8344 return EdgeMaskCache[Edge] = SrcMask; 8345 8346 // If source is an exiting block, we know the exit edge is dynamically dead 8347 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8348 // adding uses of an otherwise potentially dead instruction. 
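// Illustrative shape of the edge mask built below for a conditional edge
// Src -> Dst whose terminator is `br i1 %c` (names are examples only):
//   EdgeMask = %c                                        ; negated if Dst is the
//                                                        ; false successor
//   EdgeMask = select i1 SrcMask, i1 EdgeMask, i1 false  ; only if Src is masked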
8349 if (OrigLoop->isLoopExiting(Src)) 8350 return EdgeMaskCache[Edge] = SrcMask; 8351 8352 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8353 assert(EdgeMask && "No Edge Mask found for condition"); 8354 8355 if (BI->getSuccessor(0) != Dst) 8356 EdgeMask = Builder.createNot(EdgeMask); 8357 8358 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8359 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8360 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8361 // The select version does not introduce new UB if SrcMask is false and 8362 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8363 VPValue *False = Plan->getOrAddVPValue( 8364 ConstantInt::getFalse(BI->getCondition()->getType())); 8365 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8366 } 8367 8368 return EdgeMaskCache[Edge] = EdgeMask; 8369 } 8370 8371 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8372 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8373 8374 // Look for cached value. 8375 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8376 if (BCEntryIt != BlockMaskCache.end()) 8377 return BCEntryIt->second; 8378 8379 // All-one mask is modelled as no-mask following the convention for masked 8380 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8381 VPValue *BlockMask = nullptr; 8382 8383 if (OrigLoop->getHeader() == BB) { 8384 if (!CM.blockNeedsPredication(BB)) 8385 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8386 8387 // Create the block in mask as the first non-phi instruction in the block. 8388 VPBuilder::InsertPointGuard Guard(Builder); 8389 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8390 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8391 8392 // Introduce the early-exit compare IV <= BTC to form header block mask. 8393 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8394 // Start by constructing the desired canonical IV. 8395 VPValue *IV = nullptr; 8396 if (Legal->getPrimaryInduction()) 8397 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8398 else { 8399 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8400 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8401 IV = IVRecipe->getVPValue(); 8402 } 8403 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8404 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8405 8406 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8407 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8408 // as a second argument, we only pass the IV here and extract the 8409 // tripcount from the transform state where codegen of the VP instructions 8410 // happen. 8411 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8412 } else { 8413 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8414 } 8415 return BlockMaskCache[BB] = BlockMask; 8416 } 8417 8418 // This is the block mask. We OR all incoming edges. 8419 for (auto *Predecessor : predecessors(BB)) { 8420 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8421 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8422 return BlockMaskCache[BB] = EdgeMask; 8423 8424 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8425 BlockMask = EdgeMask; 8426 continue; 8427 } 8428 8429 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8430 } 8431 8432 return BlockMaskCache[BB] = BlockMask; 8433 } 8434 8435 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8436 ArrayRef<VPValue *> Operands, 8437 VFRange &Range, 8438 VPlanPtr &Plan) { 8439 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8440 "Must be called with either a load or store"); 8441 8442 auto willWiden = [&](ElementCount VF) -> bool { 8443 if (VF.isScalar()) 8444 return false; 8445 LoopVectorizationCostModel::InstWidening Decision = 8446 CM.getWideningDecision(I, VF); 8447 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8448 "CM decision should be taken at this point."); 8449 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8450 return true; 8451 if (CM.isScalarAfterVectorization(I, VF) || 8452 CM.isProfitableToScalarize(I, VF)) 8453 return false; 8454 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8455 }; 8456 8457 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8458 return nullptr; 8459 8460 VPValue *Mask = nullptr; 8461 if (Legal->isMaskRequired(I)) 8462 Mask = createBlockInMask(I->getParent(), Plan); 8463 8464 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8465 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask); 8466 8467 StoreInst *Store = cast<StoreInst>(I); 8468 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8469 Mask); 8470 } 8471 8472 VPWidenIntOrFpInductionRecipe * 8473 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8474 ArrayRef<VPValue *> Operands) const { 8475 // Check if this is an integer or fp induction. If so, build the recipe that 8476 // produces its scalar and vector values. 8477 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8478 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8479 II.getKind() == InductionDescriptor::IK_FpInduction) { 8480 assert(II.getStartValue() == 8481 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8482 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8483 return new VPWidenIntOrFpInductionRecipe( 8484 Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); 8485 } 8486 8487 return nullptr; 8488 } 8489 8490 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8491 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8492 VPlan &Plan) const { 8493 // Optimize the special case where the source is a constant integer 8494 // induction variable. Notice that we can only optimize the 'trunc' case 8495 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8496 // (c) other casts depend on pointer size. 8497 8498 // Determine whether \p K is a truncation based on an induction variable that 8499 // can be optimized. 
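// (Illustrative example of the range-clamping mechanism used below: with
// Range = [4, 32) and a predicate that holds for VF=4 and VF=8 but flips at
// VF=16, getDecisionAndClampRange returns the VF=4 answer and clamps
// Range.End to 16, so a single decision covers the sub-range {4, 8}.)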
8500 auto isOptimizableIVTruncate = 8501 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8502 return [=](ElementCount VF) -> bool { 8503 return CM.isOptimizableIVTruncate(K, VF); 8504 }; 8505 }; 8506 8507 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8508 isOptimizableIVTruncate(I), Range)) { 8509 8510 InductionDescriptor II = 8511 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8512 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8513 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8514 Start, nullptr, I); 8515 } 8516 return nullptr; 8517 } 8518 8519 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8520 ArrayRef<VPValue *> Operands, 8521 VPlanPtr &Plan) { 8522 // If all incoming values are equal, the incoming VPValue can be used directly 8523 // instead of creating a new VPBlendRecipe. 8524 VPValue *FirstIncoming = Operands[0]; 8525 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8526 return FirstIncoming == Inc; 8527 })) { 8528 return Operands[0]; 8529 } 8530 8531 // We know that all PHIs in non-header blocks are converted into selects, so 8532 // we don't have to worry about the insertion order and we can just use the 8533 // builder. At this point we generate the predication tree. There may be 8534 // duplications since this is a simple recursive scan, but future 8535 // optimizations will clean it up. 8536 SmallVector<VPValue *, 2> OperandsWithMask; 8537 unsigned NumIncoming = Phi->getNumIncomingValues(); 8538 8539 for (unsigned In = 0; In < NumIncoming; In++) { 8540 VPValue *EdgeMask = 8541 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8542 assert((EdgeMask || NumIncoming == 1) && 8543 "Multiple predecessors with one having a full mask"); 8544 OperandsWithMask.push_back(Operands[In]); 8545 if (EdgeMask) 8546 OperandsWithMask.push_back(EdgeMask); 8547 } 8548 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8549 } 8550 8551 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8552 ArrayRef<VPValue *> Operands, 8553 VFRange &Range) const { 8554 8555 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8556 [this, CI](ElementCount VF) { 8557 return CM.isScalarWithPredication(CI, VF); 8558 }, 8559 Range); 8560 8561 if (IsPredicated) 8562 return nullptr; 8563 8564 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8565 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8566 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8567 ID == Intrinsic::pseudoprobe || 8568 ID == Intrinsic::experimental_noalias_scope_decl)) 8569 return nullptr; 8570 8571 auto willWiden = [&](ElementCount VF) -> bool { 8572 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8573 // The following case may be scalarized depending on the VF. 8574 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8575 // version of the instruction. 8576 // Is it beneficial to perform intrinsic call compared to lib call? 8577 bool NeedToScalarize = false; 8578 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8579 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8580 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8581 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 8582 "Either the intrinsic cost or vector call cost must be valid"); 8583 return UseVectorIntrinsic || !NeedToScalarize; 8584 }; 8585 8586 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8587 return nullptr; 8588 8589 ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands()); 8590 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8591 } 8592 8593 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8594 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8595 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8596 // Instruction should be widened, unless it is scalar after vectorization, 8597 // scalarization is profitable or it is predicated. 8598 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8599 return CM.isScalarAfterVectorization(I, VF) || 8600 CM.isProfitableToScalarize(I, VF) || 8601 CM.isScalarWithPredication(I, VF); 8602 }; 8603 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8604 Range); 8605 } 8606 8607 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8608 ArrayRef<VPValue *> Operands) const { 8609 auto IsVectorizableOpcode = [](unsigned Opcode) { 8610 switch (Opcode) { 8611 case Instruction::Add: 8612 case Instruction::And: 8613 case Instruction::AShr: 8614 case Instruction::BitCast: 8615 case Instruction::FAdd: 8616 case Instruction::FCmp: 8617 case Instruction::FDiv: 8618 case Instruction::FMul: 8619 case Instruction::FNeg: 8620 case Instruction::FPExt: 8621 case Instruction::FPToSI: 8622 case Instruction::FPToUI: 8623 case Instruction::FPTrunc: 8624 case Instruction::FRem: 8625 case Instruction::FSub: 8626 case Instruction::ICmp: 8627 case Instruction::IntToPtr: 8628 case Instruction::LShr: 8629 case Instruction::Mul: 8630 case Instruction::Or: 8631 case Instruction::PtrToInt: 8632 case Instruction::SDiv: 8633 case Instruction::Select: 8634 case Instruction::SExt: 8635 case Instruction::Shl: 8636 case Instruction::SIToFP: 8637 case Instruction::SRem: 8638 case Instruction::Sub: 8639 case Instruction::Trunc: 8640 case Instruction::UDiv: 8641 case Instruction::UIToFP: 8642 case Instruction::URem: 8643 case Instruction::Xor: 8644 case Instruction::ZExt: 8645 return true; 8646 } 8647 return false; 8648 }; 8649 8650 if (!IsVectorizableOpcode(I->getOpcode())) 8651 return nullptr; 8652 8653 // Success: widen this instruction. 8654 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8655 } 8656 8657 VPBasicBlock *VPRecipeBuilder::handleReplication( 8658 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8659 VPlanPtr &Plan) { 8660 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8661 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8662 Range); 8663 8664 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8665 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 8666 Range); 8667 8668 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8669 IsUniform, IsPredicated); 8670 setRecipe(I, Recipe); 8671 Plan->addVPValue(I, Recipe); 8672 8673 // Find if I uses a predicated instruction. If so, it will use its scalar 8674 // value. 
Avoid hoisting the insert-element which packs the scalar value into 8675 // a vector value, as that happens iff all users use the vector value. 8676 for (VPValue *Op : Recipe->operands()) { 8677 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8678 if (!PredR) 8679 continue; 8680 auto *RepR = 8681 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8682 assert(RepR->isPredicated() && 8683 "expected Replicate recipe to be predicated"); 8684 RepR->setAlsoPack(false); 8685 } 8686 8687 // Finalize the recipe for Instr, first if it is not predicated. 8688 if (!IsPredicated) { 8689 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8690 VPBB->appendRecipe(Recipe); 8691 return VPBB; 8692 } 8693 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8694 assert(VPBB->getSuccessors().empty() && 8695 "VPBB has successors when handling predicated replication."); 8696 // Record predicated instructions for above packing optimizations. 8697 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8698 VPBlockUtils::insertBlockAfter(Region, VPBB); 8699 auto *RegSucc = new VPBasicBlock(); 8700 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8701 return RegSucc; 8702 } 8703 8704 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8705 VPRecipeBase *PredRecipe, 8706 VPlanPtr &Plan) { 8707 // Instructions marked for predication are replicated and placed under an 8708 // if-then construct to prevent side-effects. 8709 8710 // Generate recipes to compute the block mask for this region. 8711 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8712 8713 // Build the triangular if-then region. 8714 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8715 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8716 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8717 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8718 auto *PHIRecipe = Instr->getType()->isVoidTy() 8719 ? nullptr 8720 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8721 if (PHIRecipe) { 8722 Plan->removeVPValueFor(Instr); 8723 Plan->addVPValue(Instr, PHIRecipe); 8724 } 8725 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8726 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8727 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8728 8729 // Note: first set Entry as region entry and then connect successors starting 8730 // from it in order, to propagate the "parent" of each VPBasicBlock. 8731 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8732 VPBlockUtils::connectBlocks(Pred, Exit); 8733 8734 return Region; 8735 } 8736 8737 VPRecipeOrVPValueTy 8738 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8739 ArrayRef<VPValue *> Operands, 8740 VFRange &Range, VPlanPtr &Plan) { 8741 // First, check for specific widening recipes that deal with calls, memory 8742 // operations, inductions and Phi nodes. 
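// For example (illustrative, mirroring the checks below): a call may become a
// VPWidenCallRecipe, a load or store a VPWidenMemoryInstructionRecipe, a header
// induction PHI a VPWidenIntOrFpInductionRecipe, and a non-header PHI whose
// incoming VPValues are all identical is simply mapped back to that VPValue.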
8743 if (auto *CI = dyn_cast<CallInst>(Instr))
8744 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8745
8746 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8747 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8748
8749 VPRecipeBase *Recipe;
8750 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8751 if (Phi->getParent() != OrigLoop->getHeader())
8752 return tryToBlend(Phi, Operands, Plan);
8753 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
8754 return toVPRecipeResult(Recipe);
8755
8756 if (Legal->isReductionVariable(Phi)) {
8757 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8758 assert(RdxDesc.getRecurrenceStartValue() ==
8759 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8760 VPValue *StartV = Operands[0];
8761 return toVPRecipeResult(new VPWidenPHIRecipe(Phi, RdxDesc, *StartV));
8762 }
8763
8764 return toVPRecipeResult(new VPWidenPHIRecipe(Phi));
8765 }
8766
8767 if (isa<TruncInst>(Instr) &&
8768 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8769 Range, *Plan)))
8770 return toVPRecipeResult(Recipe);
8771
8772 if (!shouldWiden(Instr, Range))
8773 return nullptr;
8774
8775 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8776 return toVPRecipeResult(new VPWidenGEPRecipe(
8777 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8778
8779 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8780 bool InvariantCond =
8781 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8782 return toVPRecipeResult(new VPWidenSelectRecipe(
8783 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8784 }
8785
8786 return toVPRecipeResult(tryToWiden(Instr, Operands));
8787 }
8788
8789 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8790 ElementCount MaxVF) {
8791 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8792
8793 // Collect instructions from the original loop that will become trivially dead
8794 // in the vectorized loop. We don't need to vectorize these instructions. For
8795 // example, original induction update instructions can become dead because we
8796 // separately emit induction "steps" when generating code for the new loop.
8797 // Similarly, we create a new latch condition when setting up the structure
8798 // of the new loop, so the old one can become dead.
8799 SmallPtrSet<Instruction *, 4> DeadInstructions;
8800 collectTriviallyDeadInstructions(DeadInstructions);
8801
8802 // Add assume instructions we need to drop to DeadInstructions, to prevent
8803 // them from being added to the VPlan.
8804 // TODO: We only need to drop assumes in blocks that get flattened. If the
8805 // control flow is preserved, we should keep them.
8806 auto &ConditionalAssumes = Legal->getConditionalAssumes();
8807 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8808
8809 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8810 // Dead instructions do not need sinking. Remove them from SinkAfter.
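// (Illustrative example: in `for (i = 0; i < n; ++i) a[i] = b[i];` the scalar
// `++i` update and the latch compare are re-created for the vector loop, so the
// originals typically end up in DeadInstructions; any sinking request recorded
// for them is simply dropped below.)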
8811 for (Instruction *I : DeadInstructions)
8812 SinkAfter.erase(I);
8813
8814 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8815 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8816 VFRange SubRange = {VF, MaxVFPlusOne};
8817 VPlans.push_back(
8818 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8819 VF = SubRange.End;
8820 }
8821 }
8822
8823 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8824 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8825 const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8826
8827 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8828
8829 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8830
8831 // ---------------------------------------------------------------------------
8832 // Pre-construction: record ingredients whose recipes we'll need to further
8833 // process after constructing the initial VPlan.
8834 // ---------------------------------------------------------------------------
8835
8836 // Mark instructions we'll need to sink later and their targets as
8837 // ingredients whose recipe we'll need to record.
8838 for (auto &Entry : SinkAfter) {
8839 RecipeBuilder.recordRecipeOf(Entry.first);
8840 RecipeBuilder.recordRecipeOf(Entry.second);
8841 }
8842 for (auto &Reduction : CM.getInLoopReductionChains()) {
8843 PHINode *Phi = Reduction.first;
8844 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
8845 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8846
8847 RecipeBuilder.recordRecipeOf(Phi);
8848 for (auto &R : ReductionOperations) {
8849 RecipeBuilder.recordRecipeOf(R);
8850 // For min/max reductions, where we have a pair of icmp/select, we also
8851 // need to record the ICmp recipe, so it can be removed later.
8852 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8853 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8854 }
8855 }
8856
8857 // For each interleave group which is relevant for this (possibly trimmed)
8858 // Range, add it to the set of groups to be later applied to the VPlan and add
8859 // placeholders for its members' Recipes which we'll be replacing with a
8860 // single VPInterleaveRecipe.
8861 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8862 auto applyIG = [IG, this](ElementCount VF) -> bool {
8863 return (VF.isVector() && // Query is illegal for VF == 1
8864 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8865 LoopVectorizationCostModel::CM_Interleave);
8866 };
8867 if (!getDecisionAndClampRange(applyIG, Range))
8868 continue;
8869 InterleaveGroups.insert(IG);
8870 for (unsigned i = 0; i < IG->getFactor(); i++)
8871 if (Instruction *Member = IG->getMember(i))
8872 RecipeBuilder.recordRecipeOf(Member);
8873 };
8874
8875 // ---------------------------------------------------------------------------
8876 // Build initial VPlan: Scan the body of the loop in a topological order to
8877 // visit each basic block after having visited its predecessor basic blocks.
8878 // ---------------------------------------------------------------------------
8879
8880 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
8881 auto Plan = std::make_unique<VPlan>();
8882 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8883 Plan->setEntry(VPBB);
8884
8885 // Scan the body of the loop in a topological order to visit each basic block
8886 // after having visited its predecessor basic blocks.
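// (Illustrative: for a diamond-shaped body header -> if.then / if.else -> latch,
// the reverse post-order used below visits the header first and the latch last,
// so the masks and VPValues of a block's predecessors always exist by the time
// that block is processed.)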
8887 LoopBlocksDFS DFS(OrigLoop);
8888 DFS.perform(LI);
8889
8890 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8891 // Relevant instructions from basic block BB will be grouped into VPRecipe
8892 // ingredients and fill a new VPBasicBlock.
8893 unsigned VPBBsForBB = 0;
8894 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8895 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8896 VPBB = FirstVPBBForBB;
8897 Builder.setInsertPoint(VPBB);
8898
8899 // Introduce each ingredient into VPlan.
8900 // TODO: Model and preserve debug intrinsics in VPlan.
8901 for (Instruction &I : BB->instructionsWithoutDebug()) {
8902 Instruction *Instr = &I;
8903
8904 // First filter out irrelevant instructions, to ensure no recipes are
8905 // built for them.
8906 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8907 continue;
8908
8909 SmallVector<VPValue *, 4> Operands;
8910 auto *Phi = dyn_cast<PHINode>(Instr);
8911 if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8912 Operands.push_back(Plan->getOrAddVPValue(
8913 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8914 } else {
8915 auto OpRange = Plan->mapToVPValues(Instr->operands());
8916 Operands = {OpRange.begin(), OpRange.end()};
8917 }
8918 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8919 Instr, Operands, Range, Plan)) {
8920 // If Instr can be simplified to an existing VPValue, use it.
8921 if (RecipeOrValue.is<VPValue *>()) {
8922 Plan->addVPValue(Instr, RecipeOrValue.get<VPValue *>());
8923 continue;
8924 }
8925 // Otherwise, add the new recipe.
8926 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8927 for (auto *Def : Recipe->definedValues()) {
8928 auto *UV = Def->getUnderlyingValue();
8929 Plan->addVPValue(UV, Def);
8930 }
8931
8932 RecipeBuilder.setRecipe(Instr, Recipe);
8933 VPBB->appendRecipe(Recipe);
8934 continue;
8935 }
8936
8937 // Otherwise, if all widening options failed, Instruction is to be
8938 // replicated. This may create a successor for VPBB.
8939 VPBasicBlock *NextVPBB =
8940 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8941 if (NextVPBB != VPBB) {
8942 VPBB = NextVPBB;
8943 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8944 : "");
8945 }
8946 }
8947 }
8948
8949 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
8950 // may also be empty, such as the last one, VPBB, reflecting original
8951 // basic-blocks with no recipes.
8952 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8953 assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8954 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8955 VPBlockUtils::disconnectBlocks(PreEntry, Entry);
8956 delete PreEntry;
8957
8958 // ---------------------------------------------------------------------------
8959 // Transform initial VPlan: Apply previously taken decisions, in order, to
8960 // bring the VPlan to its final state.
8961 // ---------------------------------------------------------------------------
8962
8963 // Apply Sink-After legal constraints.
8964 for (auto &Entry : SinkAfter) {
8965 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8966 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8967 // If the target is in a replication region, make sure to move Sink to the
8968 // block after it, not into the replication region itself.
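// (Illustrative background, not specific to this function: a typical source of
// SinkAfter entries is a first-order recurrence such as `b[i] = a[i] + a[i-1]`,
// where legality asks for a use of the recurrence phi to be sunk past the
// instruction producing the next "previous" value. The case below only decides
// where the sunk recipe lands when its target sits inside a pred.* region.)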
8969 if (auto *Region = 8970 dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { 8971 if (Region->isReplicator()) { 8972 assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); 8973 VPBasicBlock *NextBlock = 8974 cast<VPBasicBlock>(Region->getSuccessors().front()); 8975 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8976 continue; 8977 } 8978 } 8979 Sink->moveAfter(Target); 8980 } 8981 8982 // Interleave memory: for each Interleave Group we marked earlier as relevant 8983 // for this VPlan, replace the Recipes widening its memory instructions with a 8984 // single VPInterleaveRecipe at its insertion point. 8985 for (auto IG : InterleaveGroups) { 8986 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8987 RecipeBuilder.getRecipe(IG->getInsertPos())); 8988 SmallVector<VPValue *, 4> StoredValues; 8989 for (unsigned i = 0; i < IG->getFactor(); ++i) 8990 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 8991 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 8992 8993 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8994 Recipe->getMask()); 8995 VPIG->insertBefore(Recipe); 8996 unsigned J = 0; 8997 for (unsigned i = 0; i < IG->getFactor(); ++i) 8998 if (Instruction *Member = IG->getMember(i)) { 8999 if (!Member->getType()->isVoidTy()) { 9000 VPValue *OriginalV = Plan->getVPValue(Member); 9001 Plan->removeVPValueFor(Member); 9002 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9003 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9004 J++; 9005 } 9006 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9007 } 9008 } 9009 9010 // Adjust the recipes for any inloop reductions. 9011 if (Range.Start.isVector()) 9012 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 9013 9014 // Finally, if tail is folded by masking, introduce selects between the phi 9015 // and the live-out instruction of each reduction, at the end of the latch. 9016 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 9017 Builder.setInsertPoint(VPBB); 9018 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9019 for (auto &Reduction : Legal->getReductionVars()) { 9020 if (CM.isInLoopReduction(Reduction.first)) 9021 continue; 9022 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 9023 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 9024 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 9025 } 9026 } 9027 9028 std::string PlanName; 9029 raw_string_ostream RSO(PlanName); 9030 ElementCount VF = Range.Start; 9031 Plan->addVF(VF); 9032 RSO << "Initial VPlan for VF={" << VF; 9033 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9034 Plan->addVF(VF); 9035 RSO << "," << VF; 9036 } 9037 RSO << "},UF>=1"; 9038 RSO.flush(); 9039 Plan->setName(PlanName); 9040 9041 return Plan; 9042 } 9043 9044 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9045 // Outer loop handling: They may require CFG and instruction level 9046 // transformations before even evaluating whether vectorization is profitable. 9047 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9048 // the vectorization pipeline. 
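// (For example, this path is normally exercised with something along the lines
// of `opt -passes=loop-vectorize -enable-vplan-native-path input.ll`; the flag
// spelling corresponds to the EnableVPlanNativePath option declared earlier in
// this file.)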
9049 assert(!OrigLoop->isInnermost());
9050 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9051
9052 // Create new empty VPlan
9053 auto Plan = std::make_unique<VPlan>();
9054
9055 // Build hierarchical CFG
9056 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9057 HCFGBuilder.buildHierarchicalCFG();
9058
9059 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9060 VF *= 2)
9061 Plan->addVF(VF);
9062
9063 if (EnableVPlanPredication) {
9064 VPlanPredicator VPP(*Plan);
9065 VPP.predicate();
9066
9067 // Avoid running transformation to recipes until masked code generation in
9068 // VPlan-native path is in place.
9069 return Plan;
9070 }
9071
9072 SmallPtrSet<Instruction *, 1> DeadInstructions;
9073 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
9074 Legal->getInductionVars(),
9075 DeadInstructions, *PSE.getSE());
9076 return Plan;
9077 }
9078
9079 // Adjust the recipes for any inloop reductions. The chain of instructions
9080 // leading from the loop exit instr to the phi needs to be converted to
9081 // reductions, with one operand being vector and the other being the scalar
9082 // reduction chain.
9083 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
9084 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
9085 for (auto &Reduction : CM.getInLoopReductionChains()) {
9086 PHINode *Phi = Reduction.first;
9087 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
9088 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9089
9090 // ReductionOperations are ordered top-down from the phi's use to the
9091 // LoopExitValue. We keep track of the previous item (the Chain) to tell
9092 // which of the two operands will remain scalar and which will be reduced.
9093 // For minmax the chain will be the select instructions.
9094 Instruction *Chain = Phi;
9095 for (Instruction *R : ReductionOperations) {
9096 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9097 RecurKind Kind = RdxDesc.getRecurrenceKind();
9098
9099 VPValue *ChainOp = Plan->getVPValue(Chain);
9100 unsigned FirstOpId;
9101 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9102 assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9103 "Expected to replace a VPWidenSelectSC");
9104 FirstOpId = 1;
9105 } else {
9106 assert(isa<VPWidenRecipe>(WidenRecipe) &&
9107 "Expected to replace a VPWidenSC");
9108 FirstOpId = 0;
9109 }
9110 unsigned VecOpId =
9111 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9112 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9113
9114 auto *CondOp = CM.foldTailByMasking()
9115 ?
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9116 : nullptr; 9117 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 9118 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9119 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 9120 Plan->removeVPValueFor(R); 9121 Plan->addVPValue(R, RedRecipe); 9122 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9123 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 9124 WidenRecipe->eraseFromParent(); 9125 9126 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9127 VPRecipeBase *CompareRecipe = 9128 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9129 assert(isa<VPWidenRecipe>(CompareRecipe) && 9130 "Expected to replace a VPWidenSC"); 9131 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9132 "Expected no remaining users"); 9133 CompareRecipe->eraseFromParent(); 9134 } 9135 Chain = R; 9136 } 9137 } 9138 } 9139 9140 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9141 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9142 VPSlotTracker &SlotTracker) const { 9143 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9144 IG->getInsertPos()->printAsOperand(O, false); 9145 O << ", "; 9146 getAddr()->printAsOperand(O, SlotTracker); 9147 VPValue *Mask = getMask(); 9148 if (Mask) { 9149 O << ", "; 9150 Mask->printAsOperand(O, SlotTracker); 9151 } 9152 for (unsigned i = 0; i < IG->getFactor(); ++i) 9153 if (Instruction *I = IG->getMember(i)) 9154 O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i; 9155 } 9156 #endif 9157 9158 void VPWidenCallRecipe::execute(VPTransformState &State) { 9159 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9160 *this, State); 9161 } 9162 9163 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9164 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9165 this, *this, InvariantCond, State); 9166 } 9167 9168 void VPWidenRecipe::execute(VPTransformState &State) { 9169 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9170 } 9171 9172 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9173 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9174 *this, State.UF, State.VF, IsPtrLoopInvariant, 9175 IsIndexLoopInvariant, State); 9176 } 9177 9178 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9179 assert(!State.Instance && "Int or FP induction being replicated."); 9180 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9181 getTruncInst(), getVPValue(0), 9182 getCastValue(), State); 9183 } 9184 9185 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9186 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc, 9187 this, State); 9188 } 9189 9190 void VPBlendRecipe::execute(VPTransformState &State) { 9191 State.ILV->setDebugLocFromInst(State.Builder, Phi); 9192 // We know that all PHIs in non-header blocks are converted into 9193 // selects, so we don't have to worry about the insertion order and we 9194 // can just use the builder. 9195 // At this point we generate the predication tree. There may be 9196 // duplications since this is a simple recursive scan, but future 9197 // optimizations will clean it up. 
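// Illustratively, for a two-way blend the loop below emits, per unrolled part,
// something like the following (types and mask names are examples; only the
// "predphi" name is taken from the code):
//   %predphi = select <VF x i1> %edge.mask.1, <VF x i32> %in1, <VF x i32> %in0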
9198 9199 unsigned NumIncoming = getNumIncomingValues(); 9200 9201 // Generate a sequence of selects of the form: 9202 // SELECT(Mask3, In3, 9203 // SELECT(Mask2, In2, 9204 // SELECT(Mask1, In1, 9205 // In0))) 9206 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9207 // are essentially undef are taken from In0. 9208 InnerLoopVectorizer::VectorParts Entry(State.UF); 9209 for (unsigned In = 0; In < NumIncoming; ++In) { 9210 for (unsigned Part = 0; Part < State.UF; ++Part) { 9211 // We might have single edge PHIs (blocks) - use an identity 9212 // 'select' for the first PHI operand. 9213 Value *In0 = State.get(getIncomingValue(In), Part); 9214 if (In == 0) 9215 Entry[Part] = In0; // Initialize with the first incoming value. 9216 else { 9217 // Select between the current value and the previous incoming edge 9218 // based on the incoming mask. 9219 Value *Cond = State.get(getMask(In), Part); 9220 Entry[Part] = 9221 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9222 } 9223 } 9224 } 9225 for (unsigned Part = 0; Part < State.UF; ++Part) 9226 State.set(this, Entry[Part], Part); 9227 } 9228 9229 void VPInterleaveRecipe::execute(VPTransformState &State) { 9230 assert(!State.Instance && "Interleave group being replicated."); 9231 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9232 getStoredValues(), getMask()); 9233 } 9234 9235 void VPReductionRecipe::execute(VPTransformState &State) { 9236 assert(!State.Instance && "Reduction being replicated."); 9237 Value *PrevInChain = State.get(getChainOp(), 0); 9238 for (unsigned Part = 0; Part < State.UF; ++Part) { 9239 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9240 bool IsOrdered = useOrderedReductions(*RdxDesc); 9241 Value *NewVecOp = State.get(getVecOp(), Part); 9242 if (VPValue *Cond = getCondOp()) { 9243 Value *NewCond = State.get(Cond, Part); 9244 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9245 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 9246 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9247 Constant *IdenVec = 9248 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 9249 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9250 NewVecOp = Select; 9251 } 9252 Value *NewRed; 9253 Value *NextInChain; 9254 if (IsOrdered) { 9255 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9256 PrevInChain); 9257 PrevInChain = NewRed; 9258 } else { 9259 PrevInChain = State.get(getChainOp(), Part); 9260 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9261 } 9262 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9263 NextInChain = 9264 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9265 NewRed, PrevInChain); 9266 } else if (IsOrdered) 9267 NextInChain = NewRed; 9268 else { 9269 NextInChain = State.Builder.CreateBinOp( 9270 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 9271 PrevInChain); 9272 } 9273 State.set(this, NextInChain, Part); 9274 } 9275 } 9276 9277 void VPReplicateRecipe::execute(VPTransformState &State) { 9278 if (State.Instance) { // Generate a single instance. 9279 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9280 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9281 *State.Instance, IsPredicated, State); 9282 // Insert scalar instance packing it into a vector. 9283 if (AlsoPack && State.VF.isVector()) { 9284 // If we're constructing lane 0, initialize to start from poison. 
9285 if (State.Instance->Lane.isFirstLane()) {
9286 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9287 Value *Poison = PoisonValue::get(
9288 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9289 State.set(this, Poison, State.Instance->Part);
9290 }
9291 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9292 }
9293 return;
9294 }
9295
9296 // Generate scalar instances for all VF lanes of all UF parts, unless the
9297 // instruction is uniform, in which case generate only the first lane for each
9298 // of the UF parts.
9299 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9300 assert((!State.VF.isScalable() || IsUniform) &&
9301 "Can't scalarize a scalable vector");
9302 for (unsigned Part = 0; Part < State.UF; ++Part)
9303 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9304 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9305 VPIteration(Part, Lane), IsPredicated,
9306 State);
9307 }
9308
9309 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9310 assert(State.Instance && "Branch on Mask works only on single instance.");
9311
9312 unsigned Part = State.Instance->Part;
9313 unsigned Lane = State.Instance->Lane.getKnownLane();
9314
9315 Value *ConditionBit = nullptr;
9316 VPValue *BlockInMask = getMask();
9317 if (BlockInMask) {
9318 ConditionBit = State.get(BlockInMask, Part);
9319 if (ConditionBit->getType()->isVectorTy())
9320 ConditionBit = State.Builder.CreateExtractElement(
9321 ConditionBit, State.Builder.getInt32(Lane));
9322 } else // Block in mask is all-one.
9323 ConditionBit = State.Builder.getTrue();
9324
9325 // Replace the temporary unreachable terminator with a new conditional branch,
9326 // whose two destinations will be set later when they are created.
9327 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9328 assert(isa<UnreachableInst>(CurrentTerminator) &&
9329 "Expected to replace unreachable terminator with conditional branch.");
9330 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9331 CondBr->setSuccessor(0, nullptr);
9332 ReplaceInstWithInst(CurrentTerminator, CondBr);
9333 }
9334
9335 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9336 assert(State.Instance && "Predicated instruction PHI works per instance.");
9337 Instruction *ScalarPredInst =
9338 cast<Instruction>(State.get(getOperand(0), *State.Instance));
9339 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9340 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9341 assert(PredicatingBB && "Predicated block has no single predecessor.");
9342 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9343 "operand must be VPReplicateRecipe");
9344
9345 // By current pack/unpack logic we need to generate only a single phi node: if
9346 // a vector value for the predicated instruction exists at this point it means
9347 // the instruction has vector users only, and a phi for the vector value is
9348 // needed. In this case the recipe of the predicated instruction is marked to
9349 // also do that packing, thereby "hoisting" the insert-element sequence.
9350 // Otherwise, a phi node for the scalar value is needed.
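// Illustrative IR for the vector-value case handled below (names are examples):
//   %vec.phi = phi <VF x ty> [ %vec.unmodified, %predicating.block ],
//                            [ %vec.with.inserted.lane, %predicated.block ]
// where %vec.with.inserted.lane is the insertelement produced by the packing
// described above.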
9351 unsigned Part = State.Instance->Part; 9352 if (State.hasVectorValue(getOperand(0), Part)) { 9353 Value *VectorValue = State.get(getOperand(0), Part); 9354 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9355 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9356 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9357 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9358 if (State.hasVectorValue(this, Part)) 9359 State.reset(this, VPhi, Part); 9360 else 9361 State.set(this, VPhi, Part); 9362 // NOTE: Currently we need to update the value of the operand, so the next 9363 // predicated iteration inserts its generated value in the correct vector. 9364 State.reset(getOperand(0), VPhi, Part); 9365 } else { 9366 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9367 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9368 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9369 PredicatingBB); 9370 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9371 if (State.hasScalarValue(this, *State.Instance)) 9372 State.reset(this, Phi, *State.Instance); 9373 else 9374 State.set(this, Phi, *State.Instance); 9375 // NOTE: Currently we need to update the value of the operand, so the next 9376 // predicated iteration inserts its generated value in the correct vector. 9377 State.reset(getOperand(0), Phi, *State.Instance); 9378 } 9379 } 9380 9381 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9382 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9383 State.ILV->vectorizeMemoryInstruction(&Ingredient, State, 9384 StoredValue ? nullptr : getVPValue(), 9385 getAddr(), StoredValue, getMask()); 9386 } 9387 9388 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9389 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9390 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9391 // for predication. 9392 static ScalarEpilogueLowering getScalarEpilogueLowering( 9393 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9394 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9395 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9396 LoopVectorizationLegality &LVL) { 9397 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9398 // don't look at hints or options, and don't request a scalar epilogue. 9399 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9400 // LoopAccessInfo (due to code dependency and not being able to reliably get 9401 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9402 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9403 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9404 // back to the old way and vectorize with versioning when forced. See D81345.) 
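// (Illustrative mapping of the inputs consulted below, assuming the usual
// spellings: `#pragma clang loop vectorize_predicate(enable)` is what typically
// sets the FK_Enabled predicate hint checked in step 3, and the
// -prefer-predicate-over-epilogue= option is what populates
// PreferPredicateOverEpilogue in step 2.)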
9405 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9406 PGSOQueryType::IRPass) && 9407 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9408 return CM_ScalarEpilogueNotAllowedOptSize; 9409 9410 // 2) If set, obey the directives 9411 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9412 switch (PreferPredicateOverEpilogue) { 9413 case PreferPredicateTy::ScalarEpilogue: 9414 return CM_ScalarEpilogueAllowed; 9415 case PreferPredicateTy::PredicateElseScalarEpilogue: 9416 return CM_ScalarEpilogueNotNeededUsePredicate; 9417 case PreferPredicateTy::PredicateOrDontVectorize: 9418 return CM_ScalarEpilogueNotAllowedUsePredicate; 9419 }; 9420 } 9421 9422 // 3) If set, obey the hints 9423 switch (Hints.getPredicate()) { 9424 case LoopVectorizeHints::FK_Enabled: 9425 return CM_ScalarEpilogueNotNeededUsePredicate; 9426 case LoopVectorizeHints::FK_Disabled: 9427 return CM_ScalarEpilogueAllowed; 9428 }; 9429 9430 // 4) if the TTI hook indicates this is profitable, request predication. 9431 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9432 LVL.getLAI())) 9433 return CM_ScalarEpilogueNotNeededUsePredicate; 9434 9435 return CM_ScalarEpilogueAllowed; 9436 } 9437 9438 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9439 // If Values have been set for this Def return the one relevant for \p Part. 9440 if (hasVectorValue(Def, Part)) 9441 return Data.PerPartOutput[Def][Part]; 9442 9443 if (!hasScalarValue(Def, {Part, 0})) { 9444 Value *IRV = Def->getLiveInIRValue(); 9445 Value *B = ILV->getBroadcastInstrs(IRV); 9446 set(Def, B, Part); 9447 return B; 9448 } 9449 9450 Value *ScalarValue = get(Def, {Part, 0}); 9451 // If we aren't vectorizing, we can just copy the scalar map values over 9452 // to the vector map. 9453 if (VF.isScalar()) { 9454 set(Def, ScalarValue, Part); 9455 return ScalarValue; 9456 } 9457 9458 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 9459 bool IsUniform = RepR && RepR->isUniform(); 9460 9461 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9462 // Check if there is a scalar value for the selected lane. 9463 if (!hasScalarValue(Def, {Part, LastLane})) { 9464 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 9465 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 9466 "unexpected recipe found to be invariant"); 9467 IsUniform = true; 9468 LastLane = 0; 9469 } 9470 9471 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 9472 9473 // Set the insert point after the last scalarized instruction. This 9474 // ensures the insertelement sequence will directly follow the scalar 9475 // definitions. 9476 auto OldIP = Builder.saveIP(); 9477 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 9478 Builder.SetInsertPoint(&*NewIP); 9479 9480 // However, if we are vectorizing, we need to construct the vector values. 9481 // If the value is known to be uniform after vectorization, we can just 9482 // broadcast the scalar value corresponding to lane zero for each unroll 9483 // iteration. Otherwise, we construct the vector values using 9484 // insertelement instructions. Since the resulting vectors are stored in 9485 // State, we will only generate the insertelements once. 9486 Value *VectorValue = nullptr; 9487 if (IsUniform) { 9488 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 9489 set(Def, VectorValue, Part); 9490 } else { 9491 // Initialize packing with insertelements to start from undef. 
9492 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 9493 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 9494 set(Def, Undef, Part); 9495 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 9496 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 9497 VectorValue = get(Def, Part); 9498 } 9499 Builder.restoreIP(OldIP); 9500 return VectorValue; 9501 } 9502 9503 // Process the loop in the VPlan-native vectorization path. This path builds 9504 // VPlan upfront in the vectorization pipeline, which allows to apply 9505 // VPlan-to-VPlan transformations from the very beginning without modifying the 9506 // input LLVM IR. 9507 static bool processLoopInVPlanNativePath( 9508 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9509 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9510 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9511 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9512 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 9513 LoopVectorizationRequirements &Requirements) { 9514 9515 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9516 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9517 return false; 9518 } 9519 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9520 Function *F = L->getHeader()->getParent(); 9521 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9522 9523 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9524 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 9525 9526 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9527 &Hints, IAI); 9528 // Use the planner for outer loop vectorization. 9529 // TODO: CM is not used at this point inside the planner. Turn CM into an 9530 // optional argument if we don't need it in the future. 9531 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 9532 Requirements, ORE); 9533 9534 // Get user vectorization factor. 9535 ElementCount UserVF = Hints.getWidth(); 9536 9537 // Plan how to best vectorize, return the best VF and its cost. 9538 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9539 9540 // If we are stress testing VPlan builds, do not attempt to generate vector 9541 // code. Masked vector code generation support will follow soon. 9542 // Also, do not attempt to vectorize if no vector code will be produced. 9543 if (VPlanBuildStressTest || EnableVPlanPredication || 9544 VectorizationFactor::Disabled() == VF) 9545 return false; 9546 9547 LVP.setBestPlan(VF.Width, 1); 9548 9549 { 9550 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 9551 F->getParent()->getDataLayout()); 9552 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 9553 &CM, BFI, PSI, Checks); 9554 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9555 << L->getHeader()->getParent()->getName() << "\"\n"); 9556 LVP.executePlan(LB, DT); 9557 } 9558 9559 // Mark the loop as already vectorized to avoid vectorizing again. 9560 Hints.setAlreadyVectorized(); 9561 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9562 return true; 9563 } 9564 9565 // Emit a remark if there are stores to floats that required a floating point 9566 // extension. If the vectorized loop was generated with floating point there 9567 // will be a performance penalty from the conversion overhead and the change in 9568 // the vector width. 
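// Illustrative source pattern that leads to this remark (assuming a float
// array and a double scale factor):
//   float *A; double K;
//   A[i] = K * A[i]; // A[i] is fpext'd to double and the product is truncated
//                    // back to float for the store, changing the vector width
//                    // in the middle of the chain.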
9569 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 9570 SmallVector<Instruction *, 4> Worklist; 9571 for (BasicBlock *BB : L->getBlocks()) { 9572 for (Instruction &Inst : *BB) { 9573 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 9574 if (S->getValueOperand()->getType()->isFloatTy()) 9575 Worklist.push_back(S); 9576 } 9577 } 9578 } 9579 9580 // Traverse the floating point stores upwards searching, for floating point 9581 // conversions. 9582 SmallPtrSet<const Instruction *, 4> Visited; 9583 SmallPtrSet<const Instruction *, 4> EmittedRemark; 9584 while (!Worklist.empty()) { 9585 auto *I = Worklist.pop_back_val(); 9586 if (!L->contains(I)) 9587 continue; 9588 if (!Visited.insert(I).second) 9589 continue; 9590 9591 // Emit a remark if the floating point store required a floating 9592 // point conversion. 9593 // TODO: More work could be done to identify the root cause such as a 9594 // constant or a function return type and point the user to it. 9595 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 9596 ORE->emit([&]() { 9597 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 9598 I->getDebugLoc(), L->getHeader()) 9599 << "floating point conversion changes vector width. " 9600 << "Mixed floating point precision requires an up/down " 9601 << "cast that will negatively impact performance."; 9602 }); 9603 9604 for (Use &Op : I->operands()) 9605 if (auto *OpI = dyn_cast<Instruction>(Op)) 9606 Worklist.push_back(OpI); 9607 } 9608 } 9609 9610 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9611 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9612 !EnableLoopInterleaving), 9613 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9614 !EnableLoopVectorization) {} 9615 9616 bool LoopVectorizePass::processLoop(Loop *L) { 9617 assert((EnableVPlanNativePath || L->isInnermost()) && 9618 "VPlan-native path is not enabled. Only process inner loops."); 9619 9620 #ifndef NDEBUG 9621 const std::string DebugLocStr = getDebugLocString(L); 9622 #endif /* NDEBUG */ 9623 9624 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 9625 << L->getHeader()->getParent()->getName() << "\" from " 9626 << DebugLocStr << "\n"); 9627 9628 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 9629 9630 LLVM_DEBUG( 9631 dbgs() << "LV: Loop hints:" 9632 << " force=" 9633 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9634 ? "disabled" 9635 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9636 ? "enabled" 9637 : "?")) 9638 << " width=" << Hints.getWidth() 9639 << " unroll=" << Hints.getInterleave() << "\n"); 9640 9641 // Function containing loop 9642 Function *F = L->getHeader()->getParent(); 9643 9644 // Looking at the diagnostic output is the only way to determine if a loop 9645 // was vectorized (other than looking at the IR or machine code), so it 9646 // is important to generate an optimization remark for each loop. Most of 9647 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9648 // generated as OptimizationRemark and OptimizationRemarkMissed are 9649 // less verbose reporting vectorized loops and unvectorized loops that may 9650 // benefit from vectorization, respectively. 9651 9652 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9653 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9654 return false; 9655 } 9656 9657 PredicatedScalarEvolution PSE(*SE, *L); 9658 9659 // Check if it is legal to vectorize the loop. 
  LoopVectorizationRequirements Requirements;
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints, Requirements);

  assert(L->isInnermost() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem right -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
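  // (Illustrative note, not from the original source: "potentially unsafe"
  // refers to targets whose vector FP instructions may not match the scalar
  // semantics, for example with respect to denormal or signaling-NaN handling,
  // so results could change when a numerically sensitive loop is widened.)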
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  if (!Requirements.canVectorizeFPMath(Hints)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
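    // (Illustrative, not from the original source: IC is the number of
    // vector-width chunks executed back-to-back per wide iteration, e.g. with
    // VF = 4 and IC = 2 each wide iteration handles 8 elements. IC == 1 here
    // means the cost model saw no latency-hiding or ILP benefit from doing
    // more than one chunk.)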
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out not
    // to be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
    LVP.setBestPlan(VF.Width, IC);

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not profitable to vectorize the loop, then
      // interleave it.
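      // (Illustrative example, not from the original source: with IC == 2, a
      // scalar reduction such as
      //   for (int I = 0; I < N; ++I) Sum += A[I];
      // is emitted as two interleaved scalar streams whose partial sums are
      // combined after the loop, which can hide load and add latency even
      // though no vector instructions are produced.)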
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);
      LVP.executePlan(Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is profitable to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
                                          EpilogueVF.Width.getKnownMinValue(),
                                          1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
        LVP.executePlan(MainILV, DT);
        ++LoopsVectorized;

        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // The second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
        LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);
        LVP.executePlan(EpilogILV, DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);
        LVP.executePlan(LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling of the scalar loop when
        // there are no runtime checks about strides and memory. A scalar loop
        // that is rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
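    // (Illustrative, not from the original source: setAlreadyVectorized()
    // attaches loop metadata along the lines of
    //   !llvm.loop !{..., !{!"llvm.loop.isvectorized", i32 1}}
    // so that later runs of the vectorizer skip this loop.)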
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve the LoopInfo/DominatorTree analyses with
  // outer loop vectorization. Until this is addressed, mark these analyses as
  // preserved only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for the VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}
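// (Illustrative usage note, not part of the original source: with the new pass
// manager this pass is reached through the default pipelines or can be run
// directly, e.g.
//   opt -passes=loop-vectorize -S input.ll -o vectorized.ll
// and the remarks emitted above can be inspected with
// -pass-remarks=loop-vectorize or -pass-remarks-analysis=loop-vectorize.)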