//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
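//
// As an illustration of the widening transformation described above (a
// conceptual sketch, not a literal dump of the pass's output), a scalar loop
// such as:
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 42;
//
// is rewritten with a vectorization factor (VF) of 4 so that each iteration
// of the new loop processes four consecutive elements and the induction
// variable advances by 4 instead of 1:
//
//   for (int i = 0; i + 3 < n; i += 4)
//     a[i:i+3] = b[i:i+3] + <42, 42, 42, 42>; // one SIMD add per iteration
//   // leftover iterations run in a scalar epilogue loop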
53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanHCFGBuilder.h" 61 #include "VPlanPredicator.h" 62 #include "VPlanTransforms.h" 63 #include "llvm/ADT/APInt.h" 64 #include "llvm/ADT/ArrayRef.h" 65 #include "llvm/ADT/DenseMap.h" 66 #include "llvm/ADT/DenseMapInfo.h" 67 #include "llvm/ADT/Hashing.h" 68 #include "llvm/ADT/MapVector.h" 69 #include "llvm/ADT/None.h" 70 #include "llvm/ADT/Optional.h" 71 #include "llvm/ADT/STLExtras.h" 72 #include "llvm/ADT/SetVector.h" 73 #include "llvm/ADT/SmallPtrSet.h" 74 #include "llvm/ADT/SmallVector.h" 75 #include "llvm/ADT/Statistic.h" 76 #include "llvm/ADT/StringRef.h" 77 #include "llvm/ADT/Twine.h" 78 #include "llvm/ADT/iterator_range.h" 79 #include "llvm/Analysis/AssumptionCache.h" 80 #include "llvm/Analysis/BasicAliasAnalysis.h" 81 #include "llvm/Analysis/BlockFrequencyInfo.h" 82 #include "llvm/Analysis/CFG.h" 83 #include "llvm/Analysis/CodeMetrics.h" 84 #include "llvm/Analysis/DemandedBits.h" 85 #include "llvm/Analysis/GlobalsModRef.h" 86 #include "llvm/Analysis/LoopAccessAnalysis.h" 87 #include "llvm/Analysis/LoopAnalysisManager.h" 88 #include "llvm/Analysis/LoopInfo.h" 89 #include "llvm/Analysis/LoopIterator.h" 90 #include "llvm/Analysis/MemorySSA.h" 91 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 92 #include "llvm/Analysis/ProfileSummaryInfo.h" 93 #include "llvm/Analysis/ScalarEvolution.h" 94 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 95 #include "llvm/Analysis/TargetLibraryInfo.h" 96 #include "llvm/Analysis/TargetTransformInfo.h" 97 #include "llvm/Analysis/VectorUtils.h" 98 #include "llvm/IR/Attributes.h" 99 #include "llvm/IR/BasicBlock.h" 100 #include "llvm/IR/CFG.h" 101 #include "llvm/IR/Constant.h" 102 #include "llvm/IR/Constants.h" 103 #include "llvm/IR/DataLayout.h" 104 #include "llvm/IR/DebugInfoMetadata.h" 105 #include "llvm/IR/DebugLoc.h" 106 #include "llvm/IR/DerivedTypes.h" 107 #include "llvm/IR/DiagnosticInfo.h" 108 #include "llvm/IR/Dominators.h" 109 #include "llvm/IR/Function.h" 110 #include "llvm/IR/IRBuilder.h" 111 #include "llvm/IR/InstrTypes.h" 112 #include "llvm/IR/Instruction.h" 113 #include "llvm/IR/Instructions.h" 114 #include "llvm/IR/IntrinsicInst.h" 115 #include "llvm/IR/Intrinsics.h" 116 #include "llvm/IR/LLVMContext.h" 117 #include "llvm/IR/Metadata.h" 118 #include "llvm/IR/Module.h" 119 #include "llvm/IR/Operator.h" 120 #include "llvm/IR/Type.h" 121 #include "llvm/IR/Use.h" 122 #include "llvm/IR/User.h" 123 #include "llvm/IR/Value.h" 124 #include "llvm/IR/ValueHandle.h" 125 #include "llvm/IR/Verifier.h" 126 #include "llvm/InitializePasses.h" 127 #include "llvm/Pass.h" 128 #include "llvm/Support/Casting.h" 129 #include "llvm/Support/CommandLine.h" 130 #include "llvm/Support/Compiler.h" 131 #include "llvm/Support/Debug.h" 132 #include "llvm/Support/ErrorHandling.h" 133 #include "llvm/Support/InstructionCost.h" 134 #include "llvm/Support/MathExtras.h" 135 #include "llvm/Support/raw_ostream.h" 136 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 137 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 138 #include "llvm/Transforms/Utils/LoopSimplify.h" 139 #include "llvm/Transforms/Utils/LoopUtils.h" 140 #include "llvm/Transforms/Utils/LoopVersioning.h" 141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 142 #include "llvm/Transforms/Utils/SizeOpts.h" 143 #include 
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 202 // that predication is preferred, and this lists all options. I.e., the 203 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 204 // and predicate the instructions accordingly. 
If tail-folding fails, there are 205 // different fallback strategies depending on these values: 206 namespace PreferPredicateTy { 207 enum Option { 208 ScalarEpilogue = 0, 209 PredicateElseScalarEpilogue, 210 PredicateOrDontVectorize 211 }; 212 } // namespace PreferPredicateTy 213 214 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( 215 "prefer-predicate-over-epilogue", 216 cl::init(PreferPredicateTy::ScalarEpilogue), 217 cl::Hidden, 218 cl::desc("Tail-folding and predication preferences over creating a scalar " 219 "epilogue loop."), 220 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, 221 "scalar-epilogue", 222 "Don't tail-predicate loops, create scalar epilogue"), 223 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, 224 "predicate-else-scalar-epilogue", 225 "prefer tail-folding, create scalar epilogue if tail " 226 "folding fails."), 227 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, 228 "predicate-dont-vectorize", 229 "prefers tail-folding, don't attempt vectorization if " 230 "tail-folding fails."))); 231 232 static cl::opt<bool> MaximizeBandwidth( 233 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, 234 cl::desc("Maximize bandwidth when selecting vectorization factor which " 235 "will be determined by the smallest type in loop.")); 236 237 static cl::opt<bool> EnableInterleavedMemAccesses( 238 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, 239 cl::desc("Enable vectorization on interleaved memory accesses in a loop")); 240 241 /// An interleave-group may need masking if it resides in a block that needs 242 /// predication, or in order to mask away gaps. 243 static cl::opt<bool> EnableMaskedInterleavedMemAccesses( 244 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, 245 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); 246 247 static cl::opt<unsigned> TinyTripCountInterleaveThreshold( 248 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, 249 cl::desc("We don't interleave loops with a estimated constant trip count " 250 "below this number")); 251 252 static cl::opt<unsigned> ForceTargetNumScalarRegs( 253 "force-target-num-scalar-regs", cl::init(0), cl::Hidden, 254 cl::desc("A flag that overrides the target's number of scalar registers.")); 255 256 static cl::opt<unsigned> ForceTargetNumVectorRegs( 257 "force-target-num-vector-regs", cl::init(0), cl::Hidden, 258 cl::desc("A flag that overrides the target's number of vector registers.")); 259 260 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( 261 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, 262 cl::desc("A flag that overrides the target's max interleave factor for " 263 "scalar loops.")); 264 265 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( 266 "force-target-max-vector-interleave", cl::init(0), cl::Hidden, 267 cl::desc("A flag that overrides the target's max interleave factor for " 268 "vectorized loops.")); 269 270 static cl::opt<unsigned> ForceTargetInstructionCost( 271 "force-target-instruction-cost", cl::init(0), cl::Hidden, 272 cl::desc("A flag that overrides the target's expected cost for " 273 "an instruction to a single constant value. 
Mostly " 274 "useful for getting consistent testing.")); 275 276 static cl::opt<bool> ForceTargetSupportsScalableVectors( 277 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 278 cl::desc( 279 "Pretend that scalable vectors are supported, even if the target does " 280 "not support them. This flag should only be used for testing.")); 281 282 static cl::opt<unsigned> SmallLoopCost( 283 "small-loop-cost", cl::init(20), cl::Hidden, 284 cl::desc( 285 "The cost of a loop that is considered 'small' by the interleaver.")); 286 287 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 288 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 289 cl::desc("Enable the use of the block frequency analysis to access PGO " 290 "heuristics minimizing code growth in cold regions and being more " 291 "aggressive in hot regions.")); 292 293 // Runtime interleave loops for load/store throughput. 294 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 295 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 296 cl::desc( 297 "Enable runtime interleaving until load/store ports are saturated")); 298 299 /// Interleave small loops with scalar reductions. 300 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 301 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 302 cl::desc("Enable interleaving for loops with small iteration counts that " 303 "contain scalar reductions to expose ILP.")); 304 305 /// The number of stores in a loop that are allowed to need predication. 306 static cl::opt<unsigned> NumberOfStoresToPredicate( 307 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 308 cl::desc("Max number of stores to be predicated behind an if.")); 309 310 static cl::opt<bool> EnableIndVarRegisterHeur( 311 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 312 cl::desc("Count the induction variable only once when interleaving")); 313 314 static cl::opt<bool> EnableCondStoresVectorization( 315 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 316 cl::desc("Enable if predication of stores during vectorization.")); 317 318 static cl::opt<unsigned> MaxNestedScalarReductionIC( 319 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 320 cl::desc("The maximum interleave count to use when interleaving a scalar " 321 "reduction in a nested loop.")); 322 323 static cl::opt<bool> 324 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 325 cl::Hidden, 326 cl::desc("Prefer in-loop vector reductions, " 327 "overriding the targets preference.")); 328 329 static cl::opt<bool> PreferPredicatedReductionSelect( 330 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 331 cl::desc( 332 "Prefer predicating a reduction operation over an after loop select.")); 333 334 cl::opt<bool> EnableVPlanNativePath( 335 "enable-vplan-native-path", cl::init(false), cl::Hidden, 336 cl::desc("Enable VPlan-native vectorization path with " 337 "support for outer loop vectorization.")); 338 339 // FIXME: Remove this switch once we have divergence analysis. Currently we 340 // assume divergent non-backedge branches when this switch is true. 341 cl::opt<bool> EnableVPlanPredication( 342 "enable-vplan-predication", cl::init(false), cl::Hidden, 343 cl::desc("Enable VPlan-native vectorization path predicator with " 344 "support for outer loop vectorization.")); 345 346 // This flag enables the stress testing of the VPlan H-CFG construction in the 347 // VPlan-native vectorization path. 
It must be used in conjuction with 348 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the 349 // verification of the H-CFGs built. 350 static cl::opt<bool> VPlanBuildStressTest( 351 "vplan-build-stress-test", cl::init(false), cl::Hidden, 352 cl::desc( 353 "Build VPlan for every supported loop nest in the function and bail " 354 "out right after the build (stress test the VPlan H-CFG construction " 355 "in the VPlan-native vectorization path).")); 356 357 cl::opt<bool> llvm::EnableLoopInterleaving( 358 "interleave-loops", cl::init(true), cl::Hidden, 359 cl::desc("Enable loop interleaving in Loop vectorization passes")); 360 cl::opt<bool> llvm::EnableLoopVectorization( 361 "vectorize-loops", cl::init(true), cl::Hidden, 362 cl::desc("Run the Loop vectorization passes")); 363 364 /// A helper function that returns the type of loaded or stored value. 365 static Type *getMemInstValueType(Value *I) { 366 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 367 "Expected Load or Store instruction"); 368 if (auto *LI = dyn_cast<LoadInst>(I)) 369 return LI->getType(); 370 return cast<StoreInst>(I)->getValueOperand()->getType(); 371 } 372 373 /// A helper function that returns true if the given type is irregular. The 374 /// type is irregular if its allocated size doesn't equal the store size of an 375 /// element of the corresponding vector type at the given vectorization factor. 376 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) { 377 // Determine if an array of VF elements of type Ty is "bitcast compatible" 378 // with a <VF x Ty> vector. 379 if (VF.isVector()) { 380 auto *VectorTy = VectorType::get(Ty, VF); 381 return TypeSize::get(VF.getKnownMinValue() * 382 DL.getTypeAllocSize(Ty).getFixedValue(), 383 VF.isScalable()) != DL.getTypeStoreSize(VectorTy); 384 } 385 386 // If the vectorization factor is one, we just check if an array of type Ty 387 // requires padding between elements. 388 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); 389 } 390 391 /// A helper function that returns the reciprocal of the block probability of 392 /// predicated blocks. If we return X, we are assuming the predicated block 393 /// will execute once for every X iterations of the loop header. 394 /// 395 /// TODO: We should use actual block probability here, if available. Currently, 396 /// we always assume predicated blocks have a 50% chance of executing. 397 static unsigned getReciprocalPredBlockProb() { return 2; } 398 399 /// A helper function that adds a 'fast' flag to floating-point operations. 400 static Value *addFastMathFlag(Value *V) { 401 if (isa<FPMathOperator>(V)) 402 cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast()); 403 return V; 404 } 405 406 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) { 407 if (isa<FPMathOperator>(V)) 408 cast<Instruction>(V)->setFastMathFlags(FMF); 409 return V; 410 } 411 412 /// A helper function that returns an integer or floating-point constant with 413 /// value C. 414 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { 415 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) 416 : ConstantFP::get(Ty, C); 417 } 418 419 /// Returns "best known" trip count for the specified loop \p L as defined by 420 /// the following procedure: 421 /// 1) Returns exact trip count if it is known. 422 /// 2) Returns expected trip count according to profile data if any. 423 /// 3) Returns upper bound estimate if it is known. 424 /// 4) Returns None if all of the above failed. 
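///
/// Illustrative example (hypothetical numbers): for a loop whose exit test
/// yields an exact SCEV trip count of 100, 100 is returned even if profile
/// data suggests a different estimate; the profile-based estimate, and then
/// the constant maximum trip count, are consulted only when no exact count
/// is known.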
425 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { 426 // Check if exact trip count is known. 427 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) 428 return ExpectedTC; 429 430 // Check if there is an expected trip count available from profile data. 431 if (LoopVectorizeWithBlockFrequency) 432 if (auto EstimatedTC = getLoopEstimatedTripCount(L)) 433 return EstimatedTC; 434 435 // Check if upper bound estimate is known. 436 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) 437 return ExpectedTC; 438 439 return None; 440 } 441 442 namespace llvm { 443 444 /// InnerLoopVectorizer vectorizes loops which contain only one basic 445 /// block to a specified vectorization factor (VF). 446 /// This class performs the widening of scalars into vectors, or multiple 447 /// scalars. This class also implements the following features: 448 /// * It inserts an epilogue loop for handling loops that don't have iteration 449 /// counts that are known to be a multiple of the vectorization factor. 450 /// * It handles the code generation for reduction variables. 451 /// * Scalarization (implementation using scalars) of un-vectorizable 452 /// instructions. 453 /// InnerLoopVectorizer does not perform any vectorization-legality 454 /// checks, and relies on the caller to check for the different legality 455 /// aspects. The InnerLoopVectorizer relies on the 456 /// LoopVectorizationLegality class to provide information about the induction 457 /// and reduction variables that were found to a given vectorization factor. 458 class InnerLoopVectorizer { 459 public: 460 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 461 LoopInfo *LI, DominatorTree *DT, 462 const TargetLibraryInfo *TLI, 463 const TargetTransformInfo *TTI, AssumptionCache *AC, 464 OptimizationRemarkEmitter *ORE, ElementCount VecWidth, 465 unsigned UnrollFactor, LoopVectorizationLegality *LVL, 466 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 467 ProfileSummaryInfo *PSI) 468 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), 469 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), 470 Builder(PSE.getSE()->getContext()), 471 VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM), 472 BFI(BFI), PSI(PSI) { 473 // Query this against the original loop and save it here because the profile 474 // of the original loop header may change as the transformation happens. 475 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( 476 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); 477 } 478 479 virtual ~InnerLoopVectorizer() = default; 480 481 /// Create a new empty loop that will contain vectorized instructions later 482 /// on, while the old loop will be used as the scalar remainder. Control flow 483 /// is generated around the vectorized (and scalar epilogue) loops consisting 484 /// of various checks and bypasses. Return the pre-header block of the new 485 /// loop. 486 /// In the case of epilogue vectorization, this function is overriden to 487 /// handle the more complex control flow around the loops. 488 virtual BasicBlock *createVectorizedLoopSkeleton(); 489 490 /// Widen a single instruction within the innermost loop. 491 void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands, 492 VPTransformState &State); 493 494 /// Widen a single call instruction within the innermost loop. 
495 void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, 496 VPTransformState &State); 497 498 /// Widen a single select instruction within the innermost loop. 499 void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, 500 bool InvariantCond, VPTransformState &State); 501 502 /// Fix the vectorized code, taking care of header phi's, live-outs, and more. 503 void fixVectorizedLoop(); 504 505 // Return true if any runtime check is added. 506 bool areSafetyChecksAdded() { return AddedSafetyChecks; } 507 508 /// A type for vectorized values in the new loop. Each value from the 509 /// original loop, when vectorized, is represented by UF vector values in the 510 /// new unrolled loop, where UF is the unroll factor. 511 using VectorParts = SmallVector<Value *, 2>; 512 513 /// Vectorize a single GetElementPtrInst based on information gathered and 514 /// decisions taken during planning. 515 void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, 516 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, 517 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); 518 519 /// Vectorize a single PHINode in a block. This method handles the induction 520 /// variable canonicalization. It supports both VF = 1 for unrolled loops and 521 /// arbitrary length vectors. 522 void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc, 523 Value *StartV, unsigned UF, ElementCount VF); 524 525 /// A helper function to scalarize a single Instruction in the innermost loop. 526 /// Generates a sequence of scalar instances for each lane between \p MinLane 527 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 528 /// inclusive. Uses the VPValue operands from \p Operands instead of \p 529 /// Instr's operands. 530 void scalarizeInstruction(Instruction *Instr, VPUser &Operands, 531 const VPIteration &Instance, bool IfPredicateInstr, 532 VPTransformState &State); 533 534 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc 535 /// is provided, the integer induction variable will first be truncated to 536 /// the corresponding type. 537 void widenIntOrFpInduction(PHINode *IV, Value *Start, 538 TruncInst *Trunc = nullptr); 539 540 /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a 541 /// vector or scalar value on-demand if one is not yet available. When 542 /// vectorizing a loop, we visit the definition of an instruction before its 543 /// uses. When visiting the definition, we either vectorize or scalarize the 544 /// instruction, creating an entry for it in the corresponding map. (In some 545 /// cases, such as induction variables, we will create both vector and scalar 546 /// entries.) Then, as we encounter uses of the definition, we derive values 547 /// for each scalar or vector use unless such a value is already available. 548 /// For example, if we scalarize a definition and one of its uses is vector, 549 /// we build the required vector on-demand with an insertelement sequence 550 /// when visiting the use. Otherwise, if the use is scalar, we can use the 551 /// existing scalar definition. 552 /// 553 /// Return a value in the new loop corresponding to \p V from the original 554 /// loop at unroll index \p Part. If the value has already been vectorized, 555 /// the corresponding vector entry in VectorLoopValueMap is returned. 
If, 556 /// however, the value has a scalar entry in VectorLoopValueMap, we construct 557 /// a new vector value on-demand by inserting the scalar values into a vector 558 /// with an insertelement sequence. If the value has been neither vectorized 559 /// nor scalarized, it must be loop invariant, so we simply broadcast the 560 /// value into a vector. 561 Value *getOrCreateVectorValue(Value *V, unsigned Part); 562 563 void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) { 564 VectorLoopValueMap.setVectorValue(Scalar, Part, Vector); 565 } 566 567 /// Return a value in the new loop corresponding to \p V from the original 568 /// loop at unroll and vector indices \p Instance. If the value has been 569 /// vectorized but not scalarized, the necessary extractelement instruction 570 /// will be generated. 571 Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance); 572 573 /// Construct the vector value of a scalarized value \p V one lane at a time. 574 void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); 575 576 /// Try to vectorize interleaved access group \p Group with the base address 577 /// given in \p Addr, optionally masking the vector operations if \p 578 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 579 /// values in the vectorized loop. 580 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 581 ArrayRef<VPValue *> VPDefs, 582 VPTransformState &State, VPValue *Addr, 583 ArrayRef<VPValue *> StoredValues, 584 VPValue *BlockInMask = nullptr); 585 586 /// Vectorize Load and Store instructions with the base address given in \p 587 /// Addr, optionally masking the vector operations if \p BlockInMask is 588 /// non-null. Use \p State to translate given VPValues to IR values in the 589 /// vectorized loop. 590 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, 591 VPValue *Def, VPValue *Addr, 592 VPValue *StoredValue, VPValue *BlockInMask); 593 594 /// Set the debug location in the builder using the debug location in 595 /// the instruction. 596 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); 597 598 /// Fix the non-induction PHIs in the OrigPHIsToFix vector. 599 void fixNonInductionPHIs(void); 600 601 protected: 602 friend class LoopVectorizationPlanner; 603 604 /// A small list of PHINodes. 605 using PhiVector = SmallVector<PHINode *, 4>; 606 607 /// A type for scalarized values in the new loop. Each value from the 608 /// original loop, when scalarized, is represented by UF x VF scalar values 609 /// in the new unrolled loop, where UF is the unroll factor and VF is the 610 /// vectorization factor. 611 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 612 613 /// Set up the values of the IVs correctly when exiting the vector loop. 614 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 615 Value *CountRoundDown, Value *EndValue, 616 BasicBlock *MiddleBlock); 617 618 /// Create a new induction variable inside L. 619 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, 620 Value *Step, Instruction *DL); 621 622 /// Handle all cross-iteration phis in the header. 623 void fixCrossIterationPHIs(); 624 625 /// Fix a first-order recurrence. This is the second phase of vectorizing 626 /// this phi node. 627 void fixFirstOrderRecurrence(PHINode *Phi); 628 629 /// Fix a reduction cross-iteration phi. This is the second phase of 630 /// vectorizing this phi node. 
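  ///
  /// Conceptual sketch (simplified): for a sum reduction
  ///
  ///   s = 0;
  ///   for (i = 0; i < n; ++i)
  ///     s += a[i];
  ///
  /// the vector loop accumulates partial sums in a vector phi; this second
  /// phase emits code in the middle block that reduces the vector of partial
  /// sums to a single scalar, which then feeds the scalar epilogue and any
  /// users of the reduction outside the loop.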
631 void fixReduction(PHINode *Phi); 632 633 /// Clear NSW/NUW flags from reduction instructions if necessary. 634 void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc); 635 636 /// The Loop exit block may have single value PHI nodes with some 637 /// incoming value. While vectorizing we only handled real values 638 /// that were defined inside the loop and we should have one value for 639 /// each predecessor of its parent basic block. See PR14725. 640 void fixLCSSAPHIs(); 641 642 /// Iteratively sink the scalarized operands of a predicated instruction into 643 /// the block that was created for it. 644 void sinkScalarOperands(Instruction *PredInst); 645 646 /// Shrinks vector element sizes to the smallest bitwidth they can be legally 647 /// represented as. 648 void truncateToMinimalBitwidths(); 649 650 /// Create a broadcast instruction. This method generates a broadcast 651 /// instruction (shuffle) for loop invariant values and for the induction 652 /// value. If this is the induction variable then we extend it to N, N+1, ... 653 /// this is needed because each iteration in the loop corresponds to a SIMD 654 /// element. 655 virtual Value *getBroadcastInstrs(Value *V); 656 657 /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) 658 /// to each vector element of Val. The sequence starts at StartIndex. 659 /// \p Opcode is relevant for FP induction variable. 660 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step, 661 Instruction::BinaryOps Opcode = 662 Instruction::BinaryOpsEnd); 663 664 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 665 /// variable on which to base the steps, \p Step is the size of the step, and 666 /// \p EntryVal is the value from the original loop that maps to the steps. 667 /// Note that \p EntryVal doesn't have to be an induction variable - it 668 /// can also be a truncate instruction. 669 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, 670 const InductionDescriptor &ID); 671 672 /// Create a vector induction phi node based on an existing scalar one. \p 673 /// EntryVal is the value from the original loop that maps to the vector phi 674 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a 675 /// truncate instruction, instead of widening the original IV, we widen a 676 /// version of the IV truncated to \p EntryVal's type. 677 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, 678 Value *Step, Value *Start, 679 Instruction *EntryVal); 680 681 /// Returns true if an instruction \p I should be scalarized instead of 682 /// vectorized for the chosen vectorization factor. 683 bool shouldScalarizeInstruction(Instruction *I) const; 684 685 /// Returns true if we should generate a scalar version of \p IV. 686 bool needsScalarInduction(Instruction *IV) const; 687 688 /// If there is a cast involved in the induction variable \p ID, which should 689 /// be ignored in the vectorized loop body, this function records the 690 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the 691 /// cast. We had already proved that the casted Phi is equal to the uncasted 692 /// Phi in the vectorized loop (under a runtime guard), and therefore 693 /// there is no need to vectorize the cast - the same value can be used in the 694 /// vector loop for both the Phi and the cast. 695 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, 696 /// Otherwise, \p VectorLoopValue is a widened/vectorized value. 
697 /// 698 /// \p EntryVal is the value from the original loop that maps to the vector 699 /// phi node and is used to distinguish what is the IV currently being 700 /// processed - original one (if \p EntryVal is a phi corresponding to the 701 /// original IV) or the "newly-created" one based on the proof mentioned above 702 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the 703 /// latter case \p EntryVal is a TruncInst and we must not record anything for 704 /// that IV, but it's error-prone to expect callers of this routine to care 705 /// about that, hence this explicit parameter. 706 void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID, 707 const Instruction *EntryVal, 708 Value *VectorLoopValue, 709 unsigned Part, 710 unsigned Lane = UINT_MAX); 711 712 /// Generate a shuffle sequence that will reverse the vector Vec. 713 virtual Value *reverseVector(Value *Vec); 714 715 /// Returns (and creates if needed) the original loop trip count. 716 Value *getOrCreateTripCount(Loop *NewLoop); 717 718 /// Returns (and creates if needed) the trip count of the widened loop. 719 Value *getOrCreateVectorTripCount(Loop *NewLoop); 720 721 /// Returns a bitcasted value to the requested vector type. 722 /// Also handles bitcasts of vector<float> <-> vector<pointer> types. 723 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, 724 const DataLayout &DL); 725 726 /// Emit a bypass check to see if the vector trip count is zero, including if 727 /// it overflows. 728 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); 729 730 /// Emit a bypass check to see if all of the SCEV assumptions we've 731 /// had to make are correct. 732 void emitSCEVChecks(Loop *L, BasicBlock *Bypass); 733 734 /// Emit bypass checks to check any memory assumptions we may have made. 735 void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); 736 737 /// Compute the transformed value of Index at offset StartValue using step 738 /// StepValue. 739 /// For integer induction, returns StartValue + Index * StepValue. 740 /// For pointer induction, returns StartValue[Index * StepValue]. 741 /// FIXME: The newly created binary instructions should contain nsw/nuw 742 /// flags, which can be found from the original scalar operations. 743 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, 744 const DataLayout &DL, 745 const InductionDescriptor &ID) const; 746 747 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 748 /// vector loop preheader, middle block and scalar preheader. Also 749 /// allocate a loop object for the new vector loop and return it. 750 Loop *createVectorLoopSkeleton(StringRef Prefix); 751 752 /// Create new phi nodes for the induction variables to resume iteration count 753 /// in the scalar epilogue, from where the vectorized loop left off (given by 754 /// \p VectorTripCount). 755 /// In cases where the loop skeleton is more complicated (eg. epilogue 756 /// vectorization) and the resume values can come from an additional bypass 757 /// block, the \p AdditionalBypass pair provides information about the bypass 758 /// block and the end value on the edge from bypass to this loop. 
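  ///
  /// Simplified illustration: for a canonical induction starting at 0, the
  /// resume phi created in the scalar preheader selects 0 when the vector
  /// loop was bypassed and the vector trip count when it was executed, so the
  /// scalar epilogue resumes exactly where the vectorized iterations left
  /// off.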
759 void createInductionResumeValues( 760 Loop *L, Value *VectorTripCount, 761 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); 762 763 /// Complete the loop skeleton by adding debug MDs, creating appropriate 764 /// conditional branches in the middle block, preparing the builder and 765 /// running the verifier. Take in the vector loop \p L as argument, and return 766 /// the preheader of the completed vector loop. 767 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); 768 769 /// Add additional metadata to \p To that was not present on \p Orig. 770 /// 771 /// Currently this is used to add the noalias annotations based on the 772 /// inserted memchecks. Use this for instructions that are *cloned* into the 773 /// vector loop. 774 void addNewMetadata(Instruction *To, const Instruction *Orig); 775 776 /// Add metadata from one instruction to another. 777 /// 778 /// This includes both the original MDs from \p From and additional ones (\see 779 /// addNewMetadata). Use this for *newly created* instructions in the vector 780 /// loop. 781 void addMetadata(Instruction *To, Instruction *From); 782 783 /// Similar to the previous function but it adds the metadata to a 784 /// vector of instructions. 785 void addMetadata(ArrayRef<Value *> To, Instruction *From); 786 787 /// Allow subclasses to override and print debug traces before/after vplan 788 /// execution, when trace information is requested. 789 virtual void printDebugTracesAtStart(){}; 790 virtual void printDebugTracesAtEnd(){}; 791 792 /// The original loop. 793 Loop *OrigLoop; 794 795 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies 796 /// dynamic knowledge to simplify SCEV expressions and converts them to a 797 /// more usable form. 798 PredicatedScalarEvolution &PSE; 799 800 /// Loop Info. 801 LoopInfo *LI; 802 803 /// Dominator Tree. 804 DominatorTree *DT; 805 806 /// Alias Analysis. 807 AAResults *AA; 808 809 /// Target Library Info. 810 const TargetLibraryInfo *TLI; 811 812 /// Target Transform Info. 813 const TargetTransformInfo *TTI; 814 815 /// Assumption Cache. 816 AssumptionCache *AC; 817 818 /// Interface to emit optimization remarks. 819 OptimizationRemarkEmitter *ORE; 820 821 /// LoopVersioning. It's only set up (non-null) if memchecks were 822 /// used. 823 /// 824 /// This is currently only used to add no-alias metadata based on the 825 /// memchecks. The actually versioning is performed manually. 826 std::unique_ptr<LoopVersioning> LVer; 827 828 /// The vectorization SIMD factor to use. Each vector will have this many 829 /// vector elements. 830 ElementCount VF; 831 832 /// The vectorization unroll factor to use. Each scalar is vectorized to this 833 /// many different vector instructions. 834 unsigned UF; 835 836 /// The builder that we use 837 IRBuilder<> Builder; 838 839 // --- Vectorization state --- 840 841 /// The vector-loop preheader. 842 BasicBlock *LoopVectorPreHeader; 843 844 /// The scalar-loop preheader. 845 BasicBlock *LoopScalarPreHeader; 846 847 /// Middle Block between the vector and the scalar. 848 BasicBlock *LoopMiddleBlock; 849 850 /// The (unique) ExitBlock of the scalar loop. Note that 851 /// there can be multiple exiting edges reaching this block. 852 BasicBlock *LoopExitBlock; 853 854 /// The vector loop body. 855 BasicBlock *LoopVectorBody; 856 857 /// The scalar loop body. 858 BasicBlock *LoopScalarBody; 859 860 /// A list of all bypass blocks. The first block is the entry of the loop. 
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
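///
/// Minimal usage sketch (the factors are made-up values): a caller that chose
/// a main-loop VF of 16 with UF 2 and an epilogue VF of 8 would create
///
///   EpilogueLoopVectorizationInfo EPI(16, 2, 8, 1);
///
/// and hand it to the two vectorizer stages below; note that the epilogue UF
/// is expected to be 1 (see the assertion in the constructor).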
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
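///
/// Rough sketch of the skeleton built across the two stages (simplified;
/// SCEV and memory safety checks omitted): an iteration count check for the
/// main vector loop, the main vector loop itself, a second iteration count
/// check for the vectorized epilogue, the vectorized epilogue, and finally
/// the scalar remainder loop. This class is responsible for the first stage:
/// the main vector loop and its iteration count check.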
999 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { 1000 public: 1001 EpilogueVectorizerMainLoop( 1002 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 1003 DominatorTree *DT, const TargetLibraryInfo *TLI, 1004 const TargetTransformInfo *TTI, AssumptionCache *AC, 1005 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 1006 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 1007 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) 1008 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1009 EPI, LVL, CM, BFI, PSI) {} 1010 /// Implements the interface for creating a vectorized skeleton using the 1011 /// *main loop* strategy (ie the first pass of vplan execution). 1012 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; 1013 1014 protected: 1015 /// Emits an iteration count bypass check once for the main loop (when \p 1016 /// ForEpilogue is false) and once for the epilogue loop (when \p 1017 /// ForEpilogue is true). 1018 BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, 1019 bool ForEpilogue); 1020 void printDebugTracesAtStart() override; 1021 void printDebugTracesAtEnd() override; 1022 }; 1023 1024 // A specialized derived class of inner loop vectorizer that performs 1025 // vectorization of *epilogue* loops in the process of vectorizing loops and 1026 // their epilogues. 1027 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { 1028 public: 1029 EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 1030 LoopInfo *LI, DominatorTree *DT, 1031 const TargetLibraryInfo *TLI, 1032 const TargetTransformInfo *TTI, AssumptionCache *AC, 1033 OptimizationRemarkEmitter *ORE, 1034 EpilogueLoopVectorizationInfo &EPI, 1035 LoopVectorizationLegality *LVL, 1036 llvm::LoopVectorizationCostModel *CM, 1037 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) 1038 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1039 EPI, LVL, CM, BFI, PSI) {} 1040 /// Implements the interface for creating a vectorized skeleton using the 1041 /// *epilogue loop* strategy (ie the second pass of vplan execution). 1042 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; 1043 1044 protected: 1045 /// Emits an iteration count bypass check after the main vector loop has 1046 /// finished to see if there are any iterations left to execute by either 1047 /// the vector epilogue or the scalar epilogue. 1048 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L, 1049 BasicBlock *Bypass, 1050 BasicBlock *Insert); 1051 void printDebugTracesAtStart() override; 1052 void printDebugTracesAtEnd() override; 1053 }; 1054 } // end namespace llvm 1055 1056 /// Look for a meaningful debug location on the instruction or it's 1057 /// operands. 
1058 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { 1059 if (!I) 1060 return I; 1061 1062 DebugLoc Empty; 1063 if (I->getDebugLoc() != Empty) 1064 return I; 1065 1066 for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) { 1067 if (Instruction *OpInst = dyn_cast<Instruction>(*OI)) 1068 if (OpInst->getDebugLoc() != Empty) 1069 return OpInst; 1070 } 1071 1072 return I; 1073 } 1074 1075 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { 1076 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) { 1077 const DILocation *DIL = Inst->getDebugLoc(); 1078 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && 1079 !isa<DbgInfoIntrinsic>(Inst)) { 1080 assert(!VF.isScalable() && "scalable vectors not yet supported."); 1081 auto NewDIL = 1082 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); 1083 if (NewDIL) 1084 B.SetCurrentDebugLocation(NewDIL.getValue()); 1085 else 1086 LLVM_DEBUG(dbgs() 1087 << "Failed to create new discriminator: " 1088 << DIL->getFilename() << " Line: " << DIL->getLine()); 1089 } 1090 else 1091 B.SetCurrentDebugLocation(DIL); 1092 } else 1093 B.SetCurrentDebugLocation(DebugLoc()); 1094 } 1095 1096 /// Write a record \p DebugMsg about vectorization failure to the debug 1097 /// output stream. If \p I is passed, it is an instruction that prevents 1098 /// vectorization. 1099 #ifndef NDEBUG 1100 static void debugVectorizationFailure(const StringRef DebugMsg, 1101 Instruction *I) { 1102 dbgs() << "LV: Not vectorizing: " << DebugMsg; 1103 if (I != nullptr) 1104 dbgs() << " " << *I; 1105 else 1106 dbgs() << '.'; 1107 dbgs() << '\n'; 1108 } 1109 #endif 1110 1111 /// Create an analysis remark that explains why vectorization failed 1112 /// 1113 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 1114 /// RemarkName is the identifier for the remark. If \p I is passed it is an 1115 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for 1116 /// the location of the remark. \return the remark object that can be 1117 /// streamed to. 1118 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, 1119 StringRef RemarkName, Loop *TheLoop, Instruction *I) { 1120 Value *CodeRegion = TheLoop->getHeader(); 1121 DebugLoc DL = TheLoop->getStartLoc(); 1122 1123 if (I) { 1124 CodeRegion = I->getParent(); 1125 // If there is no debug location attached to the instruction, revert back to 1126 // using the loop's. 1127 if (I->getDebugLoc()) 1128 DL = I->getDebugLoc(); 1129 } 1130 1131 OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); 1132 R << "loop not vectorized: "; 1133 return R; 1134 } 1135 1136 /// Return a value for Step multiplied by VF. 1137 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) { 1138 assert(isa<ConstantInt>(Step) && "Expected an integer step"); 1139 Constant *StepVal = ConstantInt::get( 1140 Step->getType(), 1141 cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue()); 1142 return VF.isScalable() ? 
B.CreateVScale(StepVal) : StepVal; 1143 } 1144 1145 namespace llvm { 1146 1147 void reportVectorizationFailure(const StringRef DebugMsg, 1148 const StringRef OREMsg, const StringRef ORETag, 1149 OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { 1150 LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); 1151 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1152 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), 1153 ORETag, TheLoop, I) << OREMsg); 1154 } 1155 1156 } // end namespace llvm 1157 1158 #ifndef NDEBUG 1159 /// \return string containing a file name and a line # for the given loop. 1160 static std::string getDebugLocString(const Loop *L) { 1161 std::string Result; 1162 if (L) { 1163 raw_string_ostream OS(Result); 1164 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 1165 LoopDbgLoc.print(OS); 1166 else 1167 // Just print the module name. 1168 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 1169 OS.flush(); 1170 } 1171 return Result; 1172 } 1173 #endif 1174 1175 void InnerLoopVectorizer::addNewMetadata(Instruction *To, 1176 const Instruction *Orig) { 1177 // If the loop was versioned with memchecks, add the corresponding no-alias 1178 // metadata. 1179 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) 1180 LVer->annotateInstWithNoAlias(To, Orig); 1181 } 1182 1183 void InnerLoopVectorizer::addMetadata(Instruction *To, 1184 Instruction *From) { 1185 propagateMetadata(To, From); 1186 addNewMetadata(To, From); 1187 } 1188 1189 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, 1190 Instruction *From) { 1191 for (Value *V : To) { 1192 if (Instruction *I = dyn_cast<Instruction>(V)) 1193 addMetadata(I, From); 1194 } 1195 } 1196 1197 namespace llvm { 1198 1199 // Loop vectorization cost-model hints how the scalar epilogue loop should be 1200 // lowered. 1201 enum ScalarEpilogueLowering { 1202 1203 // The default: allowing scalar epilogues. 1204 CM_ScalarEpilogueAllowed, 1205 1206 // Vectorization with OptForSize: don't allow epilogues. 1207 CM_ScalarEpilogueNotAllowedOptSize, 1208 1209 // A special case of vectorisation with OptForSize: loops with a very small 1210 // trip count are considered for vectorization under OptForSize, thereby 1211 // making sure the cost of their loop body is dominant, free of runtime 1212 // guards and scalar iteration overheads. 1213 CM_ScalarEpilogueNotAllowedLowTripLoop, 1214 1215 // Loop hint predicate indicating an epilogue is undesired. 1216 CM_ScalarEpilogueNotNeededUsePredicate, 1217 1218 // Directive indicating we must either tail fold or not vectorize 1219 CM_ScalarEpilogueNotAllowedUsePredicate 1220 }; 1221 1222 /// LoopVectorizationCostModel - estimates the expected speedups due to 1223 /// vectorization. 1224 /// In many cases vectorization is not profitable. This can happen because of 1225 /// a number of reasons. In this class we mainly attempt to predict the 1226 /// expected speedup/slowdowns due to the supported instruction set. We use the 1227 /// TargetTransformInfo to query the different backends for the cost of 1228 /// different operations. 
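///
/// Loose usage sketch (ordering and error handling simplified; the actual
/// driver logic lives elsewhere, and the VectorizationFactor member names are
/// assumed from the planner header):
///
///   if (Optional<ElementCount> MaxVF = CM.computeMaxVF(UserVF, UserIC)) {
///     CM.setCostBasedWideningDecision(*MaxVF);
///     VectorizationFactor VF = CM.selectVectorizationFactor(*MaxVF);
///     unsigned IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
///     ...
///   }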
1229 class LoopVectorizationCostModel { 1230 public: 1231 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 1232 PredicatedScalarEvolution &PSE, LoopInfo *LI, 1233 LoopVectorizationLegality *Legal, 1234 const TargetTransformInfo &TTI, 1235 const TargetLibraryInfo *TLI, DemandedBits *DB, 1236 AssumptionCache *AC, 1237 OptimizationRemarkEmitter *ORE, const Function *F, 1238 const LoopVectorizeHints *Hints, 1239 InterleavedAccessInfo &IAI) 1240 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 1241 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 1242 Hints(Hints), InterleaveInfo(IAI) {} 1243 1244 /// \return An upper bound for the vectorization factor, or None if 1245 /// vectorization and interleaving should be avoided up front. 1246 Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC); 1247 1248 /// \return True if runtime checks are required for vectorization, and false 1249 /// otherwise. 1250 bool runtimeChecksRequired(); 1251 1252 /// \return The most profitable vectorization factor and the cost of that VF. 1253 /// This method checks every power of two up to MaxVF. If UserVF is not ZERO 1254 /// then this vectorization factor will be selected if vectorization is 1255 /// possible. 1256 VectorizationFactor selectVectorizationFactor(ElementCount MaxVF); 1257 VectorizationFactor 1258 selectEpilogueVectorizationFactor(const ElementCount MaxVF, 1259 const LoopVectorizationPlanner &LVP); 1260 1261 /// Setup cost-based decisions for user vectorization factor. 1262 void selectUserVectorizationFactor(ElementCount UserVF) { 1263 collectUniformsAndScalars(UserVF); 1264 collectInstsToScalarize(UserVF); 1265 } 1266 1267 /// \return The size (in bits) of the smallest and widest types in the code 1268 /// that needs to be vectorized. We ignore values that remain scalar such as 1269 /// 64 bit loop indices. 1270 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1271 1272 /// \return The desired interleave count. 1273 /// If interleave count has been specified by metadata it will be returned. 1274 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1275 /// are the selected vectorization factor and the cost of the selected VF. 1276 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); 1277 1278 /// Memory access instruction may be vectorized in more than one way. 1279 /// Form of instruction after vectorization depends on cost. 1280 /// This function takes cost-based decisions for Load/Store instructions 1281 /// and collects them in a map. This decisions map is used for building 1282 /// the lists of loop-uniform and loop-scalar instructions. 1283 /// The calculated cost is saved with widening decision in order to 1284 /// avoid redundant calculations. 1285 void setCostBasedWideningDecision(ElementCount VF); 1286 1287 /// A struct that represents some properties of the register usage 1288 /// of a loop. 1289 struct RegisterUsage { 1290 /// Holds the number of loop invariant values that are used in the loop. 1291 /// The key is ClassID of target-provided register class. 1292 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1293 /// Holds the maximum number of concurrent live intervals in the loop. 1294 /// The key is ClassID of target-provided register class. 1295 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1296 }; 1297 1298 /// \return Returns information about the register usages of the loop for the 1299 /// given vectorization factors. 
1300 SmallVector<RegisterUsage, 8> 1301 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1302 1303 /// Collect values we want to ignore in the cost model. 1304 void collectValuesToIgnore(); 1305 1306 /// Split reductions into those that happen in the loop, and those that happen 1307 /// outside. In loop reductions are collected into InLoopReductionChains. 1308 void collectInLoopReductions(); 1309 1310 /// \returns The smallest bitwidth each instruction can be represented with. 1311 /// The vector equivalents of these instructions should be truncated to this 1312 /// type. 1313 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1314 return MinBWs; 1315 } 1316 1317 /// \returns True if it is more profitable to scalarize instruction \p I for 1318 /// vectorization factor \p VF. 1319 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1320 assert(VF.isVector() && 1321 "Profitable to scalarize relevant only for VF > 1."); 1322 1323 // Cost model is not run in the VPlan-native path - return conservative 1324 // result until this changes. 1325 if (EnableVPlanNativePath) 1326 return false; 1327 1328 auto Scalars = InstsToScalarize.find(VF); 1329 assert(Scalars != InstsToScalarize.end() && 1330 "VF not yet analyzed for scalarization profitability"); 1331 return Scalars->second.find(I) != Scalars->second.end(); 1332 } 1333 1334 /// Returns true if \p I is known to be uniform after vectorization. 1335 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1336 if (VF.isScalar()) 1337 return true; 1338 1339 // Cost model is not run in the VPlan-native path - return conservative 1340 // result until this changes. 1341 if (EnableVPlanNativePath) 1342 return false; 1343 1344 auto UniformsPerVF = Uniforms.find(VF); 1345 assert(UniformsPerVF != Uniforms.end() && 1346 "VF not yet analyzed for uniformity"); 1347 return UniformsPerVF->second.count(I); 1348 } 1349 1350 /// Returns true if \p I is known to be scalar after vectorization. 1351 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1352 if (VF.isScalar()) 1353 return true; 1354 1355 // Cost model is not run in the VPlan-native path - return conservative 1356 // result until this changes. 1357 if (EnableVPlanNativePath) 1358 return false; 1359 1360 auto ScalarsPerVF = Scalars.find(VF); 1361 assert(ScalarsPerVF != Scalars.end() && 1362 "Scalar values are not calculated for VF"); 1363 return ScalarsPerVF->second.count(I); 1364 } 1365 1366 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1367 /// for vectorization factor \p VF. 1368 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1369 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1370 !isProfitableToScalarize(I, VF) && 1371 !isScalarAfterVectorization(I, VF); 1372 } 1373 1374 /// Decision that was taken during cost calculation for memory instruction. 1375 enum InstWidening { 1376 CM_Unknown, 1377 CM_Widen, // For consecutive accesses with stride +1. 1378 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1379 CM_Interleave, 1380 CM_GatherScatter, 1381 CM_Scalarize 1382 }; 1383 1384 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1385 /// instruction \p I and vector width \p VF. 
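  /// For example (illustrative; actual decisions are target- and
  /// cost-dependent): a unit-stride load of A[i] is typically recorded as
  /// CM_Widen, a load of A[N-i] as CM_Widen_Reverse, a member of an interleave
  /// group as CM_Interleave, an indexed load A[B[i]] as CM_GatherScatter, and
  /// an access the target cannot widen as CM_Scalarize.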
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W, unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I,
                               ElementCount VF = ElementCount::getFixed(1));

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
1540 bool 1541 memoryInstructionCanBeWidened(Instruction *I, 1542 ElementCount VF = ElementCount::getFixed(1)); 1543 1544 /// Returns true if \p I is a memory instruction in an interleaved-group 1545 /// of memory accesses that can be vectorized with wide vector loads/stores 1546 /// and shuffles. 1547 bool 1548 interleavedAccessCanBeWidened(Instruction *I, 1549 ElementCount VF = ElementCount::getFixed(1)); 1550 1551 /// Check if \p Instr belongs to any interleaved access group. 1552 bool isAccessInterleaved(Instruction *Instr) { 1553 return InterleaveInfo.isInterleaved(Instr); 1554 } 1555 1556 /// Get the interleaved access group that \p Instr belongs to. 1557 const InterleaveGroup<Instruction> * 1558 getInterleavedAccessGroup(Instruction *Instr) { 1559 return InterleaveInfo.getInterleaveGroup(Instr); 1560 } 1561 1562 /// Returns true if we're required to use a scalar epilogue for at least 1563 /// the final iteration of the original loop. 1564 bool requiresScalarEpilogue() const { 1565 if (!isScalarEpilogueAllowed()) 1566 return false; 1567 // If we might exit from anywhere but the latch, must run the exiting 1568 // iteration in scalar form. 1569 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1570 return true; 1571 return InterleaveInfo.requiresScalarEpilogue(); 1572 } 1573 1574 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1575 /// loop hint annotation. 1576 bool isScalarEpilogueAllowed() const { 1577 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1578 } 1579 1580 /// Returns true if all loop blocks should be masked to fold tail loop. 1581 bool foldTailByMasking() const { return FoldTailByMasking; } 1582 1583 bool blockNeedsPredication(BasicBlock *BB) { 1584 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1585 } 1586 1587 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1588 /// nodes to the chain of instructions representing the reductions. Uses a 1589 /// MapVector to ensure deterministic iteration order. 1590 using ReductionChainMap = 1591 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1592 1593 /// Return the chain of instructions representing an inloop reduction. 1594 const ReductionChainMap &getInLoopReductionChains() const { 1595 return InLoopReductionChains; 1596 } 1597 1598 /// Returns true if the Phi is part of an inloop reduction. 1599 bool isInLoopReduction(PHINode *Phi) const { 1600 return InLoopReductionChains.count(Phi); 1601 } 1602 1603 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1604 /// with factor VF. Return the cost of the instruction, including 1605 /// scalarization overhead if it's needed. 1606 unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF); 1607 1608 /// Estimate cost of a call instruction CI if it were vectorized with factor 1609 /// VF. Return the cost of the instruction, including scalarization overhead 1610 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1611 /// scalarized - 1612 /// i.e. either vector version isn't available, or is too expensive. 1613 unsigned getVectorCallCost(CallInst *CI, ElementCount VF, 1614 bool &NeedToScalarize); 1615 1616 /// Invalidates decisions already taken by the cost model. 
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, a power-of-2 larger
  /// than zero. One is returned if vectorization should best be avoided due
  /// to cost.
  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
                                    ElementCount UserVF);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(ElementCount VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be
  /// truncated to this type.
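  /// For example (illustrative): if an i32 value is computed by zero-extending
  /// an i8 load and only its low 8 bits are ever demanded, it can be
  /// represented with 8 bits, and its vector form can be <VF x i8> instead of
  /// <VF x i32>.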
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of loop are to be masked to fold tail of scalar iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
1756 void collectLoopUniforms(ElementCount VF); 1757 1758 /// Collect the instructions that are scalar after vectorization. An 1759 /// instruction is scalar if it is known to be uniform or will be scalarized 1760 /// during vectorization. Non-uniform scalarized instructions will be 1761 /// represented by VF values in the vectorized loop, each corresponding to an 1762 /// iteration of the original scalar loop. 1763 void collectLoopScalars(ElementCount VF); 1764 1765 /// Keeps cost model vectorization decision and cost for instructions. 1766 /// Right now it is used for memory instructions only. 1767 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1768 std::pair<InstWidening, unsigned>>; 1769 1770 DecisionList WideningDecisions; 1771 1772 /// Returns true if \p V is expected to be vectorized and it needs to be 1773 /// extracted. 1774 bool needsExtract(Value *V, ElementCount VF) const { 1775 Instruction *I = dyn_cast<Instruction>(V); 1776 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1777 TheLoop->isLoopInvariant(I)) 1778 return false; 1779 1780 // Assume we can vectorize V (and hence we need extraction) if the 1781 // scalars are not computed yet. This can happen, because it is called 1782 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1783 // the scalars are collected. That should be a safe assumption in most 1784 // cases, because we check if the operands have vectorizable types 1785 // beforehand in LoopVectorizationLegality. 1786 return Scalars.find(VF) == Scalars.end() || 1787 !isScalarAfterVectorization(I, VF); 1788 }; 1789 1790 /// Returns a range containing only operands needing to be extracted. 1791 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1792 ElementCount VF) { 1793 return SmallVector<Value *, 4>(make_filter_range( 1794 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1795 } 1796 1797 /// Determines if we have the infrastructure to vectorize loop \p L and its 1798 /// epilogue, assuming the main loop is vectorized by \p VF. 1799 bool isCandidateForEpilogueVectorization(const Loop &L, 1800 const ElementCount VF) const; 1801 1802 /// Returns true if epilogue vectorization is considered profitable, and 1803 /// false otherwise. 1804 /// \p VF is the vectorization factor chosen for the original loop. 1805 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1806 1807 public: 1808 /// The loop that we evaluate. 1809 Loop *TheLoop; 1810 1811 /// Predicated scalar evolution analysis. 1812 PredicatedScalarEvolution &PSE; 1813 1814 /// Loop Info analysis. 1815 LoopInfo *LI; 1816 1817 /// Vectorization legality. 1818 LoopVectorizationLegality *Legal; 1819 1820 /// Vector target information. 1821 const TargetTransformInfo &TTI; 1822 1823 /// Target Library Info. 1824 const TargetLibraryInfo *TLI; 1825 1826 /// Demanded bits analysis. 1827 DemandedBits *DB; 1828 1829 /// Assumption cache. 1830 AssumptionCache *AC; 1831 1832 /// Interface to emit optimization remarks. 1833 OptimizationRemarkEmitter *ORE; 1834 1835 const Function *TheFunction; 1836 1837 /// Loop Vectorize Hint. 1838 const LoopVectorizeHints *Hints; 1839 1840 /// The interleave access information contains groups of interleaved accesses 1841 /// with the same stride and close to each other. 1842 InterleavedAccessInfo &InterleaveInfo; 1843 1844 /// Values to ignore in the cost model. 1845 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1846 1847 /// Values to ignore in the cost model when VF > 1. 
1848 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1849 1850 /// Profitable vector factors. 1851 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1852 }; 1853 1854 } // end namespace llvm 1855 1856 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1857 // vectorization. The loop needs to be annotated with #pragma omp simd 1858 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1859 // vector length information is not provided, vectorization is not considered 1860 // explicit. Interleave hints are not allowed either. These limitations will be 1861 // relaxed in the future. 1862 // Please, note that we are currently forced to abuse the pragma 'clang 1863 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1864 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1865 // provides *explicit vectorization hints* (LV can bypass legal checks and 1866 // assume that vectorization is legal). However, both hints are implemented 1867 // using the same metadata (llvm.loop.vectorize, processed by 1868 // LoopVectorizeHints). This will be fixed in the future when the native IR 1869 // representation for pragma 'omp simd' is introduced. 1870 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1871 OptimizationRemarkEmitter *ORE) { 1872 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 1873 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1874 1875 // Only outer loops with an explicit vectorization hint are supported. 1876 // Unannotated outer loops are ignored. 1877 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1878 return false; 1879 1880 Function *Fn = OuterLp->getHeader()->getParent(); 1881 if (!Hints.allowVectorization(Fn, OuterLp, 1882 true /*VectorizeOnlyWhenForced*/)) { 1883 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1884 return false; 1885 } 1886 1887 if (Hints.getInterleave() > 1) { 1888 // TODO: Interleave support is future work. 1889 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1890 "outer loops.\n"); 1891 Hints.emitRemarkWithHints(); 1892 return false; 1893 } 1894 1895 return true; 1896 } 1897 1898 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1899 OptimizationRemarkEmitter *ORE, 1900 SmallVectorImpl<Loop *> &V) { 1901 // Collect inner loops and outer loops without irreducible control flow. For 1902 // now, only collect outer loops that have explicit vectorization hints. If we 1903 // are stress testing the VPlan H-CFG construction, we collect the outermost 1904 // loop of every loop nest. 1905 if (L.isInnermost() || VPlanBuildStressTest || 1906 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1907 LoopBlocksRPO RPOT(&L); 1908 RPOT.perform(LI); 1909 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1910 V.push_back(&L); 1911 // TODO: Collect inner loops inside marked outer loops in case 1912 // vectorization fails for the outer loop. Do not invoke 1913 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1914 // already known to be reducible. We can use an inherited attribute for 1915 // that. 1916 return; 1917 } 1918 } 1919 for (Loop *InnerL : L) 1920 collectSupportedLoops(*InnerL, LI, ORE, V); 1921 } 1922 1923 namespace { 1924 1925 /// The LoopVectorize Pass. 
1926 struct LoopVectorize : public FunctionPass { 1927 /// Pass identification, replacement for typeid 1928 static char ID; 1929 1930 LoopVectorizePass Impl; 1931 1932 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1933 bool VectorizeOnlyWhenForced = false) 1934 : FunctionPass(ID), 1935 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1936 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1937 } 1938 1939 bool runOnFunction(Function &F) override { 1940 if (skipFunction(F)) 1941 return false; 1942 1943 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1944 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1945 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1946 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1947 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1948 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1949 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1950 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1951 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1952 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1953 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1954 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1955 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1956 1957 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1958 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1959 1960 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1961 GetLAA, *ORE, PSI).MadeAnyChange; 1962 } 1963 1964 void getAnalysisUsage(AnalysisUsage &AU) const override { 1965 AU.addRequired<AssumptionCacheTracker>(); 1966 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1967 AU.addRequired<DominatorTreeWrapperPass>(); 1968 AU.addRequired<LoopInfoWrapperPass>(); 1969 AU.addRequired<ScalarEvolutionWrapperPass>(); 1970 AU.addRequired<TargetTransformInfoWrapperPass>(); 1971 AU.addRequired<AAResultsWrapperPass>(); 1972 AU.addRequired<LoopAccessLegacyAnalysis>(); 1973 AU.addRequired<DemandedBitsWrapperPass>(); 1974 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1975 AU.addRequired<InjectTLIMappingsLegacy>(); 1976 1977 // We currently do not preserve loopinfo/dominator analyses with outer loop 1978 // vectorization. Until this is addressed, mark these analyses as preserved 1979 // only for non-VPlan-native path. 1980 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1981 if (!EnableVPlanNativePath) { 1982 AU.addPreserved<LoopInfoWrapperPass>(); 1983 AU.addPreserved<DominatorTreeWrapperPass>(); 1984 } 1985 1986 AU.addPreserved<BasicAAWrapperPass>(); 1987 AU.addPreserved<GlobalsAAWrapperPass>(); 1988 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1989 } 1990 }; 1991 1992 } // end anonymous namespace 1993 1994 //===----------------------------------------------------------------------===// 1995 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1996 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1997 //===----------------------------------------------------------------------===// 1998 1999 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2000 // We need to place the broadcast of invariant variables outside the loop, 2001 // but only if it's proven safe to do so. Else, broadcast will be inside 2002 // vector loop body. 
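  // Illustrative IR sketch (names and VF=4 are placeholders, not produced
  // verbatim by this function): a splat of a loop-invariant i32 %x typically
  // lowers to something like
  //   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                          <4 x i32> poison, <4 x i32> zeroinitializer
  // and hoisting it into the preheader keeps it out of the vector loop body.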
2003 Instruction *Instr = dyn_cast<Instruction>(V); 2004 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2005 (!Instr || 2006 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2007 // Place the code for broadcasting invariant variables in the new preheader. 2008 IRBuilder<>::InsertPointGuard Guard(Builder); 2009 if (SafeToHoist) 2010 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2011 2012 // Broadcast the scalar into all locations in the vector. 2013 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2014 2015 return Shuf; 2016 } 2017 2018 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2019 const InductionDescriptor &II, Value *Step, Value *Start, 2020 Instruction *EntryVal) { 2021 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2022 "Expected either an induction phi-node or a truncate of it!"); 2023 2024 // Construct the initial value of the vector IV in the vector loop preheader 2025 auto CurrIP = Builder.saveIP(); 2026 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2027 if (isa<TruncInst>(EntryVal)) { 2028 assert(Start->getType()->isIntegerTy() && 2029 "Truncation requires an integer type"); 2030 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2031 Step = Builder.CreateTrunc(Step, TruncType); 2032 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2033 } 2034 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2035 Value *SteppedStart = 2036 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2037 2038 // We create vector phi nodes for both integer and floating-point induction 2039 // variables. Here, we determine the kind of arithmetic we will perform. 2040 Instruction::BinaryOps AddOp; 2041 Instruction::BinaryOps MulOp; 2042 if (Step->getType()->isIntegerTy()) { 2043 AddOp = Instruction::Add; 2044 MulOp = Instruction::Mul; 2045 } else { 2046 AddOp = II.getInductionOpcode(); 2047 MulOp = Instruction::FMul; 2048 } 2049 2050 // Multiply the vectorization factor by the step using integer or 2051 // floating-point arithmetic as appropriate. 2052 Value *ConstVF = 2053 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 2054 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 2055 2056 // Create a vector splat to use in the induction update. 2057 // 2058 // FIXME: If the step is non-constant, we create the vector splat with 2059 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2060 // handle a constant vector splat. 2061 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2062 Value *SplatVF = isa<Constant>(Mul) 2063 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2064 : Builder.CreateVectorSplat(VF, Mul); 2065 Builder.restoreIP(CurrIP); 2066 2067 // We may need to add the step a number of times, depending on the unroll 2068 // factor. The last of those goes into the PHI. 
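  // Illustrative IR sketch (VF=4, UF=1, integer IV starting at 0 with step 1;
  // the constants shown are placeholders):
  //   %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
  //                                 [ %vec.ind.next, %vector.body ]
  //   %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>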
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);

    if (isa<TruncInst>(EntryVal))
      addMetadata(LastInduction, EntryVal);
    recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);

    LastInduction = cast<Instruction>(addFastMathFlag(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  // Move the last step to the end of the latch block. This ensures consistent
  // placement of all induction updates.
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}

bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
  return Cost->isScalarAfterVectorization(I, VF) ||
         Cost->isProfitableToScalarize(I, VF);
}

bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
  if (shouldScalarizeInstruction(IV))
    return true;
  auto isScalarInst = [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
  };
  return llvm::any_of(IV->users(), isScalarInst);
}

void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
    const InductionDescriptor &ID, const Instruction *EntryVal,
    Value *VectorLoopVal, unsigned Part, unsigned Lane) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // This induction variable is not the phi from the original loop but the
  // newly-created IV, based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
  if (isa<TruncInst>(EntryVal))
    return;

  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (Casts.empty())
    return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
  // induction update chain itself.
2133 Instruction *CastInst = *Casts.begin(); 2134 if (Lane < UINT_MAX) 2135 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 2136 else 2137 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 2138 } 2139 2140 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2141 TruncInst *Trunc) { 2142 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2143 "Primary induction variable must have an integer type"); 2144 2145 auto II = Legal->getInductionVars().find(IV); 2146 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2147 2148 auto ID = II->second; 2149 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2150 2151 // The value from the original loop to which we are mapping the new induction 2152 // variable. 2153 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2154 2155 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2156 2157 // Generate code for the induction step. Note that induction steps are 2158 // required to be loop-invariant 2159 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2160 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2161 "Induction step should be loop invariant"); 2162 if (PSE.getSE()->isSCEVable(IV->getType())) { 2163 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2164 return Exp.expandCodeFor(Step, Step->getType(), 2165 LoopVectorPreHeader->getTerminator()); 2166 } 2167 return cast<SCEVUnknown>(Step)->getValue(); 2168 }; 2169 2170 // The scalar value to broadcast. This is derived from the canonical 2171 // induction variable. If a truncation type is given, truncate the canonical 2172 // induction variable and step. Otherwise, derive these values from the 2173 // induction descriptor. 2174 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2175 Value *ScalarIV = Induction; 2176 if (IV != OldInduction) { 2177 ScalarIV = IV->getType()->isIntegerTy() 2178 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2179 : Builder.CreateCast(Instruction::SIToFP, Induction, 2180 IV->getType()); 2181 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2182 ScalarIV->setName("offset.idx"); 2183 } 2184 if (Trunc) { 2185 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2186 assert(Step->getType()->isIntegerTy() && 2187 "Truncation requires an integer step"); 2188 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2189 Step = Builder.CreateTrunc(Step, TruncType); 2190 } 2191 return ScalarIV; 2192 }; 2193 2194 // Create the vector values from the scalar IV, in the absence of creating a 2195 // vector IV. 2196 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2197 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2198 for (unsigned Part = 0; Part < UF; ++Part) { 2199 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2200 Value *EntryPart = 2201 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2202 ID.getInductionOpcode()); 2203 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 2204 if (Trunc) 2205 addMetadata(EntryPart, Trunc); 2206 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 2207 } 2208 }; 2209 2210 // Now do the actual transformations, and start with creating the step value. 
2211 Value *Step = CreateStepValue(ID.getStep()); 2212 if (VF.isZero() || VF.isScalar()) { 2213 Value *ScalarIV = CreateScalarIV(Step); 2214 CreateSplatIV(ScalarIV, Step); 2215 return; 2216 } 2217 2218 // Determine if we want a scalar version of the induction variable. This is 2219 // true if the induction variable itself is not widened, or if it has at 2220 // least one user in the loop that is not widened. 2221 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2222 if (!NeedsScalarIV) { 2223 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); 2224 return; 2225 } 2226 2227 // Try to create a new independent vector induction variable. If we can't 2228 // create the phi node, we will splat the scalar induction variable in each 2229 // loop iteration. 2230 if (!shouldScalarizeInstruction(EntryVal)) { 2231 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); 2232 Value *ScalarIV = CreateScalarIV(Step); 2233 // Create scalar steps that can be used by instructions we will later 2234 // scalarize. Note that the addition of the scalar steps will not increase 2235 // the number of instructions in the loop in the common case prior to 2236 // InstCombine. We will be trading one vector extract for each scalar step. 2237 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2238 return; 2239 } 2240 2241 // All IV users are scalar instructions, so only emit a scalar IV, not a 2242 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2243 // predicate used by the masked loads/stores. 2244 Value *ScalarIV = CreateScalarIV(Step); 2245 if (!Cost->isScalarEpilogueAllowed()) 2246 CreateSplatIV(ScalarIV, Step); 2247 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2248 } 2249 2250 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2251 Instruction::BinaryOps BinOp) { 2252 // Create and check the types. 2253 auto *ValVTy = cast<FixedVectorType>(Val->getType()); 2254 int VLen = ValVTy->getNumElements(); 2255 2256 Type *STy = Val->getType()->getScalarType(); 2257 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2258 "Induction Step must be an integer or FP"); 2259 assert(Step->getType() == STy && "Step has wrong type"); 2260 2261 SmallVector<Constant *, 8> Indices; 2262 2263 if (STy->isIntegerTy()) { 2264 // Create a vector of consecutive numbers from zero to VF. 2265 for (int i = 0; i < VLen; ++i) 2266 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2267 2268 // Add the consecutive indices to the vector value. 2269 Constant *Cv = ConstantVector::get(Indices); 2270 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2271 Step = Builder.CreateVectorSplat(VLen, Step); 2272 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2273 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2274 // which can be found from the original scalar operations. 2275 Step = Builder.CreateMul(Cv, Step); 2276 return Builder.CreateAdd(Val, Step, "induction"); 2277 } 2278 2279 // Floating point induction. 2280 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2281 "Binary Opcode should be specified for FP induction"); 2282 // Create a vector of consecutive numbers from zero to VF. 2283 for (int i = 0; i < VLen; ++i) 2284 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2285 2286 // Add the consecutive indices to the vector value. 
2287 Constant *Cv = ConstantVector::get(Indices); 2288 2289 Step = Builder.CreateVectorSplat(VLen, Step); 2290 2291 // Floating point operations had to be 'fast' to enable the induction. 2292 FastMathFlags Flags; 2293 Flags.setFast(); 2294 2295 Value *MulOp = Builder.CreateFMul(Cv, Step); 2296 if (isa<Instruction>(MulOp)) 2297 // Have to check, MulOp may be a constant 2298 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2299 2300 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2301 if (isa<Instruction>(BOp)) 2302 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2303 return BOp; 2304 } 2305 2306 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2307 Instruction *EntryVal, 2308 const InductionDescriptor &ID) { 2309 // We shouldn't have to build scalar steps if we aren't vectorizing. 2310 assert(VF.isVector() && "VF should be greater than one"); 2311 // Get the value type and ensure it and the step have the same integer type. 2312 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2313 assert(ScalarIVTy == Step->getType() && 2314 "Val and Step should have the same type"); 2315 2316 // We build scalar steps for both integer and floating-point induction 2317 // variables. Here, we determine the kind of arithmetic we will perform. 2318 Instruction::BinaryOps AddOp; 2319 Instruction::BinaryOps MulOp; 2320 if (ScalarIVTy->isIntegerTy()) { 2321 AddOp = Instruction::Add; 2322 MulOp = Instruction::Mul; 2323 } else { 2324 AddOp = ID.getInductionOpcode(); 2325 MulOp = Instruction::FMul; 2326 } 2327 2328 // Determine the number of scalars we need to generate for each unroll 2329 // iteration. If EntryVal is uniform, we only need to generate the first 2330 // lane. Otherwise, we generate all VF values. 2331 unsigned Lanes = 2332 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2333 ? 1 2334 : VF.getKnownMinValue(); 2335 assert((!VF.isScalable() || Lanes == 1) && 2336 "Should never scalarize a scalable vector"); 2337 // Compute the scalar steps and save the results in VectorLoopValueMap. 2338 for (unsigned Part = 0; Part < UF; ++Part) { 2339 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2340 auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2341 ScalarIVTy->getScalarSizeInBits()); 2342 Value *StartIdx = 2343 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2344 if (ScalarIVTy->isFloatingPointTy()) 2345 StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); 2346 StartIdx = addFastMathFlag(Builder.CreateBinOp( 2347 AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane))); 2348 // The step returned by `createStepForVF` is a runtime-evaluated value 2349 // when VF is scalable. Otherwise, it should be folded into a Constant. 
2350 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2351 "Expected StartIdx to be folded to a constant when VF is not " 2352 "scalable"); 2353 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2354 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2355 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2356 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2357 } 2358 } 2359 } 2360 2361 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2362 assert(V != Induction && "The new induction variable should not be used."); 2363 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2364 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2365 2366 // If we have a stride that is replaced by one, do it here. Defer this for 2367 // the VPlan-native path until we start running Legal checks in that path. 2368 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2369 V = ConstantInt::get(V->getType(), 1); 2370 2371 // If we have a vector mapped to this value, return it. 2372 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2373 return VectorLoopValueMap.getVectorValue(V, Part); 2374 2375 // If the value has not been vectorized, check if it has been scalarized 2376 // instead. If it has been scalarized, and we actually need the value in 2377 // vector form, we will construct the vector values on demand. 2378 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2379 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2380 2381 // If we've scalarized a value, that value should be an instruction. 2382 auto *I = cast<Instruction>(V); 2383 2384 // If we aren't vectorizing, we can just copy the scalar map values over to 2385 // the vector map. 2386 if (VF.isScalar()) { 2387 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2388 return ScalarValue; 2389 } 2390 2391 // Get the last scalar instruction we generated for V and Part. If the value 2392 // is known to be uniform after vectorization, this corresponds to lane zero 2393 // of the Part unroll iteration. Otherwise, the last instruction is the one 2394 // we created for the last vector lane of the Part unroll iteration. 2395 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) 2396 ? 0 2397 : VF.getKnownMinValue() - 1; 2398 assert((!VF.isScalable() || LastLane == 0) && 2399 "Scalable vectorization can't lead to any scalarized values."); 2400 auto *LastInst = cast<Instruction>( 2401 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2402 2403 // Set the insert point after the last scalarized instruction. This ensures 2404 // the insertelement sequence will directly follow the scalar definitions. 2405 auto OldIP = Builder.saveIP(); 2406 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2407 Builder.SetInsertPoint(&*NewIP); 2408 2409 // However, if we are vectorizing, we need to construct the vector values. 2410 // If the value is known to be uniform after vectorization, we can just 2411 // broadcast the scalar value corresponding to lane zero for each unroll 2412 // iteration. Otherwise, we construct the vector values using insertelement 2413 // instructions. Since the resulting vectors are stored in 2414 // VectorLoopValueMap, we will only generate the insertelements once. 
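    // Illustrative IR sketch of the non-uniform packing case (VF=4; the names
    // are placeholders):
    //   %pack.0 = insertelement <4 x i32> poison, i32 %scalar.0, i32 0
    //   %pack.1 = insertelement <4 x i32> %pack.0, i32 %scalar.1, i32 1
    //   %pack.2 = insertelement <4 x i32> %pack.1, i32 %scalar.2, i32 2
    //   %pack.3 = insertelement <4 x i32> %pack.2, i32 %scalar.3, i32 3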
2415 Value *VectorValue = nullptr; 2416 if (Cost->isUniformAfterVectorization(I, VF)) { 2417 VectorValue = getBroadcastInstrs(ScalarValue); 2418 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2419 } else { 2420 // Initialize packing with insertelements to start from poison. 2421 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2422 Value *Poison = PoisonValue::get(VectorType::get(V->getType(), VF)); 2423 VectorLoopValueMap.setVectorValue(V, Part, Poison); 2424 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 2425 packScalarIntoVectorValue(V, {Part, Lane}); 2426 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2427 } 2428 Builder.restoreIP(OldIP); 2429 return VectorValue; 2430 } 2431 2432 // If this scalar is unknown, assume that it is a constant or that it is 2433 // loop invariant. Broadcast V and save the value for future uses. 2434 Value *B = getBroadcastInstrs(V); 2435 VectorLoopValueMap.setVectorValue(V, Part, B); 2436 return B; 2437 } 2438 2439 Value * 2440 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2441 const VPIteration &Instance) { 2442 // If the value is not an instruction contained in the loop, it should 2443 // already be scalar. 2444 if (OrigLoop->isLoopInvariant(V)) 2445 return V; 2446 2447 assert(Instance.Lane > 0 2448 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2449 : true && "Uniform values only have lane zero"); 2450 2451 // If the value from the original loop has not been vectorized, it is 2452 // represented by UF x VF scalar values in the new loop. Return the requested 2453 // scalar value. 2454 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2455 return VectorLoopValueMap.getScalarValue(V, Instance); 2456 2457 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2458 // for the given unroll part. If this entry is not a vector type (i.e., the 2459 // vectorization factor is one), there is no need to generate an 2460 // extractelement instruction. 2461 auto *U = getOrCreateVectorValue(V, Instance.Part); 2462 if (!U->getType()->isVectorTy()) { 2463 assert(VF.isScalar() && "Value not scalarized has non-vector type"); 2464 return U; 2465 } 2466 2467 // Otherwise, the value from the original loop has been vectorized and is 2468 // represented by UF vector values. Extract and return the requested scalar 2469 // value from the appropriate vector lane. 
2470 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2471 } 2472 2473 void InnerLoopVectorizer::packScalarIntoVectorValue( 2474 Value *V, const VPIteration &Instance) { 2475 assert(V != Induction && "The new induction variable should not be used."); 2476 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2477 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2478 2479 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2480 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2481 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2482 Builder.getInt32(Instance.Lane)); 2483 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2484 } 2485 2486 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2487 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2488 assert(!VF.isScalable() && "Cannot reverse scalable vectors"); 2489 SmallVector<int, 8> ShuffleMask; 2490 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 2491 ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); 2492 2493 return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse"); 2494 } 2495 2496 // Return whether we allow using masked interleave-groups (for dealing with 2497 // strided loads/stores that reside in predicated blocks, or for dealing 2498 // with gaps). 2499 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2500 // If an override option has been passed in for interleaved accesses, use it. 2501 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2502 return EnableMaskedInterleavedMemAccesses; 2503 2504 return TTI.enableMaskedInterleavedAccessVectorization(); 2505 } 2506 2507 // Try to vectorize the interleave group that \p Instr belongs to. 2508 // 2509 // E.g. Translate following interleaved load group (factor = 3): 2510 // for (i = 0; i < N; i+=3) { 2511 // R = Pic[i]; // Member of index 0 2512 // G = Pic[i+1]; // Member of index 1 2513 // B = Pic[i+2]; // Member of index 2 2514 // ... // do something to R, G, B 2515 // } 2516 // To: 2517 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2518 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2519 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2520 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2521 // 2522 // Or translate following interleaved store group (factor = 3): 2523 // for (i = 0; i < N; i+=3) { 2524 // ... do something to R, G, B 2525 // Pic[i] = R; // Member of index 0 2526 // Pic[i+1] = G; // Member of index 1 2527 // Pic[i+2] = B; // Member of index 2 2528 // } 2529 // To: 2530 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2531 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2532 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2533 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2534 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2535 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2536 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2537 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2538 VPValue *BlockInMask) { 2539 Instruction *Instr = Group->getInsertPos(); 2540 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2541 2542 // Prepare for the vector type of the interleaved load/store. 
  Type *ScalarTy = getMemInstValueType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);

  // Prepare for the new pointers.
  SmallVector<Value *, 2> AddrParts;
  unsigned Index = Group->getIndex(Instr);

  // TODO: extend the masked interleaved-group support to reversed access.
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported.");

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  assert(!VF.isScalable() &&
         "scalable vector reverse operation is not implemented");
  if (Group->isReverse())
    Index += (VF.getKnownMinValue() - 1) * Group->getFactor();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *AddrPart = State.get(Addr, {Part, 0});
    setDebugLocFromInst(Builder, AddrPart);

    // Note that the current instruction may be a member of any index in the
    // group; the address needs to be adjusted to the member of index 0.
    //
    // E.g. a = A[i+1];   // Member of index 1 (Current instruction)
    //      b = A[i];     // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g. A[i+1] = a;   // Member of index 1
    //      A[i]   = b;   // Member of index 0
    //      A[i+2] = c;   // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();
    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);

    // Cast to the vector pointer type.
    unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
    Type *PtrTy = VecTy->getPointerTo(AddressSpace);
    AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
  }

  setDebugLocFromInst(Builder, Instr);
  Value *PoisonVec = PoisonValue::get(VecTy);

  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    assert(!VF.isScalable() && "scalable vectors not yet supported.");
    MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
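    // Illustrative IR sketch of the masked path (factor 3, VF=4; names and the
    // alignment are placeholders): the per-lane block mask is replicated
    // across the members of each tuple before the wide masked load:
    //   %interleaved.mask = shufflevector <4 x i1> %block.mask, <4 x i1> poison,
    //       <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1,
    //                   i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
    //   %wide.masked.vec = call <12 x i32> @llvm.masked.load.v12i32.p0v12i32(
    //       <12 x i32>* %ptr, i32 4, <12 x i1> %interleaved.mask,
    //       <12 x i32> poison)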
2608 SmallVector<Value *, 2> NewLoads; 2609 for (unsigned Part = 0; Part < UF; Part++) { 2610 Instruction *NewLoad; 2611 if (BlockInMask || MaskForGaps) { 2612 assert(useMaskedInterleavedAccesses(*TTI) && 2613 "masked interleaved groups are not allowed."); 2614 Value *GroupMask = MaskForGaps; 2615 if (BlockInMask) { 2616 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2617 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2618 Value *ShuffledMask = Builder.CreateShuffleVector( 2619 BlockInMaskPart, 2620 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2621 "interleaved.mask"); 2622 GroupMask = MaskForGaps 2623 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2624 MaskForGaps) 2625 : ShuffledMask; 2626 } 2627 NewLoad = 2628 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2629 GroupMask, PoisonVec, "wide.masked.vec"); 2630 } 2631 else 2632 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2633 Group->getAlign(), "wide.vec"); 2634 Group->addMetadata(NewLoad); 2635 NewLoads.push_back(NewLoad); 2636 } 2637 2638 // For each member in the group, shuffle out the appropriate data from the 2639 // wide loads. 2640 unsigned J = 0; 2641 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2642 Instruction *Member = Group->getMember(I); 2643 2644 // Skip the gaps in the group. 2645 if (!Member) 2646 continue; 2647 2648 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2649 auto StrideMask = 2650 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2651 for (unsigned Part = 0; Part < UF; Part++) { 2652 Value *StridedVec = Builder.CreateShuffleVector( 2653 NewLoads[Part], StrideMask, "strided.vec"); 2654 2655 // If this member has different type, cast the result type. 2656 if (Member->getType() != ScalarTy) { 2657 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2658 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2659 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2660 } 2661 2662 if (Group->isReverse()) 2663 StridedVec = reverseVector(StridedVec); 2664 2665 State.set(VPDefs[J], Member, StridedVec, Part); 2666 } 2667 ++J; 2668 } 2669 return; 2670 } 2671 2672 // The sub vector type for current instruction. 2673 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2674 auto *SubVT = VectorType::get(ScalarTy, VF); 2675 2676 // Vectorize the interleaved store group. 2677 for (unsigned Part = 0; Part < UF; Part++) { 2678 // Collect the stored vector from each member. 2679 SmallVector<Value *, 4> StoredVecs; 2680 for (unsigned i = 0; i < InterleaveFactor; i++) { 2681 // Interleaved store group doesn't allow a gap, so each index has a member 2682 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2683 2684 Value *StoredVec = State.get(StoredValues[i], Part); 2685 2686 if (Group->isReverse()) 2687 StoredVec = reverseVector(StoredVec); 2688 2689 // If this member has different type, cast it to a unified type. 2690 2691 if (StoredVec->getType() != SubVT) 2692 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2693 2694 StoredVecs.push_back(StoredVec); 2695 } 2696 2697 // Concatenate all vectors into a wide vector. 2698 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2699 2700 // Interleave the elements in the wide vector. 
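// For example, with VF = 4 and InterleaveFactor = 3, createInterleaveMask
// returns <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>, picking one lane from each
// member vector in turn (this matches the store example in the comment above
// vectorizeInterleaveGroup).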
2701 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2702 Value *IVec = Builder.CreateShuffleVector( 2703 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2704 "interleaved.vec"); 2705 2706 Instruction *NewStoreInstr; 2707 if (BlockInMask) { 2708 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2709 Value *ShuffledMask = Builder.CreateShuffleVector( 2710 BlockInMaskPart, 2711 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2712 "interleaved.mask"); 2713 NewStoreInstr = Builder.CreateMaskedStore( 2714 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2715 } 2716 else 2717 NewStoreInstr = 2718 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2719 2720 Group->addMetadata(NewStoreInstr); 2721 } 2722 } 2723 2724 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2725 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2726 VPValue *StoredValue, VPValue *BlockInMask) { 2727 // Attempt to issue a wide load. 2728 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2729 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2730 2731 assert((LI || SI) && "Invalid Load/Store instruction"); 2732 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2733 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2734 2735 LoopVectorizationCostModel::InstWidening Decision = 2736 Cost->getWideningDecision(Instr, VF); 2737 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2738 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2739 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2740 "CM decision is not to widen the memory instruction"); 2741 2742 Type *ScalarDataTy = getMemInstValueType(Instr); 2743 2744 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2745 const Align Alignment = getLoadStoreAlignment(Instr); 2746 2747 // Determine if the pointer operand of the access is either consecutive or 2748 // reverse consecutive. 2749 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2750 bool ConsecutiveStride = 2751 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2752 bool CreateGatherScatter = 2753 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2754 2755 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2756 // gather/scatter. Otherwise Decision should have been to Scalarize. 2757 assert((ConsecutiveStride || CreateGatherScatter) && 2758 "The instruction should be scalarized"); 2759 (void)ConsecutiveStride; 2760 2761 VectorParts BlockInMaskParts(UF); 2762 bool isMaskRequired = BlockInMask; 2763 if (isMaskRequired) 2764 for (unsigned Part = 0; Part < UF; ++Part) 2765 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2766 2767 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2768 // Calculate the pointer for the specific unroll-part. 2769 GetElementPtrInst *PartPtr = nullptr; 2770 2771 bool InBounds = false; 2772 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2773 InBounds = gep->isInBounds(); 2774 2775 if (Reverse) { 2776 assert(!VF.isScalable() && 2777 "Reversing vectors is not yet supported for scalable vectors."); 2778 2779 // If the address is consecutive but reversed, then the 2780 // wide store needs to start at the last vector element. 
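// Worked example (assuming a fixed VF = 4): for Part = 1 the two GEPs below
// add -Part * VF = -4 and then 1 - VF = -3, so the wide access starts at
// Ptr[-7] and covers Ptr[-7..-4], the reverse of that part's four scalar
// accesses.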
2781 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2782 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); 2783 PartPtr->setIsInBounds(InBounds); 2784 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2785 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); 2786 PartPtr->setIsInBounds(InBounds); 2787 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2788 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2789 } else { 2790 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2791 PartPtr = cast<GetElementPtrInst>( 2792 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2793 PartPtr->setIsInBounds(InBounds); 2794 } 2795 2796 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2797 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2798 }; 2799 2800 // Handle Stores: 2801 if (SI) { 2802 setDebugLocFromInst(Builder, SI); 2803 2804 for (unsigned Part = 0; Part < UF; ++Part) { 2805 Instruction *NewSI = nullptr; 2806 Value *StoredVal = State.get(StoredValue, Part); 2807 if (CreateGatherScatter) { 2808 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2809 Value *VectorGep = State.get(Addr, Part); 2810 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2811 MaskPart); 2812 } else { 2813 if (Reverse) { 2814 // If we store to reverse consecutive memory locations, then we need 2815 // to reverse the order of elements in the stored value. 2816 StoredVal = reverseVector(StoredVal); 2817 // We don't want to update the value in the map as it might be used in 2818 // another expression. So don't call resetVectorValue(StoredVal). 2819 } 2820 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2821 if (isMaskRequired) 2822 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2823 BlockInMaskParts[Part]); 2824 else 2825 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2826 } 2827 addMetadata(NewSI, SI); 2828 } 2829 return; 2830 } 2831 2832 // Handle loads. 2833 assert(LI && "Must have a load instruction"); 2834 setDebugLocFromInst(Builder, LI); 2835 for (unsigned Part = 0; Part < UF; ++Part) { 2836 Value *NewLI; 2837 if (CreateGatherScatter) { 2838 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2839 Value *VectorGep = State.get(Addr, Part); 2840 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2841 nullptr, "wide.masked.gather"); 2842 addMetadata(NewLI, LI); 2843 } else { 2844 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2845 if (isMaskRequired) 2846 NewLI = Builder.CreateMaskedLoad( 2847 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2848 "wide.masked.load"); 2849 else 2850 NewLI = 2851 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2852 2853 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2854 addMetadata(NewLI, LI); 2855 if (Reverse) 2856 NewLI = reverseVector(NewLI); 2857 } 2858 2859 State.set(Def, Instr, NewLI, Part); 2860 } 2861 } 2862 2863 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2864 const VPIteration &Instance, 2865 bool IfPredicateInstr, 2866 VPTransformState &State) { 2867 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2868 2869 setDebugLocFromInst(Builder, Instr); 2870 2871 // Does this instruction return a value ? 
2872 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2873 2874 Instruction *Cloned = Instr->clone(); 2875 if (!IsVoidRetTy) 2876 Cloned->setName(Instr->getName() + ".cloned"); 2877 2878 // Replace the operands of the cloned instructions with their scalar 2879 // equivalents in the new loop. 2880 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2881 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 2882 auto InputInstance = Instance; 2883 if (!Operand || !OrigLoop->contains(Operand) || 2884 (Cost->isUniformAfterVectorization(Operand, State.VF))) 2885 InputInstance.Lane = 0; 2886 auto *NewOp = State.get(User.getOperand(op), InputInstance); 2887 Cloned->setOperand(op, NewOp); 2888 } 2889 addNewMetadata(Cloned, Instr); 2890 2891 // Place the cloned scalar in the new loop. 2892 Builder.Insert(Cloned); 2893 2894 // TODO: Set result for VPValue of VPReplicateRecipe. This requires 2895 // representing scalar values in VPTransformState. Add the cloned scalar to 2896 // the scalar map entry. 2897 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2898 2899 // If we just cloned a new assumption, add it to the assumption cache. 2900 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2901 if (II->getIntrinsicID() == Intrinsic::assume) 2902 AC->registerAssumption(II); 2903 2904 // End if-block. 2905 if (IfPredicateInstr) 2906 PredicatedInstructions.push_back(Cloned); 2907 } 2908 2909 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2910 Value *End, Value *Step, 2911 Instruction *DL) { 2912 BasicBlock *Header = L->getHeader(); 2913 BasicBlock *Latch = L->getLoopLatch(); 2914 // As we're just creating this loop, it's possible no latch exists 2915 // yet. If so, use the header as this will be a single block loop. 2916 if (!Latch) 2917 Latch = Header; 2918 2919 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2920 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2921 setDebugLocFromInst(Builder, OldInst); 2922 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2923 2924 Builder.SetInsertPoint(Latch->getTerminator()); 2925 setDebugLocFromInst(Builder, OldInst); 2926 2927 // Create i+1 and fill the PHINode. 2928 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2929 Induction->addIncoming(Start, L->getLoopPreheader()); 2930 Induction->addIncoming(Next, Latch); 2931 // Create the compare. 2932 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2933 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 2934 2935 // Now we have two terminators. Remove the old one from the block. 2936 Latch->getTerminator()->eraseFromParent(); 2937 2938 return Induction; 2939 } 2940 2941 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2942 if (TripCount) 2943 return TripCount; 2944 2945 assert(L && "Create Trip Count for null loop."); 2946 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2947 // Find the loop boundaries. 2948 ScalarEvolution *SE = PSE.getSE(); 2949 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2950 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 2951 "Invalid loop count"); 2952 2953 Type *IdxTy = Legal->getWidestInductionType(); 2954 assert(IdxTy && "No type for induction"); 2955 2956 // The exit count might have the type of i64 while the phi is i32. This can 2957 // happen if we have an induction variable that is sign extended before the 2958 // compare.
The only way that we get a backedge taken count is that the 2959 // induction variable was signed and as such will not overflow. In such a case 2960 // truncation is legal. 2961 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2962 IdxTy->getPrimitiveSizeInBits()) 2963 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2964 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2965 2966 // Get the total trip count from the count by adding 1. 2967 const SCEV *ExitCount = SE->getAddExpr( 2968 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2969 2970 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2971 2972 // Expand the trip count and place the new instructions in the preheader. 2973 // Notice that the pre-header does not change, only the loop body. 2974 SCEVExpander Exp(*SE, DL, "induction"); 2975 2976 // Count holds the overall loop count (N). 2977 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2978 L->getLoopPreheader()->getTerminator()); 2979 2980 if (TripCount->getType()->isPointerTy()) 2981 TripCount = 2982 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2983 L->getLoopPreheader()->getTerminator()); 2984 2985 return TripCount; 2986 } 2987 2988 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2989 if (VectorTripCount) 2990 return VectorTripCount; 2991 2992 Value *TC = getOrCreateTripCount(L); 2993 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2994 2995 Type *Ty = TC->getType(); 2996 // This is where we can make the step a runtime constant. 2997 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 2998 2999 // If the tail is to be folded by masking, round the number of iterations N 3000 // up to a multiple of Step instead of rounding down. This is done by first 3001 // adding Step-1 and then rounding down. Note that it's ok if this addition 3002 // overflows: the vector induction variable will eventually wrap to zero given 3003 // that it starts at zero and its Step is a power of two; the loop will then 3004 // exit, with the last early-exit vector comparison also producing all-true. 3005 if (Cost->foldTailByMasking()) { 3006 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3007 "VF*UF must be a power of 2 when folding tail by masking"); 3008 assert(!VF.isScalable() && 3009 "Tail folding not yet supported for scalable vectors"); 3010 TC = Builder.CreateAdd( 3011 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3012 } 3013 3014 // Now we need to generate the expression for the part of the loop that the 3015 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3016 // iterations are not required for correctness, or N - Step, otherwise. Step 3017 // is equal to the vectorization factor (number of SIMD elements) times the 3018 // unroll factor (number of SIMD instructions). 3019 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3020 3021 // There are two cases where we need to ensure (at least) the last iteration 3022 // runs in the scalar remainder loop. Thus, if the step evenly divides 3023 // the trip count, we set the remainder to be equal to the step. If the step 3024 // does not evenly divide the trip count, no adjustment is necessary since 3025 // there will already be scalar iterations. Note that the minimum iterations 3026 // check ensures that N >= Step. 
The cases are: 3027 // 1) If there is a non-reversed interleaved group that may speculatively 3028 // access memory out-of-bounds. 3029 // 2) If any instruction may follow a conditionally taken exit. That is, if 3030 // the loop contains multiple exiting blocks, or a single exiting block 3031 // which is not the latch. 3032 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3033 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3034 R = Builder.CreateSelect(IsZero, Step, R); 3035 } 3036 3037 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3038 3039 return VectorTripCount; 3040 } 3041 3042 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3043 const DataLayout &DL) { 3044 // Verify that V is a vector type with same number of elements as DstVTy. 3045 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3046 unsigned VF = DstFVTy->getNumElements(); 3047 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3048 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3049 Type *SrcElemTy = SrcVecTy->getElementType(); 3050 Type *DstElemTy = DstFVTy->getElementType(); 3051 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3052 "Vector elements must have same size"); 3053 3054 // Do a direct cast if element types are castable. 3055 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3056 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3057 } 3058 // V cannot be directly casted to desired vector type. 3059 // May happen when V is a floating point vector but DstVTy is a vector of 3060 // pointers or vice-versa. Handle this using a two-step bitcast using an 3061 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3062 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3063 "Only one type should be a pointer type"); 3064 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3065 "Only one type should be a floating point type"); 3066 Type *IntTy = 3067 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3068 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3069 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3070 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3071 } 3072 3073 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3074 BasicBlock *Bypass) { 3075 Value *Count = getOrCreateTripCount(L); 3076 // Reuse existing vector loop preheader for TC checks. 3077 // Note that new preheader block is generated for vector loop. 3078 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3079 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3080 3081 // Generate code to check if the loop's trip count is less than VF * UF, or 3082 // equal to it in case a scalar epilogue is required; this implies that the 3083 // vector trip count is zero. This check also covers the case where adding one 3084 // to the backedge-taken count overflowed leading to an incorrect trip count 3085 // of zero. In this case we will also jump to the scalar loop. 3086 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3087 : ICmpInst::ICMP_ULT; 3088 3089 // If tail is to be folded, vector loop takes care of all iterations. 
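// Otherwise, the check built below is, for example with VF = 4, UF = 2 and no
// required scalar epilogue, roughly (types illustrative):
//   %min.iters.check = icmp ult i64 %count, 8
// and a true result branches to the scalar loop (Bypass) instead of the new
// vector preheader.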
3090 Value *CheckMinIters = Builder.getFalse(); 3091 if (!Cost->foldTailByMasking()) { 3092 Value *Step = 3093 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3094 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3095 } 3096 // Create new preheader for vector loop. 3097 LoopVectorPreHeader = 3098 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3099 "vector.ph"); 3100 3101 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3102 DT->getNode(Bypass)->getIDom()) && 3103 "TC check is expected to dominate Bypass"); 3104 3105 // Update dominator for Bypass & LoopExit. 3106 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3107 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3108 3109 ReplaceInstWithInst( 3110 TCCheckBlock->getTerminator(), 3111 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3112 LoopBypassBlocks.push_back(TCCheckBlock); 3113 } 3114 3115 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3116 // Reuse existing vector loop preheader for SCEV checks. 3117 // Note that new preheader block is generated for vector loop. 3118 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 3119 3120 // Generate the code to check that the SCEV assumptions that we made. 3121 // We want the new basic block to start at the first instruction in a 3122 // sequence of instructions that form a check. 3123 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 3124 "scev.check"); 3125 Value *SCEVCheck = Exp.expandCodeForPredicate( 3126 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 3127 3128 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 3129 if (C->isZero()) 3130 return; 3131 3132 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3133 (OptForSizeBasedOnProfile && 3134 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3135 "Cannot SCEV check stride or overflow when optimizing for size"); 3136 3137 SCEVCheckBlock->setName("vector.scevcheck"); 3138 // Create new preheader for vector loop. 3139 LoopVectorPreHeader = 3140 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 3141 nullptr, "vector.ph"); 3142 3143 // Update dominator only if this is first RT check. 3144 if (LoopBypassBlocks.empty()) { 3145 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3146 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3147 } 3148 3149 ReplaceInstWithInst( 3150 SCEVCheckBlock->getTerminator(), 3151 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 3152 LoopBypassBlocks.push_back(SCEVCheckBlock); 3153 AddedSafetyChecks = true; 3154 } 3155 3156 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 3157 // VPlan-native path does not do any analysis for runtime checks currently. 3158 if (EnableVPlanNativePath) 3159 return; 3160 3161 // Reuse existing vector loop preheader for runtime memory checks. 3162 // Note that new preheader block is generated for vector loop. 3163 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 3164 3165 // Generate the code that checks in runtime if arrays overlap. We put the 3166 // checks into a separate block to make the more common case of few elements 3167 // faster. 
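// Conceptually, for each pair of pointer groups (A, B) that may alias, the
// emitted check tests whether the accessed ranges overlap, i.e. roughly
//   Conflict(A, B) = (A.Start < B.End) && (B.Start < A.End)
// and the vector loop is bypassed if any pair may conflict. (Schematic form
// only; the actual IR is produced by addRuntimeChecks below.)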
3168 auto *LAI = Legal->getLAI(); 3169 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 3170 if (!RtPtrChecking.Need) 3171 return; 3172 3173 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3174 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3175 "Cannot emit memory checks when optimizing for size, unless forced " 3176 "to vectorize."); 3177 ORE->emit([&]() { 3178 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3179 L->getStartLoc(), L->getHeader()) 3180 << "Code-size may be reduced by not forcing " 3181 "vectorization, or by source-code modifications " 3182 "eliminating the need for runtime checks " 3183 "(e.g., adding 'restrict')."; 3184 }); 3185 } 3186 3187 MemCheckBlock->setName("vector.memcheck"); 3188 // Create new preheader for vector loop. 3189 LoopVectorPreHeader = 3190 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 3191 "vector.ph"); 3192 3193 auto *CondBranch = cast<BranchInst>( 3194 Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); 3195 ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); 3196 LoopBypassBlocks.push_back(MemCheckBlock); 3197 AddedSafetyChecks = true; 3198 3199 // Update dominator only if this is first RT check. 3200 if (LoopBypassBlocks.empty()) { 3201 DT->changeImmediateDominator(Bypass, MemCheckBlock); 3202 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 3203 } 3204 3205 Instruction *FirstCheckInst; 3206 Instruction *MemRuntimeCheck; 3207 std::tie(FirstCheckInst, MemRuntimeCheck) = 3208 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 3209 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 3210 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 3211 "claimed checks are required"); 3212 CondBranch->setCondition(MemRuntimeCheck); 3213 3214 // We currently don't use LoopVersioning for the actual loop cloning but we 3215 // still use it to add the noalias metadata. 3216 LVer = std::make_unique<LoopVersioning>( 3217 *Legal->getLAI(), 3218 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3219 DT, PSE.getSE()); 3220 LVer->prepareNoAliasMetadata(); 3221 } 3222 3223 Value *InnerLoopVectorizer::emitTransformedIndex( 3224 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3225 const InductionDescriptor &ID) const { 3226 3227 SCEVExpander Exp(*SE, DL, "induction"); 3228 auto Step = ID.getStep(); 3229 auto StartValue = ID.getStartValue(); 3230 assert(Index->getType() == Step->getType() && 3231 "Index type does not match StepValue type"); 3232 3233 // Note: the IR at this point is broken. We cannot use SE to create any new 3234 // SCEV and then expand it, hoping that SCEV's simplification will give us 3235 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3236 // lead to various SCEV crashes. So all we can do is to use builder and rely 3237 // on InstCombine for future simplifications. Here we handle some trivial 3238 // cases only. 
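// The helpers below fold only the obvious identities, e.g.
//   CreateAdd(0, Y) -> Y,  CreateAdd(X, 0) -> X,
//   CreateMul(1, Y) -> Y,  CreateMul(X, 1) -> X,
// and otherwise emit a plain add/mul for InstCombine to clean up later.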
3239 auto CreateAdd = [&B](Value *X, Value *Y) { 3240 assert(X->getType() == Y->getType() && "Types don't match!"); 3241 if (auto *CX = dyn_cast<ConstantInt>(X)) 3242 if (CX->isZero()) 3243 return Y; 3244 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3245 if (CY->isZero()) 3246 return X; 3247 return B.CreateAdd(X, Y); 3248 }; 3249 3250 auto CreateMul = [&B](Value *X, Value *Y) { 3251 assert(X->getType() == Y->getType() && "Types don't match!"); 3252 if (auto *CX = dyn_cast<ConstantInt>(X)) 3253 if (CX->isOne()) 3254 return Y; 3255 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3256 if (CY->isOne()) 3257 return X; 3258 return B.CreateMul(X, Y); 3259 }; 3260 3261 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3262 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3263 // the DomTree is not kept up-to-date for additional blocks generated in the 3264 // vector loop. By using the header as insertion point, we guarantee that the 3265 // expanded instructions dominate all their uses. 3266 auto GetInsertPoint = [this, &B]() { 3267 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3268 if (InsertBB != LoopVectorBody && 3269 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3270 return LoopVectorBody->getTerminator(); 3271 return &*B.GetInsertPoint(); 3272 }; 3273 switch (ID.getKind()) { 3274 case InductionDescriptor::IK_IntInduction: { 3275 assert(Index->getType() == StartValue->getType() && 3276 "Index type does not match StartValue type"); 3277 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3278 return B.CreateSub(StartValue, Index); 3279 auto *Offset = CreateMul( 3280 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3281 return CreateAdd(StartValue, Offset); 3282 } 3283 case InductionDescriptor::IK_PtrInduction: { 3284 assert(isa<SCEVConstant>(Step) && 3285 "Expected constant step for pointer induction"); 3286 return B.CreateGEP( 3287 StartValue->getType()->getPointerElementType(), StartValue, 3288 CreateMul(Index, 3289 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3290 } 3291 case InductionDescriptor::IK_FpInduction: { 3292 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3293 auto InductionBinOp = ID.getInductionBinOp(); 3294 assert(InductionBinOp && 3295 (InductionBinOp->getOpcode() == Instruction::FAdd || 3296 InductionBinOp->getOpcode() == Instruction::FSub) && 3297 "Original bin op should be defined for FP induction"); 3298 3299 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3300 3301 // Floating point operations had to be 'fast' to enable the induction. 3302 FastMathFlags Flags; 3303 Flags.setFast(); 3304 3305 Value *MulExp = B.CreateFMul(StepValue, Index); 3306 if (isa<Instruction>(MulExp)) 3307 // We have to check, the MulExp may be a constant. 
3308 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 3309 3310 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3311 "induction"); 3312 if (isa<Instruction>(BOp)) 3313 cast<Instruction>(BOp)->setFastMathFlags(Flags); 3314 3315 return BOp; 3316 } 3317 case InductionDescriptor::IK_NoInduction: 3318 return nullptr; 3319 } 3320 llvm_unreachable("invalid enum"); 3321 } 3322 3323 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3324 LoopScalarBody = OrigLoop->getHeader(); 3325 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3326 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3327 assert(LoopExitBlock && "Must have an exit block"); 3328 assert(LoopVectorPreHeader && "Invalid loop structure"); 3329 3330 LoopMiddleBlock = 3331 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3332 LI, nullptr, Twine(Prefix) + "middle.block"); 3333 LoopScalarPreHeader = 3334 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3335 nullptr, Twine(Prefix) + "scalar.ph"); 3336 3337 // Set up branch from middle block to the exit and scalar preheader blocks. 3338 // completeLoopSkeleton will update the condition to use an iteration check, 3339 // if required to decide whether to execute the remainder. 3340 BranchInst *BrInst = 3341 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3342 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3343 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3344 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3345 3346 // We intentionally don't let SplitBlock to update LoopInfo since 3347 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3348 // LoopVectorBody is explicitly added to the correct place few lines later. 3349 LoopVectorBody = 3350 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3351 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3352 3353 // Update dominator for loop exit. 3354 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3355 3356 // Create and register the new vector loop. 3357 Loop *Lp = LI->AllocateLoop(); 3358 Loop *ParentLoop = OrigLoop->getParentLoop(); 3359 3360 // Insert the new loop into the loop nest and register the new basic blocks 3361 // before calling any utilities such as SCEV that require valid LoopInfo. 3362 if (ParentLoop) { 3363 ParentLoop->addChildLoop(Lp); 3364 } else { 3365 LI->addTopLevelLoop(Lp); 3366 } 3367 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3368 return Lp; 3369 } 3370 3371 void InnerLoopVectorizer::createInductionResumeValues( 3372 Loop *L, Value *VectorTripCount, 3373 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3374 assert(VectorTripCount && L && "Expected valid arguments"); 3375 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3376 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3377 "Inconsistent information about additional bypass."); 3378 // We are going to resume the execution of the scalar loop. 3379 // Go over all of the induction variables that we found and fix the 3380 // PHIs that are left in the scalar version of the loop. 3381 // The starting values of PHI nodes depend on the counter of the last 3382 // iteration in the vectorized loop. 3383 // If we come from a bypass edge then we need to start from the original 3384 // start value. 
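// For example (illustrative): for an induction `i = Start; i += 2` the end
// value computed below is Start + 2 * VectorTripCount (via
// emitTransformedIndex), and the bc.resume.val phi selects that value when
// arriving from the middle block, or Start when arriving from a bypass block.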
3385 for (auto &InductionEntry : Legal->getInductionVars()) { 3386 PHINode *OrigPhi = InductionEntry.first; 3387 InductionDescriptor II = InductionEntry.second; 3388 3389 // Create phi nodes to merge from the backedge-taken check block. 3390 PHINode *BCResumeVal = 3391 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3392 LoopScalarPreHeader->getTerminator()); 3393 // Copy original phi DL over to the new one. 3394 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3395 Value *&EndValue = IVEndValues[OrigPhi]; 3396 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3397 if (OrigPhi == OldInduction) { 3398 // We know what the end value is. 3399 EndValue = VectorTripCount; 3400 } else { 3401 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3402 Type *StepType = II.getStep()->getType(); 3403 Instruction::CastOps CastOp = 3404 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3405 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3406 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3407 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3408 EndValue->setName("ind.end"); 3409 3410 // Compute the end value for the additional bypass (if applicable). 3411 if (AdditionalBypass.first) { 3412 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3413 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3414 StepType, true); 3415 CRD = 3416 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3417 EndValueFromAdditionalBypass = 3418 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3419 EndValueFromAdditionalBypass->setName("ind.end"); 3420 } 3421 } 3422 // The new PHI merges the original incoming value, in case of a bypass, 3423 // or the value at the end of the vectorized loop. 3424 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3425 3426 // Fix the scalar body counter (PHI node). 3427 // The old induction's phi node in the scalar body needs the truncated 3428 // value. 3429 for (BasicBlock *BB : LoopBypassBlocks) 3430 BCResumeVal->addIncoming(II.getStartValue(), BB); 3431 3432 if (AdditionalBypass.first) 3433 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3434 EndValueFromAdditionalBypass); 3435 3436 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3437 } 3438 } 3439 3440 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3441 MDNode *OrigLoopID) { 3442 assert(L && "Expected valid loop."); 3443 3444 // The trip counts should be cached by now. 3445 Value *Count = getOrCreateTripCount(L); 3446 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3447 3448 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3449 3450 // Add a check in the middle block to see if we have completed 3451 // all of the iterations in the first vector loop. 3452 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3453 // If tail is to be folded, we know we don't need to run the remainder. 3454 if (!Cost->foldTailByMasking()) { 3455 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3456 Count, VectorTripCount, "cmp.n", 3457 LoopMiddleBlock->getTerminator()); 3458 3459 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3460 // of the corresponding compare because they may have ended up with 3461 // different line numbers and we want to avoid awkward line stepping while 3462 // debugging. Eg. if the compare has got a line number inside the loop. 
3463 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3464 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3465 } 3466 3467 // Get ready to start creating new instructions into the vectorized body. 3468 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3469 "Inconsistent vector loop preheader"); 3470 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3471 3472 Optional<MDNode *> VectorizedLoopID = 3473 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3474 LLVMLoopVectorizeFollowupVectorized}); 3475 if (VectorizedLoopID.hasValue()) { 3476 L->setLoopID(VectorizedLoopID.getValue()); 3477 3478 // Do not setAlreadyVectorized if loop attributes have been defined 3479 // explicitly. 3480 return LoopVectorPreHeader; 3481 } 3482 3483 // Keep all loop hints from the original loop on the vector loop (we'll 3484 // replace the vectorizer-specific hints below). 3485 if (MDNode *LID = OrigLoop->getLoopID()) 3486 L->setLoopID(LID); 3487 3488 LoopVectorizeHints Hints(L, true, *ORE); 3489 Hints.setAlreadyVectorized(); 3490 3491 #ifdef EXPENSIVE_CHECKS 3492 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3493 LI->verify(*DT); 3494 #endif 3495 3496 return LoopVectorPreHeader; 3497 } 3498 3499 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3500 /* 3501 In this function we generate a new loop. The new loop will contain 3502 the vectorized instructions while the old loop will continue to run the 3503 scalar remainder. 3504 3505 [ ] <-- loop iteration number check. 3506 / | 3507 / v 3508 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3509 | / | 3510 | / v 3511 || [ ] <-- vector pre header. 3512 |/ | 3513 | v 3514 | [ ] \ 3515 | [ ]_| <-- vector loop. 3516 | | 3517 | v 3518 | -[ ] <--- middle-block. 3519 | / | 3520 | / v 3521 -|- >[ ] <--- new preheader. 3522 | | 3523 | v 3524 | [ ] \ 3525 | [ ]_| <-- old scalar loop to handle remainder. 3526 \ | 3527 \ v 3528 >[ ] <-- exit block. 3529 ... 3530 */ 3531 3532 // Get the metadata of the original loop before it gets modified. 3533 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3534 3535 // Create an empty vector loop, and prepare basic blocks for the runtime 3536 // checks. 3537 Loop *Lp = createVectorLoopSkeleton(""); 3538 3539 // Now, compare the new count to zero. If it is zero skip the vector loop and 3540 // jump to the scalar loop. This check also covers the case where the 3541 // backedge-taken count is uint##_max: adding one to it will overflow leading 3542 // to an incorrect trip count of zero. In this (rare) case we will also jump 3543 // to the scalar loop. 3544 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3545 3546 // Generate the code to check any assumptions that we've made for SCEV 3547 // expressions. 3548 emitSCEVChecks(Lp, LoopScalarPreHeader); 3549 3550 // Generate the code that checks in runtime if arrays overlap. We put the 3551 // checks into a separate block to make the more common case of few elements 3552 // faster. 3553 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3554 3555 // Some loops have a single integer induction variable, while other loops 3556 // don't. One example is c++ iterators that often have multiple pointer 3557 // induction variables. In the code below we also support a case where we 3558 // don't have a single induction variable. 3559 // 3560 // We try to obtain an induction variable from the original loop as hard 3561 // as possible. 
However if we don't find one that: 3562 // - is an integer 3563 // - counts from zero, stepping by one 3564 // - is the size of the widest induction variable type 3565 // then we create a new one. 3566 OldInduction = Legal->getPrimaryInduction(); 3567 Type *IdxTy = Legal->getWidestInductionType(); 3568 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3569 // The loop step is equal to the vectorization factor (num of SIMD elements) 3570 // times the unroll factor (num of SIMD instructions). 3571 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3572 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3573 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3574 Induction = 3575 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3576 getDebugLocFromInstOrOperands(OldInduction)); 3577 3578 // Emit phis for the new starting index of the scalar loop. 3579 createInductionResumeValues(Lp, CountRoundDown); 3580 3581 return completeLoopSkeleton(Lp, OrigLoopID); 3582 } 3583 3584 // Fix up external users of the induction variable. At this point, we are 3585 // in LCSSA form, with all external PHIs that use the IV having one input value, 3586 // coming from the remainder loop. We need those PHIs to also have a correct 3587 // value for the IV when arriving directly from the middle block. 3588 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3589 const InductionDescriptor &II, 3590 Value *CountRoundDown, Value *EndValue, 3591 BasicBlock *MiddleBlock) { 3592 // There are two kinds of external IV usages - those that use the value 3593 // computed in the last iteration (the PHI) and those that use the penultimate 3594 // value (the value that feeds into the phi from the loop latch). 3595 // We allow both, but they, obviously, have different values. 3596 3597 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3598 3599 DenseMap<Value *, Value *> MissingVals; 3600 3601 // An external user of the last iteration's value should see the value that 3602 // the remainder loop uses to initialize its own IV. 3603 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3604 for (User *U : PostInc->users()) { 3605 Instruction *UI = cast<Instruction>(U); 3606 if (!OrigLoop->contains(UI)) { 3607 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3608 MissingVals[UI] = EndValue; 3609 } 3610 } 3611 3612 // An external user of the penultimate value need to see EndValue - Step. 3613 // The simplest way to get this is to recompute it from the constituent SCEVs, 3614 // that is Start + (Step * (CRD - 1)). 3615 for (User *U : OrigPhi->users()) { 3616 auto *UI = cast<Instruction>(U); 3617 if (!OrigLoop->contains(UI)) { 3618 const DataLayout &DL = 3619 OrigLoop->getHeader()->getModule()->getDataLayout(); 3620 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3621 3622 IRBuilder<> B(MiddleBlock->getTerminator()); 3623 Value *CountMinusOne = B.CreateSub( 3624 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3625 Value *CMO = 3626 !II.getStep()->getType()->isIntegerTy() 3627 ? 
B.CreateCast(Instruction::SIToFP, CountMinusOne, 3628 II.getStep()->getType()) 3629 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3630 CMO->setName("cast.cmo"); 3631 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3632 Escape->setName("ind.escape"); 3633 MissingVals[UI] = Escape; 3634 } 3635 } 3636 3637 for (auto &I : MissingVals) { 3638 PHINode *PHI = cast<PHINode>(I.first); 3639 // One corner case we have to handle is two IVs "chasing" each-other, 3640 // that is %IV2 = phi [...], [ %IV1, %latch ] 3641 // In this case, if IV1 has an external use, we need to avoid adding both 3642 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3643 // don't already have an incoming value for the middle block. 3644 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3645 PHI->addIncoming(I.second, MiddleBlock); 3646 } 3647 } 3648 3649 namespace { 3650 3651 struct CSEDenseMapInfo { 3652 static bool canHandle(const Instruction *I) { 3653 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3654 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3655 } 3656 3657 static inline Instruction *getEmptyKey() { 3658 return DenseMapInfo<Instruction *>::getEmptyKey(); 3659 } 3660 3661 static inline Instruction *getTombstoneKey() { 3662 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3663 } 3664 3665 static unsigned getHashValue(const Instruction *I) { 3666 assert(canHandle(I) && "Unknown instruction!"); 3667 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3668 I->value_op_end())); 3669 } 3670 3671 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3672 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3673 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3674 return LHS == RHS; 3675 return LHS->isIdenticalTo(RHS); 3676 } 3677 }; 3678 3679 } // end anonymous namespace 3680 3681 ///Perform cse of induction variable instructions. 3682 static void cse(BasicBlock *BB) { 3683 // Perform simple cse. 3684 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3685 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3686 Instruction *In = &*I++; 3687 3688 if (!CSEDenseMapInfo::canHandle(In)) 3689 continue; 3690 3691 // Check if we can replace this instruction with any of the 3692 // visited instructions. 3693 if (Instruction *V = CSEMap.lookup(In)) { 3694 In->replaceAllUsesWith(V); 3695 In->eraseFromParent(); 3696 continue; 3697 } 3698 3699 CSEMap[In] = In; 3700 } 3701 } 3702 3703 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3704 ElementCount VF, 3705 bool &NeedToScalarize) { 3706 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3707 Function *F = CI->getCalledFunction(); 3708 Type *ScalarRetTy = CI->getType(); 3709 SmallVector<Type *, 4> Tys, ScalarTys; 3710 for (auto &ArgOp : CI->arg_operands()) 3711 ScalarTys.push_back(ArgOp->getType()); 3712 3713 // Estimate cost of scalarized vector call. The source operands are assumed 3714 // to be vectors, so we need to extract individual elements from there, 3715 // execute VF scalar calls, and then gather the result into the vector return 3716 // value. 3717 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3718 TTI::TCK_RecipThroughput); 3719 if (VF.isScalar()) 3720 return ScalarCallCost; 3721 3722 // Compute corresponding vector type for return value and arguments. 
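// The code below first builds the vector return/argument types, then compares
// the scalarized estimate against a vector-library variant, roughly:
//   Cost = VF * ScalarCallCost + ScalarizationCost
// e.g. with VF = 4, ScalarCallCost = 10 and ScalarizationCost = 12 (numbers
// illustrative) the estimate is 52, and a vector call costing less than 52
// wins and clears NeedToScalarize.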
3723 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3724 for (Type *ScalarTy : ScalarTys) 3725 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3726 3727 // Compute costs of unpacking argument values for the scalar calls and 3728 // packing the return values to a vector. 3729 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3730 3731 unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3732 3733 // If we can't emit a vector call for this function, then the currently found 3734 // cost is the cost we need to return. 3735 NeedToScalarize = true; 3736 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3737 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3738 3739 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3740 return Cost; 3741 3742 // If the corresponding vector cost is cheaper, return its cost. 3743 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3744 TTI::TCK_RecipThroughput); 3745 if (VectorCallCost < Cost) { 3746 NeedToScalarize = false; 3747 return VectorCallCost; 3748 } 3749 return Cost; 3750 } 3751 3752 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3753 ElementCount VF) { 3754 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3755 assert(ID && "Expected intrinsic call!"); 3756 3757 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3758 return TTI.getIntrinsicInstrCost(CostAttrs, 3759 TargetTransformInfo::TCK_RecipThroughput); 3760 } 3761 3762 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3763 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3764 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3765 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3766 } 3767 3768 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3769 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3770 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3771 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3772 } 3773 3774 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3775 // For every instruction `I` in MinBWs, truncate the operands, create a 3776 // truncated version of `I` and reextend its result. InstCombine runs 3777 // later and will remove any ext/trunc pairs. 3778 SmallPtrSet<Value *, 4> Erased; 3779 for (const auto &KV : Cost->getMinimalBitwidths()) { 3780 // If the value wasn't vectorized, we must maintain the original scalar 3781 // type. The absence of the value from VectorLoopValueMap indicates that it 3782 // wasn't vectorized. 3783 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3784 continue; 3785 for (unsigned Part = 0; Part < UF; ++Part) { 3786 Value *I = getOrCreateVectorValue(KV.first, Part); 3787 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3788 continue; 3789 Type *OriginalTy = I->getType(); 3790 Type *ScalarTruncatedTy = 3791 IntegerType::get(OriginalTy->getContext(), KV.second); 3792 auto *TruncatedTy = FixedVectorType::get( 3793 ScalarTruncatedTy, 3794 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3795 if (TruncatedTy == OriginalTy) 3796 continue; 3797 3798 IRBuilder<> B(cast<Instruction>(I)); 3799 auto ShrinkOperand = [&](Value *V) -> Value * { 3800 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3801 if (ZI->getSrcTy() == TruncatedTy) 3802 return ZI->getOperand(0); 3803 return B.CreateZExtOrTrunc(V, TruncatedTy); 3804 }; 3805 3806 // The actual instruction modification depends on the instruction type, 3807 // unfortunately. 
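// For example (assuming VF = 4 and a minimal bit width of 8): an add on
// <4 x i32> whose result only needs 8 bits is rebuilt below as an add on
// <4 x i8> (operands shrunk via ShrinkOperand), and the result is
// zero-extended back to <4 x i32> before the original value's uses are
// replaced at the end of this loop body.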
3808 Value *NewI = nullptr; 3809 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3810 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3811 ShrinkOperand(BO->getOperand(1))); 3812 3813 // Any wrapping introduced by shrinking this operation shouldn't be 3814 // considered undefined behavior. So, we can't unconditionally copy 3815 // arithmetic wrapping flags to NewI. 3816 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3817 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3818 NewI = 3819 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3820 ShrinkOperand(CI->getOperand(1))); 3821 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3822 NewI = B.CreateSelect(SI->getCondition(), 3823 ShrinkOperand(SI->getTrueValue()), 3824 ShrinkOperand(SI->getFalseValue())); 3825 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3826 switch (CI->getOpcode()) { 3827 default: 3828 llvm_unreachable("Unhandled cast!"); 3829 case Instruction::Trunc: 3830 NewI = ShrinkOperand(CI->getOperand(0)); 3831 break; 3832 case Instruction::SExt: 3833 NewI = B.CreateSExtOrTrunc( 3834 CI->getOperand(0), 3835 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3836 break; 3837 case Instruction::ZExt: 3838 NewI = B.CreateZExtOrTrunc( 3839 CI->getOperand(0), 3840 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3841 break; 3842 } 3843 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3844 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3845 ->getNumElements(); 3846 auto *O0 = B.CreateZExtOrTrunc( 3847 SI->getOperand(0), 3848 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3849 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3850 ->getNumElements(); 3851 auto *O1 = B.CreateZExtOrTrunc( 3852 SI->getOperand(1), 3853 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3854 3855 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3856 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3857 // Don't do anything with the operands, just extend the result. 3858 continue; 3859 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3860 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3861 ->getNumElements(); 3862 auto *O0 = B.CreateZExtOrTrunc( 3863 IE->getOperand(0), 3864 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3865 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3866 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3867 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3868 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3869 ->getNumElements(); 3870 auto *O0 = B.CreateZExtOrTrunc( 3871 EE->getOperand(0), 3872 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3873 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3874 } else { 3875 // If we don't know what to do, be conservative and don't do anything. 3876 continue; 3877 } 3878 3879 // Lastly, extend the result. 3880 NewI->takeName(cast<Instruction>(I)); 3881 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3882 I->replaceAllUsesWith(Res); 3883 cast<Instruction>(I)->eraseFromParent(); 3884 Erased.insert(I); 3885 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3886 } 3887 } 3888 3889 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3890 for (const auto &KV : Cost->getMinimalBitwidths()) { 3891 // If the value wasn't vectorized, we must maintain the original scalar 3892 // type. 
The absence of the value from VectorLoopValueMap indicates that it 3893 // wasn't vectorized. 3894 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3895 continue; 3896 for (unsigned Part = 0; Part < UF; ++Part) { 3897 Value *I = getOrCreateVectorValue(KV.first, Part); 3898 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3899 if (Inst && Inst->use_empty()) { 3900 Value *NewI = Inst->getOperand(0); 3901 Inst->eraseFromParent(); 3902 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3903 } 3904 } 3905 } 3906 } 3907 3908 void InnerLoopVectorizer::fixVectorizedLoop() { 3909 // Insert truncates and extends for any truncated instructions as hints to 3910 // InstCombine. 3911 if (VF.isVector()) 3912 truncateToMinimalBitwidths(); 3913 3914 // Fix widened non-induction PHIs by setting up the PHI operands. 3915 if (OrigPHIsToFix.size()) { 3916 assert(EnableVPlanNativePath && 3917 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3918 fixNonInductionPHIs(); 3919 } 3920 3921 // At this point every instruction in the original loop is widened to a 3922 // vector form. Now we need to fix the recurrences in the loop. These PHI 3923 // nodes are currently empty because we did not want to introduce cycles. 3924 // This is the second stage of vectorizing recurrences. 3925 fixCrossIterationPHIs(); 3926 3927 // Forget the original basic block. 3928 PSE.getSE()->forgetLoop(OrigLoop); 3929 3930 // Fix-up external users of the induction variables. 3931 for (auto &Entry : Legal->getInductionVars()) 3932 fixupIVUsers(Entry.first, Entry.second, 3933 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3934 IVEndValues[Entry.first], LoopMiddleBlock); 3935 3936 fixLCSSAPHIs(); 3937 for (Instruction *PI : PredicatedInstructions) 3938 sinkScalarOperands(&*PI); 3939 3940 // Remove redundant induction instructions. 3941 cse(LoopVectorBody); 3942 3943 // Set/update profile weights for the vector and remainder loops as original 3944 // loop iterations are now distributed among them. Note that original loop 3945 // represented by LoopScalarBody becomes remainder loop after vectorization. 3946 // 3947 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3948 // end up getting slightly roughened result but that should be OK since 3949 // profile is not inherently precise anyway. Note also possible bypass of 3950 // vector code caused by legality checks is ignored, assigning all the weight 3951 // to the vector loop, optimistically. 3952 // 3953 // For scalable vectorization we can't know at compile time how many iterations 3954 // of the loop are handled in one vector iteration, so instead assume a pessimistic 3955 // vscale of '1'. 3956 setProfileInfoAfterUnrolling( 3957 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 3958 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 3959 } 3960 3961 void InnerLoopVectorizer::fixCrossIterationPHIs() { 3962 // In order to support recurrences we need to be able to vectorize Phi nodes. 3963 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3964 // stage #2: We now need to fix the recurrences by adding incoming edges to 3965 // the currently empty PHI nodes. At this point every instruction in the 3966 // original loop is widened to a vector form so we can use them to construct 3967 // the incoming edges. 3968 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 3969 // Handle first-order recurrences and reductions that need to be fixed. 
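// For example, in `b[i] = a[i] - a[i-1]` the phi carrying a[i-1] is a
// first-order recurrence, whereas in `sum += a[i]` the phi carrying sum is a
// reduction; the helpers below handle the two cases respectively.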
3970 if (Legal->isFirstOrderRecurrence(&Phi)) 3971 fixFirstOrderRecurrence(&Phi); 3972 else if (Legal->isReductionVariable(&Phi)) 3973 fixReduction(&Phi); 3974 } 3975 } 3976 3977 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3978 // This is the second phase of vectorizing first-order recurrences. An 3979 // overview of the transformation is described below. Suppose we have the 3980 // following loop. 3981 // 3982 // for (int i = 0; i < n; ++i) 3983 // b[i] = a[i] - a[i - 1]; 3984 // 3985 // There is a first-order recurrence on "a". For this loop, the shorthand 3986 // scalar IR looks like: 3987 // 3988 // scalar.ph: 3989 // s_init = a[-1] 3990 // br scalar.body 3991 // 3992 // scalar.body: 3993 // i = phi [0, scalar.ph], [i+1, scalar.body] 3994 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3995 // s2 = a[i] 3996 // b[i] = s2 - s1 3997 // br cond, scalar.body, ... 3998 // 3999 // In this example, s1 is a recurrence because it's value depends on the 4000 // previous iteration. In the first phase of vectorization, we created a 4001 // temporary value for s1. We now complete the vectorization and produce the 4002 // shorthand vector IR shown below (for VF = 4, UF = 1). 4003 // 4004 // vector.ph: 4005 // v_init = vector(..., ..., ..., a[-1]) 4006 // br vector.body 4007 // 4008 // vector.body 4009 // i = phi [0, vector.ph], [i+4, vector.body] 4010 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4011 // v2 = a[i, i+1, i+2, i+3]; 4012 // v3 = vector(v1(3), v2(0, 1, 2)) 4013 // b[i, i+1, i+2, i+3] = v2 - v3 4014 // br cond, vector.body, middle.block 4015 // 4016 // middle.block: 4017 // x = v2(3) 4018 // br scalar.ph 4019 // 4020 // scalar.ph: 4021 // s_init = phi [x, middle.block], [a[-1], otherwise] 4022 // br scalar.body 4023 // 4024 // After execution completes the vector loop, we extract the next value of 4025 // the recurrence (x) to use as the initial value in the scalar loop. 4026 4027 // Get the original loop preheader and single loop latch. 4028 auto *Preheader = OrigLoop->getLoopPreheader(); 4029 auto *Latch = OrigLoop->getLoopLatch(); 4030 4031 // Get the initial and previous values of the scalar recurrence. 4032 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4033 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4034 4035 // Create a vector from the initial value. 4036 auto *VectorInit = ScalarInit; 4037 if (VF.isVector()) { 4038 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4039 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4040 VectorInit = Builder.CreateInsertElement( 4041 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 4042 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 4043 } 4044 4045 // We constructed a temporary phi node in the first phase of vectorization. 4046 // This phi node will eventually be deleted. 4047 Builder.SetInsertPoint( 4048 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 4049 4050 // Create a phi node for the new recurrence. The current value will either be 4051 // the initial value inserted into a vector or loop-varying vector value. 4052 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4053 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4054 4055 // Get the vectorized previous value of the last part UF - 1. It appears last 4056 // among all unrolled iterations, due to the order of their construction. 
4057 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 4058 4059 // Find and set the insertion point after the previous value if it is an 4060 // instruction. 4061 BasicBlock::iterator InsertPt; 4062 // Note that the previous value may have been constant-folded so it is not 4063 // guaranteed to be an instruction in the vector loop. 4064 // FIXME: Loop invariant values do not form recurrences. We should deal with 4065 // them earlier. 4066 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4067 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4068 else { 4069 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4070 if (isa<PHINode>(PreviousLastPart)) 4071 // If the previous value is a phi node, we should insert after all the phi 4072 // nodes in the block containing the PHI to avoid breaking basic block 4073 // verification. Note that the basic block may be different to 4074 // LoopVectorBody, in case we predicate the loop. 4075 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4076 else 4077 InsertPt = ++PreviousInst->getIterator(); 4078 } 4079 Builder.SetInsertPoint(&*InsertPt); 4080 4081 // We will construct a vector for the recurrence by combining the values for 4082 // the current and previous iterations. This is the required shuffle mask. 4083 assert(!VF.isScalable()); 4084 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 4085 ShuffleMask[0] = VF.getKnownMinValue() - 1; 4086 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 4087 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 4088 4089 // The vector from which to take the initial value for the current iteration 4090 // (actual or unrolled). Initially, this is the vector phi node. 4091 Value *Incoming = VecPhi; 4092 4093 // Shuffle the current and previous vector and update the vector parts. 4094 for (unsigned Part = 0; Part < UF; ++Part) { 4095 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 4096 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 4097 auto *Shuffle = 4098 VF.isVector() 4099 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 4100 : Incoming; 4101 PhiPart->replaceAllUsesWith(Shuffle); 4102 cast<Instruction>(PhiPart)->eraseFromParent(); 4103 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 4104 Incoming = PreviousPart; 4105 } 4106 4107 // Fix the latch value of the new recurrence in the vector loop. 4108 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4109 4110 // Extract the last vector element in the middle block. This will be the 4111 // initial value for the recurrence when jumping to the scalar loop. 4112 auto *ExtractForScalar = Incoming; 4113 if (VF.isVector()) { 4114 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4115 ExtractForScalar = Builder.CreateExtractElement( 4116 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 4117 "vector.recur.extract"); 4118 } 4119 // Extract the second last element in the middle block if the 4120 // Phi is used outside the loop. We need to extract the phi itself 4121 // and not the last element (the phi update in the current iteration). This 4122 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4123 // when the scalar loop is not run at all. 
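// Continuing the shorthand example above (an illustrative sketch only,
// assuming VF = 4, UF = 1), the middle block ends up with roughly:
//
//  middle.block:
//    vector.recur.extract         = v2(3) ; last element, used as the initial
//                                         ; value for the scalar remainder loop
//    vector.recur.extract.for.phi = v2(2) ; second last element, used for phi
//                                         ; users outside the loop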
4124 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4125 if (VF.isVector()) 4126 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4127 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), 4128 "vector.recur.extract.for.phi"); 4129 // When the loop is unrolled without vectorizing, initialize 4130 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value 4131 // of `Incoming`. This is analogous to the vectorized case above: extracting the 4132 // second last element when VF > 1. 4133 else if (UF > 1) 4134 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 4135 4136 // Fix the initial value of the original recurrence in the scalar loop. 4137 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4138 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4139 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4140 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4141 Start->addIncoming(Incoming, BB); 4142 } 4143 4144 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4145 Phi->setName("scalar.recur"); 4146 4147 // Finally, fix users of the recurrence outside the loop. The users will need 4148 // either the last value of the scalar recurrence or the last value of the 4149 // vector recurrence we extracted in the middle block. Since the loop is in 4150 // LCSSA form, we just need to find all the phi nodes for the original scalar 4151 // recurrence in the exit block, and then add an edge for the middle block. 4152 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4153 if (LCSSAPhi.getIncomingValue(0) == Phi) { 4154 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4155 } 4156 } 4157 } 4158 4159 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 4160 // Get its reduction variable descriptor. 4161 assert(Legal->isReductionVariable(Phi) && 4162 "Unable to find the reduction variable"); 4163 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4164 4165 RecurKind RK = RdxDesc.getRecurrenceKind(); 4166 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4167 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4168 setDebugLocFromInst(Builder, ReductionStartValue); 4169 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); 4170 4171 // This is the vector-clone of the value that leaves the loop. 4172 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); 4173 4174 // Wrap flags are in general invalid after vectorization, clear them. 4175 clearReductionWrapFlags(RdxDesc); 4176 4177 // Fix the vector-loop phi. 4178 4179 // Reductions do not have to start at zero. They can start with 4180 // any loop invariant values. 4181 BasicBlock *Latch = OrigLoop->getLoopLatch(); 4182 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 4183 4184 for (unsigned Part = 0; Part < UF; ++Part) { 4185 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 4186 Value *Val = getOrCreateVectorValue(LoopVal, Part); 4187 cast<PHINode>(VecRdxPhi) 4188 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4189 } 4190 4191 // Before each round, move the insertion point right between 4192 // the PHIs and the values we are going to write. 4193 // This allows us to write both PHINodes and the extractelement 4194 // instructions.
4195 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4196 4197 setDebugLocFromInst(Builder, LoopExitInst); 4198 4199 // If tail is folded by masking, the vector value to leave the loop should be 4200 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4201 // instead of the former. For an inloop reduction the reduction will already 4202 // be predicated, and does not need to be handled here. 4203 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4204 for (unsigned Part = 0; Part < UF; ++Part) { 4205 Value *VecLoopExitInst = 4206 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4207 Value *Sel = nullptr; 4208 for (User *U : VecLoopExitInst->users()) { 4209 if (isa<SelectInst>(U)) { 4210 assert(!Sel && "Reduction exit feeding two selects"); 4211 Sel = U; 4212 } else 4213 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4214 } 4215 assert(Sel && "Reduction exit feeds no select"); 4216 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 4217 4218 // If the target can create a predicated operator for the reduction at no 4219 // extra cost in the loop (for example a predicated vadd), it can be 4220 // cheaper for the select to remain in the loop than be sunk out of it, 4221 // and so use the select value for the phi instead of the old 4222 // LoopExitValue. 4223 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4224 if (PreferPredicatedReductionSelect || 4225 TTI->preferPredicatedReductionSelect( 4226 RdxDesc.getOpcode(), Phi->getType(), 4227 TargetTransformInfo::ReductionFlags())) { 4228 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); 4229 VecRdxPhi->setIncomingValueForBlock( 4230 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4231 } 4232 } 4233 } 4234 4235 // If the vector reduction can be performed in a smaller type, we truncate 4236 // then extend the loop exit value to enable InstCombine to evaluate the 4237 // entire expression in the smaller type. 4238 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4239 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4240 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4241 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4242 Builder.SetInsertPoint( 4243 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4244 VectorParts RdxParts(UF); 4245 for (unsigned Part = 0; Part < UF; ++Part) { 4246 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4247 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4248 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4249 : Builder.CreateZExt(Trunc, VecTy); 4250 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4251 UI != RdxParts[Part]->user_end();) 4252 if (*UI != Trunc) { 4253 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4254 RdxParts[Part] = Extnd; 4255 } else { 4256 ++UI; 4257 } 4258 } 4259 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4260 for (unsigned Part = 0; Part < UF; ++Part) { 4261 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4262 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 4263 } 4264 } 4265 4266 // Reduce all of the unrolled parts into a single vector. 
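// As an illustrative sketch (value names are made up; assuming UF = 2, VF = 4
// and an integer add reduction), the two unrolled parts are combined as
//   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
// before the final horizontal reduction is created below.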
4267 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 4268 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4269 4270 // The middle block terminator has already been assigned a DebugLoc here (the 4271 // OrigLoop's single latch terminator). We want the whole middle block to 4272 // appear to execute on this line because: (a) it is all compiler generated, 4273 // (b) these instructions are always executed after evaluating the latch 4274 // conditional branch, and (c) other passes may add new predecessors which 4275 // terminate on this line. This is the easiest way to ensure we don't 4276 // accidentally cause an extra step back into the loop while debugging. 4277 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4278 for (unsigned Part = 1; Part < UF; ++Part) { 4279 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4280 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 4281 // Floating point operations had to be 'fast' to enable the reduction. 4282 ReducedPartRdx = addFastMathFlag( 4283 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 4284 ReducedPartRdx, "bin.rdx"), 4285 RdxDesc.getFastMathFlags()); 4286 else 4287 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4288 } 4289 4290 // Create the reduction after the loop. Note that inloop reductions create the 4291 // target reduction in the loop using a Reduction recipe. 4292 if (VF.isVector() && !IsInLoopReductionPhi) { 4293 ReducedPartRdx = 4294 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); 4295 // If the reduction can be performed in a smaller type, we need to extend 4296 // the reduction to the wider type before we branch to the original loop. 4297 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4298 ReducedPartRdx = 4299 RdxDesc.isSigned() 4300 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4301 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4302 } 4303 4304 // Create a phi node that merges control-flow from the backedge-taken check 4305 // block and the middle block. 4306 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4307 LoopScalarPreHeader->getTerminator()); 4308 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4309 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4310 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4311 4312 // Now, we need to fix the users of the reduction variable 4313 // inside and outside of the scalar remainder loop. 4314 // We know that the loop is in LCSSA form. We need to update the 4315 // PHI nodes in the exit blocks. 4316 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4317 // All PHINodes need to have a single entry edge, or two if 4318 // we already fixed them. 4319 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 4320 4321 // We found a reduction value exit-PHI. Update it with the 4322 // incoming bypass edge. 4323 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 4324 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4325 } // end of the LCSSA phi scan. 4326 4327 // Fix the scalar loop reduction variable with the incoming reduction sum 4328 // from the vector body and from the backedge value. 4329 int IncomingEdgeBlockIdx = 4330 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4331 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4332 // Pick the other block. 4333 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 4334 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4335 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4336 } 4337 4338 void InnerLoopVectorizer::clearReductionWrapFlags( 4339 RecurrenceDescriptor &RdxDesc) { 4340 RecurKind RK = RdxDesc.getRecurrenceKind(); 4341 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4342 return; 4343 4344 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4345 assert(LoopExitInstr && "null loop exit instruction"); 4346 SmallVector<Instruction *, 8> Worklist; 4347 SmallPtrSet<Instruction *, 8> Visited; 4348 Worklist.push_back(LoopExitInstr); 4349 Visited.insert(LoopExitInstr); 4350 4351 while (!Worklist.empty()) { 4352 Instruction *Cur = Worklist.pop_back_val(); 4353 if (isa<OverflowingBinaryOperator>(Cur)) 4354 for (unsigned Part = 0; Part < UF; ++Part) { 4355 Value *V = getOrCreateVectorValue(Cur, Part); 4356 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4357 } 4358 4359 for (User *U : Cur->users()) { 4360 Instruction *UI = cast<Instruction>(U); 4361 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4362 Visited.insert(UI).second) 4363 Worklist.push_back(UI); 4364 } 4365 } 4366 } 4367 4368 void InnerLoopVectorizer::fixLCSSAPHIs() { 4369 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4370 if (LCSSAPhi.getNumIncomingValues() == 1) { 4371 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4372 // Non-instruction incoming values will have only one value. 4373 unsigned LastLane = 0; 4374 if (isa<Instruction>(IncomingValue)) 4375 LastLane = Cost->isUniformAfterVectorization( 4376 cast<Instruction>(IncomingValue), VF) 4377 ? 0 4378 : VF.getKnownMinValue() - 1; 4379 assert((!VF.isScalable() || LastLane == 0) && 4380 "scalable vectors dont support non-uniform scalars yet"); 4381 // Can be a loop invariant incoming value or the last scalar value to be 4382 // extracted from the vectorized loop. 4383 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4384 Value *lastIncomingValue = 4385 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 4386 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4387 } 4388 } 4389 } 4390 4391 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4392 // The basic block and loop containing the predicated instruction. 4393 auto *PredBB = PredInst->getParent(); 4394 auto *VectorLoop = LI->getLoopFor(PredBB); 4395 4396 // Initialize a worklist with the operands of the predicated instruction. 4397 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4398 4399 // Holds instructions that we need to analyze again. An instruction may be 4400 // reanalyzed if we don't yet know if we can sink it or not. 4401 SmallVector<Instruction *, 8> InstsToReanalyze; 4402 4403 // Returns true if a given use occurs in the predicated block. Phi nodes use 4404 // their operands in their corresponding predecessor blocks. 4405 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4406 auto *I = cast<Instruction>(U.getUser()); 4407 BasicBlock *BB = I->getParent(); 4408 if (auto *Phi = dyn_cast<PHINode>(I)) 4409 BB = Phi->getIncomingBlock( 4410 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4411 return BB == PredBB; 4412 }; 4413 4414 // Iteratively sink the scalarized operands of the predicated instruction 4415 // into the block we created for it. When an instruction is sunk, it's 4416 // operands are then added to the worklist. The algorithm ends after one pass 4417 // through the worklist doesn't sink a single instruction. 
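// As an illustrative sketch (names are made up), if a scalarized address
// computation such as
//   %addr = getelementptr inbounds i32, i32* %a, i64 %idx
// is emitted in the vector loop body but is only used by a store inside the
// predicated block created for it, it can be moved into that block so it only
// executes when the mask is true; its operands are then reconsidered in turn.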
4418 bool Changed; 4419 do { 4420 // Add the instructions that need to be reanalyzed to the worklist, and 4421 // reset the changed indicator. 4422 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4423 InstsToReanalyze.clear(); 4424 Changed = false; 4425 4426 while (!Worklist.empty()) { 4427 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4428 4429 // We can't sink an instruction if it is a phi node, is already in the 4430 // predicated block, is not in the loop, or may have side effects. 4431 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4432 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4433 continue; 4434 4435 // It's legal to sink the instruction if all its uses occur in the 4436 // predicated block. Otherwise, there's nothing to do yet, and we may 4437 // need to reanalyze the instruction. 4438 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4439 InstsToReanalyze.push_back(I); 4440 continue; 4441 } 4442 4443 // Move the instruction to the beginning of the predicated block, and add 4444 // it's operands to the worklist. 4445 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4446 Worklist.insert(I->op_begin(), I->op_end()); 4447 4448 // The sinking may have enabled other instructions to be sunk, so we will 4449 // need to iterate. 4450 Changed = true; 4451 } 4452 } while (Changed); 4453 } 4454 4455 void InnerLoopVectorizer::fixNonInductionPHIs() { 4456 for (PHINode *OrigPhi : OrigPHIsToFix) { 4457 PHINode *NewPhi = 4458 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4459 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4460 4461 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4462 predecessors(OrigPhi->getParent())); 4463 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4464 predecessors(NewPhi->getParent())); 4465 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4466 "Scalar and Vector BB should have the same number of predecessors"); 4467 4468 // The insertion point in Builder may be invalidated by the time we get 4469 // here. Force the Builder insertion point to something valid so that we do 4470 // not run into issues during insertion point restore in 4471 // getOrCreateVectorValue calls below. 4472 Builder.SetInsertPoint(NewPhi); 4473 4474 // The predecessor order is preserved and we can rely on mapping between 4475 // scalar and vector block predecessors. 4476 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4477 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4478 4479 // When looking up the new scalar/vector values to fix up, use incoming 4480 // values from original phi. 4481 Value *ScIncV = 4482 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4483 4484 // Scalar incoming value may need a broadcast 4485 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4486 NewPhi->addIncoming(NewIncV, NewPredBB); 4487 } 4488 } 4489 } 4490 4491 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4492 VPUser &Operands, unsigned UF, 4493 ElementCount VF, bool IsPtrLoopInvariant, 4494 SmallBitVector &IsIndexLoopInvariant, 4495 VPTransformState &State) { 4496 // Construct a vector GEP by widening the operands of the scalar GEP as 4497 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4498 // results in a vector of pointers when at least one operand of the GEP 4499 // is vector-typed. Thus, to keep the representation compact, we only use 4500 // vector-typed operands for loop-varying values. 
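// For example (shorthand; names are illustrative, VF = 4), a scalar GEP with a
// loop-invariant base and a loop-varying index
//   %gep = getelementptr inbounds i32, i32* %a, i64 %iv
// is widened to a GEP producing a vector of pointers
//   %gep = getelementptr inbounds i32, i32* %a, <4 x i64> %vec.iv
// where only the loop-varying index operand is vector-typed.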
4501 4502 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4503 // If we are vectorizing, but the GEP has only loop-invariant operands, 4504 // the GEP we build (by only using vector-typed operands for 4505 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4506 // produce a vector of pointers, we need to either arbitrarily pick an 4507 // operand to broadcast, or broadcast a clone of the original GEP. 4508 // Here, we broadcast a clone of the original. 4509 // 4510 // TODO: If at some point we decide to scalarize instructions having 4511 // loop-invariant operands, this special case will no longer be 4512 // required. We would add the scalarization decision to 4513 // collectLoopScalars() and teach getVectorValue() to broadcast 4514 // the lane-zero scalar value. 4515 auto *Clone = Builder.Insert(GEP->clone()); 4516 for (unsigned Part = 0; Part < UF; ++Part) { 4517 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4518 State.set(VPDef, GEP, EntryPart, Part); 4519 addMetadata(EntryPart, GEP); 4520 } 4521 } else { 4522 // If the GEP has at least one loop-varying operand, we are sure to 4523 // produce a vector of pointers. But if we are only unrolling, we want 4524 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4525 // produce with the code below will be scalar (if VF == 1) or vector 4526 // (otherwise). Note that for the unroll-only case, we still maintain 4527 // values in the vector mapping with initVector, as we do for other 4528 // instructions. 4529 for (unsigned Part = 0; Part < UF; ++Part) { 4530 // The pointer operand of the new GEP. If it's loop-invariant, we 4531 // won't broadcast it. 4532 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) 4533 : State.get(Operands.getOperand(0), Part); 4534 4535 // Collect all the indices for the new GEP. If any index is 4536 // loop-invariant, we won't broadcast it. 4537 SmallVector<Value *, 4> Indices; 4538 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4539 VPValue *Operand = Operands.getOperand(I); 4540 if (IsIndexLoopInvariant[I - 1]) 4541 Indices.push_back(State.get(Operand, {0, 0})); 4542 else 4543 Indices.push_back(State.get(Operand, Part)); 4544 } 4545 4546 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4547 // but it should be a vector, otherwise. 4548 auto *NewGEP = 4549 GEP->isInBounds() 4550 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4551 Indices) 4552 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4553 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4554 "NewGEP is not a pointer vector"); 4555 State.set(VPDef, GEP, NewGEP, Part); 4556 addMetadata(NewGEP, GEP); 4557 } 4558 } 4559 } 4560 4561 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4562 RecurrenceDescriptor *RdxDesc, 4563 Value *StartV, unsigned UF, 4564 ElementCount VF) { 4565 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4566 PHINode *P = cast<PHINode>(PN); 4567 if (EnableVPlanNativePath) { 4568 // Currently we enter here in the VPlan-native path for non-induction 4569 // PHIs where all control flow is uniform. We simply widen these PHIs. 4570 // Create a vector phi with no operands - the vector phi operands will be 4571 // set at the end of vector code generation. 4572 Type *VecTy = 4573 (VF.isScalar()) ? 
PN->getType() : VectorType::get(PN->getType(), VF); 4574 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4575 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4576 OrigPHIsToFix.push_back(P); 4577 4578 return; 4579 } 4580 4581 assert(PN->getParent() == OrigLoop->getHeader() && 4582 "Non-header phis should have been handled elsewhere"); 4583 4584 // In order to support recurrences we need to be able to vectorize Phi nodes. 4585 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4586 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4587 // this value when we vectorize all of the instructions that use the PHI. 4588 if (RdxDesc || Legal->isFirstOrderRecurrence(P)) { 4589 Value *Iden = nullptr; 4590 bool ScalarPHI = 4591 (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4592 Type *VecTy = 4593 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); 4594 4595 if (RdxDesc) { 4596 assert(Legal->isReductionVariable(P) && StartV && 4597 "RdxDesc should only be set for reduction variables; in that case " 4598 "a StartV is also required"); 4599 RecurKind RK = RdxDesc->getRecurrenceKind(); 4600 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { 4601 // MinMax reductions have the start value as their identity. 4602 if (ScalarPHI) { 4603 Iden = StartV; 4604 } else { 4605 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4606 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4607 StartV = Iden = Builder.CreateVectorSplat(VF, StartV, "minmax.ident"); 4608 } 4609 } else { 4610 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity( 4611 RK, VecTy->getScalarType()); 4612 Iden = IdenC; 4613 4614 if (!ScalarPHI) { 4615 Iden = ConstantVector::getSplat(VF, IdenC); 4616 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4617 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4618 Constant *Zero = Builder.getInt32(0); 4619 StartV = Builder.CreateInsertElement(Iden, StartV, Zero); 4620 } 4621 } 4622 } 4623 4624 for (unsigned Part = 0; Part < UF; ++Part) { 4625 // This is phase one of vectorizing PHIs. 4626 Value *EntryPart = PHINode::Create( 4627 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4628 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4629 if (StartV) { 4630 // Make sure to add the reduction start value only to the 4631 // first unroll part. 4632 Value *StartVal = (Part == 0) ? StartV : Iden; 4633 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader); 4634 } 4635 } 4636 return; 4637 } 4638 4639 assert(!Legal->isReductionVariable(P) && 4640 "reductions should be handled above"); 4641 4642 setDebugLocFromInst(Builder, P); 4643 4644 // This PHINode must be an induction variable. 4645 // Make sure that we know about it. 4646 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4647 4648 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4649 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4650 4651 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4652 // which can be found from the original scalar operations.
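// For the pointer induction case handled below, an illustrative sketch of the
// non-scalar lowering (value names are approximate; assuming VF = 4, UF = 1 and
// a step of one i32 element) is:
//   %pointer.phi = phi i32* [ %start, %vector.ph ], [ %ptr.ind, %loop.latch ]
//   %vec.gep     = getelementptr i32, i32* %pointer.phi,
//                      <4 x i64> <i64 0, i64 1, i64 2, i64 3>
//   %ptr.ind     = getelementptr i32, i32* %pointer.phi, i64 4 ; step * VF * UF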
4653 switch (II.getKind()) { 4654 case InductionDescriptor::IK_NoInduction: 4655 llvm_unreachable("Unknown induction"); 4656 case InductionDescriptor::IK_IntInduction: 4657 case InductionDescriptor::IK_FpInduction: 4658 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4659 case InductionDescriptor::IK_PtrInduction: { 4660 // Handle the pointer induction variable case. 4661 assert(P->getType()->isPointerTy() && "Unexpected type."); 4662 4663 if (Cost->isScalarAfterVectorization(P, VF)) { 4664 // This is the normalized GEP that starts counting at zero. 4665 Value *PtrInd = 4666 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4667 // Determine the number of scalars we need to generate for each unroll 4668 // iteration. If the instruction is uniform, we only need to generate the 4669 // first lane. Otherwise, we generate all VF values. 4670 unsigned Lanes = 4671 Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue(); 4672 for (unsigned Part = 0; Part < UF; ++Part) { 4673 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4674 Constant *Idx = ConstantInt::get(PtrInd->getType(), 4675 Lane + Part * VF.getKnownMinValue()); 4676 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4677 Value *SclrGep = 4678 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4679 SclrGep->setName("next.gep"); 4680 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4681 } 4682 } 4683 return; 4684 } 4685 assert(isa<SCEVConstant>(II.getStep()) && 4686 "Induction step not a SCEV constant!"); 4687 Type *PhiType = II.getStep()->getType(); 4688 4689 // Build a pointer phi 4690 Value *ScalarStartValue = II.getStartValue(); 4691 Type *ScStValueType = ScalarStartValue->getType(); 4692 PHINode *NewPointerPhi = 4693 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4694 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4695 4696 // A pointer induction, performed by using a gep 4697 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4698 Instruction *InductionLoc = LoopLatch->getTerminator(); 4699 const SCEV *ScalarStep = II.getStep(); 4700 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4701 Value *ScalarStepValue = 4702 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4703 Value *InductionGEP = GetElementPtrInst::Create( 4704 ScStValueType->getPointerElementType(), NewPointerPhi, 4705 Builder.CreateMul( 4706 ScalarStepValue, 4707 ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), 4708 "ptr.ind", InductionLoc); 4709 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4710 4711 // Create UF many actual address geps that use the pointer 4712 // phi as base and a vectorized version of the step value 4713 // (<step*0, ..., step*N>) as offset. 4714 for (unsigned Part = 0; Part < UF; ++Part) { 4715 SmallVector<Constant *, 8> Indices; 4716 // Create a vector of consecutive numbers from zero to VF. 
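// For example (illustrative), with VF = 4 this is <0, 1, 2, 3> for Part 0 and
// <4, 5, 6, 7> for Part 1, before it is scaled by the step value below.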
4717 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 4718 Indices.push_back( 4719 ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); 4720 Constant *StartOffset = ConstantVector::get(Indices); 4721 4722 Value *GEP = Builder.CreateGEP( 4723 ScStValueType->getPointerElementType(), NewPointerPhi, 4724 Builder.CreateMul( 4725 StartOffset, 4726 Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), 4727 "vector.gep")); 4728 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4729 } 4730 } 4731 } 4732 } 4733 4734 /// A helper function for checking whether an integer division-related 4735 /// instruction may divide by zero (in which case it must be predicated if 4736 /// executed conditionally in the scalar code). 4737 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4738 /// Non-zero divisors that are non compile-time constants will not be 4739 /// converted into multiplication, so we will still end up scalarizing 4740 /// the division, but can do so w/o predication. 4741 static bool mayDivideByZero(Instruction &I) { 4742 assert((I.getOpcode() == Instruction::UDiv || 4743 I.getOpcode() == Instruction::SDiv || 4744 I.getOpcode() == Instruction::URem || 4745 I.getOpcode() == Instruction::SRem) && 4746 "Unexpected instruction"); 4747 Value *Divisor = I.getOperand(1); 4748 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4749 return !CInt || CInt->isZero(); 4750 } 4751 4752 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4753 VPUser &User, 4754 VPTransformState &State) { 4755 switch (I.getOpcode()) { 4756 case Instruction::Call: 4757 case Instruction::Br: 4758 case Instruction::PHI: 4759 case Instruction::GetElementPtr: 4760 case Instruction::Select: 4761 llvm_unreachable("This instruction is handled by a different recipe."); 4762 case Instruction::UDiv: 4763 case Instruction::SDiv: 4764 case Instruction::SRem: 4765 case Instruction::URem: 4766 case Instruction::Add: 4767 case Instruction::FAdd: 4768 case Instruction::Sub: 4769 case Instruction::FSub: 4770 case Instruction::FNeg: 4771 case Instruction::Mul: 4772 case Instruction::FMul: 4773 case Instruction::FDiv: 4774 case Instruction::FRem: 4775 case Instruction::Shl: 4776 case Instruction::LShr: 4777 case Instruction::AShr: 4778 case Instruction::And: 4779 case Instruction::Or: 4780 case Instruction::Xor: { 4781 // Just widen unops and binops. 4782 setDebugLocFromInst(Builder, &I); 4783 4784 for (unsigned Part = 0; Part < UF; ++Part) { 4785 SmallVector<Value *, 2> Ops; 4786 for (VPValue *VPOp : User.operands()) 4787 Ops.push_back(State.get(VPOp, Part)); 4788 4789 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4790 4791 if (auto *VecOp = dyn_cast<Instruction>(V)) 4792 VecOp->copyIRFlags(&I); 4793 4794 // Use this vector value for all users of the original instruction. 4795 State.set(Def, &I, V, Part); 4796 addMetadata(V, &I); 4797 } 4798 4799 break; 4800 } 4801 case Instruction::ICmp: 4802 case Instruction::FCmp: { 4803 // Widen compares. Generate vector compares. 4804 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4805 auto *Cmp = cast<CmpInst>(&I); 4806 setDebugLocFromInst(Builder, Cmp); 4807 for (unsigned Part = 0; Part < UF; ++Part) { 4808 Value *A = State.get(User.getOperand(0), Part); 4809 Value *B = State.get(User.getOperand(1), Part); 4810 Value *C = nullptr; 4811 if (FCmp) { 4812 // Propagate fast math flags. 
4813 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4814 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4815 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4816 } else { 4817 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4818 } 4819 State.set(Def, &I, C, Part); 4820 addMetadata(C, &I); 4821 } 4822 4823 break; 4824 } 4825 4826 case Instruction::ZExt: 4827 case Instruction::SExt: 4828 case Instruction::FPToUI: 4829 case Instruction::FPToSI: 4830 case Instruction::FPExt: 4831 case Instruction::PtrToInt: 4832 case Instruction::IntToPtr: 4833 case Instruction::SIToFP: 4834 case Instruction::UIToFP: 4835 case Instruction::Trunc: 4836 case Instruction::FPTrunc: 4837 case Instruction::BitCast: { 4838 auto *CI = cast<CastInst>(&I); 4839 setDebugLocFromInst(Builder, CI); 4840 4841 /// Vectorize casts. 4842 Type *DestTy = 4843 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4844 4845 for (unsigned Part = 0; Part < UF; ++Part) { 4846 Value *A = State.get(User.getOperand(0), Part); 4847 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4848 State.set(Def, &I, Cast, Part); 4849 addMetadata(Cast, &I); 4850 } 4851 break; 4852 } 4853 default: 4854 // This instruction is not vectorized by simple widening. 4855 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4856 llvm_unreachable("Unhandled instruction!"); 4857 } // end of switch. 4858 } 4859 4860 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4861 VPUser &ArgOperands, 4862 VPTransformState &State) { 4863 assert(!isa<DbgInfoIntrinsic>(I) && 4864 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4865 setDebugLocFromInst(Builder, &I); 4866 4867 Module *M = I.getParent()->getParent()->getParent(); 4868 auto *CI = cast<CallInst>(&I); 4869 4870 SmallVector<Type *, 4> Tys; 4871 for (Value *ArgOperand : CI->arg_operands()) 4872 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4873 4874 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4875 4876 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4877 // version of the instruction. 4878 // Is it beneficial to perform intrinsic call compared to lib call? 4879 bool NeedToScalarize = false; 4880 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4881 bool UseVectorIntrinsic = 4882 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4883 assert((UseVectorIntrinsic || !NeedToScalarize) && 4884 "Instruction should be scalarized elsewhere."); 4885 4886 for (unsigned Part = 0; Part < UF; ++Part) { 4887 SmallVector<Value *, 4> Args; 4888 for (auto &I : enumerate(ArgOperands.operands())) { 4889 // Some intrinsics have a scalar argument - don't replace it with a 4890 // vector. 4891 Value *Arg; 4892 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4893 Arg = State.get(I.value(), Part); 4894 else 4895 Arg = State.get(I.value(), {0, 0}); 4896 Args.push_back(Arg); 4897 } 4898 4899 Function *VectorF; 4900 if (UseVectorIntrinsic) { 4901 // Use vector version of the intrinsic. 4902 Type *TysForDecl[] = {CI->getType()}; 4903 if (VF.isVector()) { 4904 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4905 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4906 } 4907 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4908 assert(VectorF && "Can't retrieve vector intrinsic."); 4909 } else { 4910 // Use vector version of the function call. 
4911 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4912 #ifndef NDEBUG 4913 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4914 "Can't create vector function."); 4915 #endif 4916 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4917 } 4918 SmallVector<OperandBundleDef, 1> OpBundles; 4919 CI->getOperandBundlesAsDefs(OpBundles); 4920 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4921 4922 if (isa<FPMathOperator>(V)) 4923 V->copyFastMathFlags(CI); 4924 4925 State.set(Def, &I, V, Part); 4926 addMetadata(V, &I); 4927 } 4928 } 4929 4930 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 4931 VPUser &Operands, 4932 bool InvariantCond, 4933 VPTransformState &State) { 4934 setDebugLocFromInst(Builder, &I); 4935 4936 // The condition can be loop invariant but still defined inside the 4937 // loop. This means that we can't just use the original 'cond' value. 4938 // We have to take the 'vectorized' value and pick the first lane. 4939 // Instcombine will make this a no-op. 4940 auto *InvarCond = 4941 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4942 4943 for (unsigned Part = 0; Part < UF; ++Part) { 4944 Value *Cond = 4945 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4946 Value *Op0 = State.get(Operands.getOperand(1), Part); 4947 Value *Op1 = State.get(Operands.getOperand(2), Part); 4948 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4949 State.set(VPDef, &I, Sel, Part); 4950 addMetadata(Sel, &I); 4951 } 4952 } 4953 4954 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4955 // We should not collect Scalars more than once per VF. Right now, this 4956 // function is called from collectUniformsAndScalars(), which already does 4957 // this check. Collecting Scalars for VF=1 does not make any sense. 4958 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4959 "This function should not be visited twice for the same VF"); 4960 4961 SmallSetVector<Instruction *, 8> Worklist; 4962 4963 // These sets are used to seed the analysis with pointers used by memory 4964 // accesses that will remain scalar. 4965 SmallSetVector<Instruction *, 8> ScalarPtrs; 4966 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4967 auto *Latch = TheLoop->getLoopLatch(); 4968 4969 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4970 // The pointer operands of loads and stores will be scalar as long as the 4971 // memory access is not a gather or scatter operation. The value operand of a 4972 // store will remain scalar if the store is scalarized. 4973 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4974 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4975 assert(WideningDecision != CM_Unknown && 4976 "Widening decision should be ready at this moment"); 4977 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4978 if (Ptr == Store->getValueOperand()) 4979 return WideningDecision == CM_Scalarize; 4980 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4981 "Ptr is neither a value or pointer operand"); 4982 return WideningDecision != CM_GatherScatter; 4983 }; 4984 4985 // A helper that returns true if the given value is a bitcast or 4986 // getelementptr instruction contained in the loop. 
4987 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4988 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4989 isa<GetElementPtrInst>(V)) && 4990 !TheLoop->isLoopInvariant(V); 4991 }; 4992 4993 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 4994 if (!isa<PHINode>(Ptr) || 4995 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 4996 return false; 4997 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 4998 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 4999 return false; 5000 return isScalarUse(MemAccess, Ptr); 5001 }; 5002 5003 // A helper that evaluates a memory access's use of a pointer. If the 5004 // pointer is actually the pointer induction of a loop, it is 5005 // inserted into the Worklist. If the use will be a scalar use, and the 5006 // pointer is only used by memory accesses, we place the pointer in 5007 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5008 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5009 if (isScalarPtrInduction(MemAccess, Ptr)) { 5010 Worklist.insert(cast<Instruction>(Ptr)); 5011 Instruction *Update = cast<Instruction>( 5012 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5013 Worklist.insert(Update); 5014 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5015 << "\n"); 5016 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 5017 << "\n"); 5018 return; 5019 } 5020 // We only care about bitcast and getelementptr instructions contained in 5021 // the loop. 5022 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5023 return; 5024 5025 // If the pointer has already been identified as scalar (e.g., if it was 5026 // also identified as uniform), there's nothing to do. 5027 auto *I = cast<Instruction>(Ptr); 5028 if (Worklist.count(I)) 5029 return; 5030 5031 // If the use of the pointer will be a scalar use, and all users of the 5032 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5033 // place the pointer in PossibleNonScalarPtrs. 5034 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5035 return isa<LoadInst>(U) || isa<StoreInst>(U); 5036 })) 5037 ScalarPtrs.insert(I); 5038 else 5039 PossibleNonScalarPtrs.insert(I); 5040 }; 5041 5042 // We seed the scalars analysis with two classes of instructions: (1) 5043 // instructions marked uniform-after-vectorization and (2) bitcast, 5044 // getelementptr and (pointer) phi instructions used by memory accesses 5045 // requiring a scalar use. 5046 // 5047 // (1) Add to the worklist all instructions that have been identified as 5048 // uniform-after-vectorization. 5049 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5050 5051 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5052 // memory accesses requiring a scalar use. The pointer operands of loads and 5053 // stores will be scalar as long as the memory access is not a gather or 5054 // scatter operation. The value operand of a store will remain scalar if the 5055 // store is scalarized.
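// As an illustrative sketch (names are made up), given
//   %gep = getelementptr inbounds i32, i32* %a, i64 %iv
//   store i32 %x, i32* %gep
// if the store is not lowered to a scatter, its use of %gep only demands a
// scalar pointer, so %gep is seeded into ScalarPtrs provided all of its users
// are loads or stores.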
5056 for (auto *BB : TheLoop->blocks()) 5057 for (auto &I : *BB) { 5058 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5059 evaluatePtrUse(Load, Load->getPointerOperand()); 5060 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5061 evaluatePtrUse(Store, Store->getPointerOperand()); 5062 evaluatePtrUse(Store, Store->getValueOperand()); 5063 } 5064 } 5065 for (auto *I : ScalarPtrs) 5066 if (!PossibleNonScalarPtrs.count(I)) { 5067 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5068 Worklist.insert(I); 5069 } 5070 5071 // Insert the forced scalars. 5072 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5073 // induction variable when the PHI user is scalarized. 5074 auto ForcedScalar = ForcedScalars.find(VF); 5075 if (ForcedScalar != ForcedScalars.end()) 5076 for (auto *I : ForcedScalar->second) 5077 Worklist.insert(I); 5078 5079 // Expand the worklist by looking through any bitcasts and getelementptr 5080 // instructions we've already identified as scalar. This is similar to the 5081 // expansion step in collectLoopUniforms(); however, here we're only 5082 // expanding to include additional bitcasts and getelementptr instructions. 5083 unsigned Idx = 0; 5084 while (Idx != Worklist.size()) { 5085 Instruction *Dst = Worklist[Idx++]; 5086 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5087 continue; 5088 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5089 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5090 auto *J = cast<Instruction>(U); 5091 return !TheLoop->contains(J) || Worklist.count(J) || 5092 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5093 isScalarUse(J, Src)); 5094 })) { 5095 Worklist.insert(Src); 5096 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5097 } 5098 } 5099 5100 // An induction variable will remain scalar if all users of the induction 5101 // variable and induction variable update remain scalar. 5102 for (auto &Induction : Legal->getInductionVars()) { 5103 auto *Ind = Induction.first; 5104 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5105 5106 // If tail-folding is applied, the primary induction variable will be used 5107 // to feed a vector compare. 5108 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5109 continue; 5110 5111 // Determine if all users of the induction variable are scalar after 5112 // vectorization. 5113 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5114 auto *I = cast<Instruction>(U); 5115 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5116 }); 5117 if (!ScalarInd) 5118 continue; 5119 5120 // Determine if all users of the induction variable update instruction are 5121 // scalar after vectorization. 5122 auto ScalarIndUpdate = 5123 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5124 auto *I = cast<Instruction>(U); 5125 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5126 }); 5127 if (!ScalarIndUpdate) 5128 continue; 5129 5130 // The induction variable and its update instruction will remain scalar. 
5131 Worklist.insert(Ind); 5132 Worklist.insert(IndUpdate); 5133 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5134 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5135 << "\n"); 5136 } 5137 5138 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5139 } 5140 5141 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 5142 ElementCount VF) { 5143 if (!blockNeedsPredication(I->getParent())) 5144 return false; 5145 switch(I->getOpcode()) { 5146 default: 5147 break; 5148 case Instruction::Load: 5149 case Instruction::Store: { 5150 if (!Legal->isMaskRequired(I)) 5151 return false; 5152 auto *Ptr = getLoadStorePointerOperand(I); 5153 auto *Ty = getMemInstValueType(I); 5154 // We have already decided how to vectorize this instruction, get that 5155 // result. 5156 if (VF.isVector()) { 5157 InstWidening WideningDecision = getWideningDecision(I, VF); 5158 assert(WideningDecision != CM_Unknown && 5159 "Widening decision should be ready at this moment"); 5160 return WideningDecision == CM_Scalarize; 5161 } 5162 const Align Alignment = getLoadStoreAlignment(I); 5163 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5164 isLegalMaskedGather(Ty, Alignment)) 5165 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5166 isLegalMaskedScatter(Ty, Alignment)); 5167 } 5168 case Instruction::UDiv: 5169 case Instruction::SDiv: 5170 case Instruction::SRem: 5171 case Instruction::URem: 5172 return mayDivideByZero(*I); 5173 } 5174 return false; 5175 } 5176 5177 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5178 Instruction *I, ElementCount VF) { 5179 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5180 assert(getWideningDecision(I, VF) == CM_Unknown && 5181 "Decision should not be set yet."); 5182 auto *Group = getInterleavedAccessGroup(I); 5183 assert(Group && "Must have a group."); 5184 5185 // If the instruction's allocated size doesn't equal it's type size, it 5186 // requires padding and will be scalarized. 5187 auto &DL = I->getModule()->getDataLayout(); 5188 auto *ScalarTy = getMemInstValueType(I); 5189 if (hasIrregularType(ScalarTy, DL, VF)) 5190 return false; 5191 5192 // Check if masking is required. 5193 // A Group may need masking for one of two reasons: it resides in a block that 5194 // needs predication, or it was decided to use masking to deal with gaps. 5195 bool PredicatedAccessRequiresMasking = 5196 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5197 bool AccessWithGapsRequiresMasking = 5198 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5199 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5200 return true; 5201 5202 // If masked interleaving is required, we expect that the user/target had 5203 // enabled it, because otherwise it either wouldn't have been created or 5204 // it should have been invalidated by the CostModel. 5205 assert(useMaskedInterleavedAccesses(TTI) && 5206 "Masked interleave-groups for predicated accesses are not enabled."); 5207 5208 auto *Ty = getMemInstValueType(I); 5209 const Align Alignment = getLoadStoreAlignment(I); 5210 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5211 : TTI.isLegalMaskedStore(Ty, Alignment); 5212 } 5213 5214 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5215 Instruction *I, ElementCount VF) { 5216 // Get and ensure we have a valid memory instruction. 
5217 LoadInst *LI = dyn_cast<LoadInst>(I); 5218 StoreInst *SI = dyn_cast<StoreInst>(I); 5219 assert((LI || SI) && "Invalid memory instruction"); 5220 5221 auto *Ptr = getLoadStorePointerOperand(I); 5222 5223 // In order to be widened, the pointer should be consecutive, first of all. 5224 if (!Legal->isConsecutivePtr(Ptr)) 5225 return false; 5226 5227 // If the instruction is a store located in a predicated block, it will be 5228 // scalarized. 5229 if (isScalarWithPredication(I)) 5230 return false; 5231 5232 // If the instruction's allocated size doesn't equal it's type size, it 5233 // requires padding and will be scalarized. 5234 auto &DL = I->getModule()->getDataLayout(); 5235 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5236 if (hasIrregularType(ScalarTy, DL, VF)) 5237 return false; 5238 5239 return true; 5240 } 5241 5242 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5243 // We should not collect Uniforms more than once per VF. Right now, 5244 // this function is called from collectUniformsAndScalars(), which 5245 // already does this check. Collecting Uniforms for VF=1 does not make any 5246 // sense. 5247 5248 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5249 "This function should not be visited twice for the same VF"); 5250 5251 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5252 // not analyze again. Uniforms.count(VF) will return 1. 5253 Uniforms[VF].clear(); 5254 5255 // We now know that the loop is vectorizable! 5256 // Collect instructions inside the loop that will remain uniform after 5257 // vectorization. 5258 5259 // Global values, params and instructions outside of current loop are out of 5260 // scope. 5261 auto isOutOfScope = [&](Value *V) -> bool { 5262 Instruction *I = dyn_cast<Instruction>(V); 5263 return (!I || !TheLoop->contains(I)); 5264 }; 5265 5266 SetVector<Instruction *> Worklist; 5267 BasicBlock *Latch = TheLoop->getLoopLatch(); 5268 5269 // Instructions that are scalar with predication must not be considered 5270 // uniform after vectorization, because that would create an erroneous 5271 // replicating region where only a single instance out of VF should be formed. 5272 // TODO: optimize such seldom cases if found important, see PR40816. 5273 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5274 if (isOutOfScope(I)) { 5275 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5276 << *I << "\n"); 5277 return; 5278 } 5279 if (isScalarWithPredication(I, VF)) { 5280 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5281 << *I << "\n"); 5282 return; 5283 } 5284 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5285 Worklist.insert(I); 5286 }; 5287 5288 // Start with the conditional branch. If the branch condition is an 5289 // instruction contained in the loop that is only used by the branch, it is 5290 // uniform. 5291 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5292 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5293 addToWorklistIfAllowed(Cmp); 5294 5295 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5296 InstWidening WideningDecision = getWideningDecision(I, VF); 5297 assert(WideningDecision != CM_Unknown && 5298 "Widening decision should be ready at this moment"); 5299 5300 // A uniform memory op is itself uniform. We exclude uniform stores 5301 // here as they demand the last lane, not the first one. 
5302 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5303 assert(WideningDecision == CM_Scalarize); 5304 return true; 5305 } 5306 5307 return (WideningDecision == CM_Widen || 5308 WideningDecision == CM_Widen_Reverse || 5309 WideningDecision == CM_Interleave); 5310 }; 5311 5312 5313 // Returns true if Ptr is the pointer operand of a memory access instruction 5314 // I, and I is known to not require scalarization. 5315 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5316 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5317 }; 5318 5319 // Holds a list of values which are known to have at least one uniform use. 5320 // Note that there may be other uses which aren't uniform. A "uniform use" 5321 // here is something which only demands lane 0 of the unrolled iterations; 5322 // it does not imply that all lanes produce the same value (e.g. this is not 5323 // the usual meaning of uniform) 5324 SmallPtrSet<Value *, 8> HasUniformUse; 5325 5326 // Scan the loop for instructions which are either a) known to have only 5327 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5328 for (auto *BB : TheLoop->blocks()) 5329 for (auto &I : *BB) { 5330 // If there's no pointer operand, there's nothing to do. 5331 auto *Ptr = getLoadStorePointerOperand(&I); 5332 if (!Ptr) 5333 continue; 5334 5335 // A uniform memory op is itself uniform. We exclude uniform stores 5336 // here as they demand the last lane, not the first one. 5337 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5338 addToWorklistIfAllowed(&I); 5339 5340 if (isUniformDecision(&I, VF)) { 5341 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5342 HasUniformUse.insert(Ptr); 5343 } 5344 } 5345 5346 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5347 // demanding) users. Since loops are assumed to be in LCSSA form, this 5348 // disallows uses outside the loop as well. 5349 for (auto *V : HasUniformUse) { 5350 if (isOutOfScope(V)) 5351 continue; 5352 auto *I = cast<Instruction>(V); 5353 auto UsersAreMemAccesses = 5354 llvm::all_of(I->users(), [&](User *U) -> bool { 5355 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5356 }); 5357 if (UsersAreMemAccesses) 5358 addToWorklistIfAllowed(I); 5359 } 5360 5361 // Expand Worklist in topological order: whenever a new instruction 5362 // is added , its users should be already inside Worklist. It ensures 5363 // a uniform instruction will only be used by uniform instructions. 5364 unsigned idx = 0; 5365 while (idx != Worklist.size()) { 5366 Instruction *I = Worklist[idx++]; 5367 5368 for (auto OV : I->operand_values()) { 5369 // isOutOfScope operands cannot be uniform instructions. 5370 if (isOutOfScope(OV)) 5371 continue; 5372 // First order recurrence Phi's should typically be considered 5373 // non-uniform. 5374 auto *OP = dyn_cast<PHINode>(OV); 5375 if (OP && Legal->isFirstOrderRecurrence(OP)) 5376 continue; 5377 // If all the users of the operand are uniform, then add the 5378 // operand into the uniform worklist. 5379 auto *OI = cast<Instruction>(OV); 5380 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5381 auto *J = cast<Instruction>(U); 5382 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5383 })) 5384 addToWorklistIfAllowed(OI); 5385 } 5386 } 5387 5388 // For an instruction to be added into Worklist above, all its users inside 5389 // the loop should also be in Worklist. 
However, this condition cannot be 5390 // true for phi nodes that form a cyclic dependence. We must process phi 5391 // nodes separately. An induction variable will remain uniform if all users 5392 // of the induction variable and induction variable update remain uniform. 5393 // The code below handles both pointer and non-pointer induction variables. 5394 for (auto &Induction : Legal->getInductionVars()) { 5395 auto *Ind = Induction.first; 5396 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5397 5398 // Determine if all users of the induction variable are uniform after 5399 // vectorization. 5400 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5401 auto *I = cast<Instruction>(U); 5402 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5403 isVectorizedMemAccessUse(I, Ind); 5404 }); 5405 if (!UniformInd) 5406 continue; 5407 5408 // Determine if all users of the induction variable update instruction are 5409 // uniform after vectorization. 5410 auto UniformIndUpdate = 5411 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5412 auto *I = cast<Instruction>(U); 5413 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5414 isVectorizedMemAccessUse(I, IndUpdate); 5415 }); 5416 if (!UniformIndUpdate) 5417 continue; 5418 5419 // The induction variable and its update instruction will remain uniform. 5420 addToWorklistIfAllowed(Ind); 5421 addToWorklistIfAllowed(IndUpdate); 5422 } 5423 5424 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5425 } 5426 5427 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5428 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5429 5430 if (Legal->getRuntimePointerChecking()->Need) { 5431 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5432 "runtime pointer checks needed. Enable vectorization of this " 5433 "loop with '#pragma clang loop vectorize(enable)' when " 5434 "compiling with -Os/-Oz", 5435 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5436 return true; 5437 } 5438 5439 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5440 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5441 "runtime SCEV checks needed. Enable vectorization of this " 5442 "loop with '#pragma clang loop vectorize(enable)' when " 5443 "compiling with -Os/-Oz", 5444 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5445 return true; 5446 } 5447 5448 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5449 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5450 reportVectorizationFailure("Runtime stride check for small trip count", 5451 "runtime stride == 1 checks needed. Enable vectorization of " 5452 "this loop without such check by compiling with -Os/-Oz", 5453 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5454 return true; 5455 } 5456 5457 return false; 5458 } 5459 5460 Optional<ElementCount> 5461 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5462 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5463 // TODO: It may by useful to do since it's still likely to be dynamically 5464 // uniform if the target can skip. 5465 reportVectorizationFailure( 5466 "Not inserting runtime ptr check for divergent target", 5467 "runtime pointer checks needed. 
Not enabled for divergent target", 5468 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5469 return None; 5470 } 5471 5472 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5473 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5474 if (TC == 1) { 5475 reportVectorizationFailure("Single iteration (non) loop", 5476 "loop trip count is one, irrelevant for vectorization", 5477 "SingleIterationLoop", ORE, TheLoop); 5478 return None; 5479 } 5480 5481 ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); 5482 5483 switch (ScalarEpilogueStatus) { 5484 case CM_ScalarEpilogueAllowed: 5485 return MaxVF; 5486 case CM_ScalarEpilogueNotAllowedUsePredicate: 5487 LLVM_FALLTHROUGH; 5488 case CM_ScalarEpilogueNotNeededUsePredicate: 5489 LLVM_DEBUG( 5490 dbgs() << "LV: vector predicate hint/switch found.\n" 5491 << "LV: Not allowing scalar epilogue, creating predicated " 5492 << "vector loop.\n"); 5493 break; 5494 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5495 // fallthrough as a special case of OptForSize 5496 case CM_ScalarEpilogueNotAllowedOptSize: 5497 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5498 LLVM_DEBUG( 5499 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5500 else 5501 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5502 << "count.\n"); 5503 5504 // Bail if runtime checks are required, which are not good when optimising 5505 // for size. 5506 if (runtimeChecksRequired()) 5507 return None; 5508 5509 break; 5510 } 5511 5512 // The only loops we can vectorize without a scalar epilogue are loops with 5513 // a bottom-test and a single exiting block. We'd have to handle the fact 5514 // that not every instruction executes on the last iteration. This will 5515 // require a lane mask which varies through the vector loop body. (TODO) 5516 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5517 // If there was a tail-folding hint/switch, but we can't fold the tail by 5518 // masking, fall back to a vectorization with a scalar epilogue. 5519 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5520 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5521 "scalar epilogue instead.\n"); 5522 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5523 return MaxVF; 5524 } 5525 return None; 5526 } 5527 5528 // Now try tail folding. 5529 5530 // Invalidate interleave groups that require an epilogue if we can't mask 5531 // the interleave-group. 5532 if (!useMaskedInterleavedAccesses(TTI)) { 5533 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5534 "No decisions should have been taken at this point"); 5535 // Note: There is no need to invalidate any cost modeling decisions here, as 5536 // none were taken so far. 5537 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5538 } 5539 5540 assert(!MaxVF.isScalable() && 5541 "Scalable vectors do not yet support tail folding"); 5542 assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) && 5543 "MaxVF must be a power of 2"); 5544 unsigned MaxVFtimesIC = 5545 UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue(); 5546 // Avoid tail folding if the trip count is known to be a multiple of any VF we 5547 // chose.
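// Illustrative arithmetic (assumed values): with a constant trip count of 64,
// MaxVF = 8 and a user interleave count of 2, MaxVFtimesIC = 16 and
// (BackedgeTakenCount + 1) urem 16 == 0, so no tail remains and neither tail
// folding nor a scalar epilogue is required.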
5548 ScalarEvolution *SE = PSE.getSE(); 5549 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5550 const SCEV *ExitCount = SE->getAddExpr( 5551 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5552 const SCEV *Rem = SE->getURemExpr( 5553 ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5554 if (Rem->isZero()) { 5555 // Accept MaxVF if we do not have a tail. 5556 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5557 return MaxVF; 5558 } 5559 5560 // If we don't know the precise trip count, or if the trip count that we 5561 // found modulo the vectorization factor is not zero, try to fold the tail 5562 // by masking. 5563 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5564 if (Legal->prepareToFoldTailByMasking()) { 5565 FoldTailByMasking = true; 5566 return MaxVF; 5567 } 5568 5569 // If there was a tail-folding hint/switch, but we can't fold the tail by 5570 // masking, fallback to a vectorization with a scalar epilogue. 5571 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5572 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5573 "scalar epilogue instead.\n"); 5574 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5575 return MaxVF; 5576 } 5577 5578 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5579 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5580 return None; 5581 } 5582 5583 if (TC == 0) { 5584 reportVectorizationFailure( 5585 "Unable to calculate the loop count due to complex control flow", 5586 "unable to calculate the loop count due to complex control flow", 5587 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5588 return None; 5589 } 5590 5591 reportVectorizationFailure( 5592 "Cannot optimize for size and vectorize at the same time.", 5593 "cannot optimize for size and vectorize at the same time. " 5594 "Enable vectorization of this loop with '#pragma clang loop " 5595 "vectorize(enable)' when compiling with -Os/-Oz", 5596 "NoTailLoopWithOptForSize", ORE, TheLoop); 5597 return None; 5598 } 5599 5600 ElementCount 5601 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5602 ElementCount UserVF) { 5603 bool IgnoreScalableUserVF = UserVF.isScalable() && 5604 !TTI.supportsScalableVectors() && 5605 !ForceTargetSupportsScalableVectors; 5606 if (IgnoreScalableUserVF) { 5607 LLVM_DEBUG( 5608 dbgs() << "LV: Ignoring VF=" << UserVF 5609 << " because target does not support scalable vectors.\n"); 5610 ORE->emit([&]() { 5611 return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", 5612 TheLoop->getStartLoc(), 5613 TheLoop->getHeader()) 5614 << "Ignoring VF=" << ore::NV("UserVF", UserVF) 5615 << " because target does not support scalable vectors."; 5616 }); 5617 } 5618 5619 // Beyond this point two scenarios are handled. If UserVF isn't specified 5620 // then a suitable VF is chosen. If UserVF is specified and there are 5621 // dependencies, check if it's legal. However, if a UserVF is specified and 5622 // there are no dependencies, then there's nothing to do. 
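// Illustrative example (assumed values): a user hint of VF = 4 on a loop that is
// safe for any vector width is returned unchanged by the early exit below; if the
// maximum safe dependence width were 64 bits with a widest type of 32 bits, the
// same hint would instead be clamped further down to a max safe VF of
// PowerOf2Floor(64 / 32) = 2.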
5623 if (UserVF.isNonZero() && !IgnoreScalableUserVF && 5624 Legal->isSafeForAnyVectorWidth()) 5625 return UserVF; 5626 5627 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5628 unsigned SmallestType, WidestType; 5629 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5630 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5631 5632 // Get the maximum safe dependence distance in bits computed by LAA. 5633 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5634 // the memory access that is most restrictive (involved in the smallest 5635 // dependence distance). 5636 unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); 5637 5638 // If the user vectorization factor is legally unsafe, clamp it to a safe 5639 // value. Otherwise, return as is. 5640 if (UserVF.isNonZero() && !IgnoreScalableUserVF) { 5641 unsigned MaxSafeElements = 5642 PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); 5643 ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); 5644 5645 if (UserVF.isScalable()) { 5646 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5647 5648 // Scale VF by vscale before checking if it's safe. 5649 MaxSafeVF = ElementCount::getScalable( 5650 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5651 5652 if (MaxSafeVF.isZero()) { 5653 // The dependence distance is too small to use scalable vectors; 5654 // fall back on fixed-width vectors. 5655 LLVM_DEBUG( 5656 dbgs() 5657 << "LV: Max legal vector width too small, scalable vectorization " 5658 "unfeasible. Using fixed-width vectorization instead.\n"); 5659 ORE->emit([&]() { 5660 return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", 5661 TheLoop->getStartLoc(), 5662 TheLoop->getHeader()) 5663 << "Max legal vector width too small, scalable vectorization " 5664 << "unfeasible. Using fixed-width vectorization instead."; 5665 }); 5666 return computeFeasibleMaxVF( 5667 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); 5668 } 5669 } 5670 5671 LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); 5672 5673 if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) 5674 return UserVF; 5675 5676 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5677 << " is unsafe, clamping to max safe VF=" << MaxSafeVF 5678 << ".\n"); 5679 ORE->emit([&]() { 5680 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5681 TheLoop->getStartLoc(), 5682 TheLoop->getHeader()) 5683 << "User-specified vectorization factor " 5684 << ore::NV("UserVectorizationFactor", UserVF) 5685 << " is unsafe, clamping to maximum safe vectorization factor " 5686 << ore::NV("VectorizationFactor", MaxSafeVF); 5687 }); 5688 return MaxSafeVF; 5689 } 5690 5691 WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); 5692 5693 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5694 // Note that both WidestRegister and WidestType may not be powers of 2.
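// Illustrative arithmetic (assumed values, no particular target): a widest safe
// register width of 256 bits and a widest element type of 32 bits give
// MaxVectorSize = PowerOf2Floor(256 / 32) = 8 lanes; the PowerOf2Floor matters
// because WidestRegister may have been clamped above to a non-power-of-two
// dependence-distance bound.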
5695 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); 5696 5697 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5698 << " / " << WidestType << " bits.\n"); 5699 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5700 << WidestRegister << " bits.\n"); 5701 5702 assert(MaxVectorSize <= WidestRegister && 5703 "Did not expect to pack so many elements" 5704 " into one vector!"); 5705 if (MaxVectorSize == 0) { 5706 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5707 MaxVectorSize = 1; 5708 return ElementCount::getFixed(MaxVectorSize); 5709 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5710 isPowerOf2_32(ConstTripCount)) { 5711 // We need to clamp the VF to be the ConstTripCount. There is no point in 5712 // choosing a higher viable VF as done in the loop below. 5713 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5714 << ConstTripCount << "\n"); 5715 MaxVectorSize = ConstTripCount; 5716 return ElementCount::getFixed(MaxVectorSize); 5717 } 5718 5719 unsigned MaxVF = MaxVectorSize; 5720 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5721 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5722 // Collect all viable vectorization factors larger than the default MaxVF 5723 // (i.e. MaxVectorSize). 5724 SmallVector<ElementCount, 8> VFs; 5725 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5726 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5727 VFs.push_back(ElementCount::getFixed(VS)); 5728 5729 // For each VF calculate its register usage. 5730 auto RUs = calculateRegisterUsage(VFs); 5731 5732 // Select the largest VF which doesn't require more registers than existing 5733 // ones. 5734 for (int i = RUs.size() - 1; i >= 0; --i) { 5735 bool Selected = true; 5736 for (auto& pair : RUs[i].MaxLocalUsers) { 5737 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5738 if (pair.second > TargetNumRegisters) 5739 Selected = false; 5740 } 5741 if (Selected) { 5742 MaxVF = VFs[i].getKnownMinValue(); 5743 break; 5744 } 5745 } 5746 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5747 if (MaxVF < MinVF) { 5748 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5749 << ") with target's minimum: " << MinVF << '\n'); 5750 MaxVF = MinVF; 5751 } 5752 } 5753 } 5754 return ElementCount::getFixed(MaxVF); 5755 } 5756 5757 VectorizationFactor 5758 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 5759 // FIXME: This can be fixed for scalable vectors later, because at this stage 5760 // the LoopVectorizer will only consider vectorizing a loop with scalable 5761 // vectors when the loop has a hint to enable vectorization for a given VF. 5762 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 5763 5764 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5765 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5766 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5767 5768 unsigned Width = 1; 5769 const float ScalarCost = *ExpectedCost.getValue(); 5770 float Cost = ScalarCost; 5771 5772 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5773 if (ForceVectorization && MaxVF.isVector()) { 5774 // Ignore scalar width, because the user explicitly wants vectorization. 5775 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5776 // evaluation. 
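// Illustrative consequence (assumed costs): if the scalar loop costs 8 and VF = 2
// costs 18 (9 per lane, i.e. not profitable), starting from FLT_MAX below still
// lets the selection loop pick VF = 2, honouring the user's explicit request.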
5777 Cost = std::numeric_limits<float>::max(); 5778 } 5779 5780 for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) { 5781 // Notice that the vector loop needs to be executed less times, so 5782 // we need to divide the cost of the vector loops by the width of 5783 // the vector elements. 5784 VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); 5785 assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); 5786 float VectorCost = *C.first.getValue() / (float)i; 5787 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5788 << " costs: " << (int)VectorCost << ".\n"); 5789 if (!C.second && !ForceVectorization) { 5790 LLVM_DEBUG( 5791 dbgs() << "LV: Not considering vector loop of width " << i 5792 << " because it will not generate any vector instructions.\n"); 5793 continue; 5794 } 5795 5796 // If profitable add it to ProfitableVF list. 5797 if (VectorCost < ScalarCost) { 5798 ProfitableVFs.push_back(VectorizationFactor( 5799 {ElementCount::getFixed(i), (unsigned)VectorCost})); 5800 } 5801 5802 if (VectorCost < Cost) { 5803 Cost = VectorCost; 5804 Width = i; 5805 } 5806 } 5807 5808 if (!EnableCondStoresVectorization && NumPredStores) { 5809 reportVectorizationFailure("There are conditional stores.", 5810 "store that is conditionally executed prevents vectorization", 5811 "ConditionalStore", ORE, TheLoop); 5812 Width = 1; 5813 Cost = ScalarCost; 5814 } 5815 5816 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5817 << "LV: Vectorization seems to be not beneficial, " 5818 << "but was forced by a user.\n"); 5819 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5820 VectorizationFactor Factor = {ElementCount::getFixed(Width), 5821 (unsigned)(Width * Cost)}; 5822 return Factor; 5823 } 5824 5825 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5826 const Loop &L, ElementCount VF) const { 5827 // Cross iteration phis such as reductions need special handling and are 5828 // currently unsupported. 5829 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5830 return Legal->isFirstOrderRecurrence(&Phi) || 5831 Legal->isReductionVariable(&Phi); 5832 })) 5833 return false; 5834 5835 // Phis with uses outside of the loop require special handling and are 5836 // currently unsupported. 5837 for (auto &Entry : Legal->getInductionVars()) { 5838 // Look for uses of the value of the induction at the last iteration. 5839 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5840 for (User *U : PostInc->users()) 5841 if (!L.contains(cast<Instruction>(U))) 5842 return false; 5843 // Look for uses of penultimate value of the induction. 5844 for (User *U : Entry.first->users()) 5845 if (!L.contains(cast<Instruction>(U))) 5846 return false; 5847 } 5848 5849 // Induction variables that are widened require special handling that is 5850 // currently not supported. 5851 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5852 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5853 this->isProfitableToScalarize(Entry.first, VF)); 5854 })) 5855 return false; 5856 5857 return true; 5858 } 5859 5860 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5861 const ElementCount VF) const { 5862 // FIXME: We need a much better cost-model to take different parameters such 5863 // as register pressure, code size increase and cost of extra branches into 5864 // account. 
For now we apply a very crude heuristic and only consider loops 5865 // with vectorization factors larger than a certain value. 5866 // We also consider epilogue vectorization unprofitable for targets that don't 5867 // consider interleaving beneficial (eg. MVE). 5868 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5869 return false; 5870 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5871 return true; 5872 return false; 5873 } 5874 5875 VectorizationFactor 5876 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5877 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5878 VectorizationFactor Result = VectorizationFactor::Disabled(); 5879 if (!EnableEpilogueVectorization) { 5880 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5881 return Result; 5882 } 5883 5884 if (!isScalarEpilogueAllowed()) { 5885 LLVM_DEBUG( 5886 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5887 "allowed.\n";); 5888 return Result; 5889 } 5890 5891 // FIXME: This can be fixed for scalable vectors later, because at this stage 5892 // the LoopVectorizer will only consider vectorizing a loop with scalable 5893 // vectors when the loop has a hint to enable vectorization for a given VF. 5894 if (MainLoopVF.isScalable()) { 5895 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 5896 "yet supported.\n"); 5897 return Result; 5898 } 5899 5900 // Not really a cost consideration, but check for unsupported cases here to 5901 // simplify the logic. 5902 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5903 LLVM_DEBUG( 5904 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5905 "not a supported candidate.\n";); 5906 return Result; 5907 } 5908 5909 if (EpilogueVectorizationForceVF > 1) { 5910 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5911 if (LVP.hasPlanWithVFs( 5912 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 5913 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 5914 else { 5915 LLVM_DEBUG( 5916 dbgs() 5917 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5918 return Result; 5919 } 5920 } 5921 5922 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5923 TheLoop->getHeader()->getParent()->hasMinSize()) { 5924 LLVM_DEBUG( 5925 dbgs() 5926 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5927 return Result; 5928 } 5929 5930 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 5931 return Result; 5932 5933 for (auto &NextVF : ProfitableVFs) 5934 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 5935 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && 5936 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 5937 Result = NextVF; 5938 5939 if (Result != VectorizationFactor::Disabled()) 5940 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5941 << Result.Width.getFixedValue() << "\n";); 5942 return Result; 5943 } 5944 5945 std::pair<unsigned, unsigned> 5946 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5947 unsigned MinWidth = -1U; 5948 unsigned MaxWidth = 8; 5949 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5950 5951 // For each block. 5952 for (BasicBlock *BB : TheLoop->blocks()) { 5953 // For each instruction in the loop. 5954 for (Instruction &I : BB->instructionsWithoutDebug()) { 5955 Type *T = I.getType(); 5956 5957 // Skip ignored values. 
5958 if (ValuesToIgnore.count(&I)) 5959 continue; 5960 5961 // Only examine Loads, Stores and PHINodes. 5962 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5963 continue; 5964 5965 // Examine PHI nodes that are reduction variables. Update the type to 5966 // account for the recurrence type. 5967 if (auto *PN = dyn_cast<PHINode>(&I)) { 5968 if (!Legal->isReductionVariable(PN)) 5969 continue; 5970 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5971 T = RdxDesc.getRecurrenceType(); 5972 } 5973 5974 // Examine the stored values. 5975 if (auto *ST = dyn_cast<StoreInst>(&I)) 5976 T = ST->getValueOperand()->getType(); 5977 5978 // Ignore loaded pointer types and stored pointer types that are not 5979 // vectorizable. 5980 // 5981 // FIXME: The check here attempts to predict whether a load or store will 5982 // be vectorized. We only know this for certain after a VF has 5983 // been selected. Here, we assume that if an access can be 5984 // vectorized, it will be. We should also look at extending this 5985 // optimization to non-pointer types. 5986 // 5987 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5988 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5989 continue; 5990 5991 MinWidth = std::min(MinWidth, 5992 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5993 MaxWidth = std::max(MaxWidth, 5994 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5995 } 5996 } 5997 5998 return {MinWidth, MaxWidth}; 5999 } 6000 6001 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6002 unsigned LoopCost) { 6003 // -- The interleave heuristics -- 6004 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6005 // There are many micro-architectural considerations that we can't predict 6006 // at this level. For example, frontend pressure (on decode or fetch) due to 6007 // code size, or the number and capabilities of the execution ports. 6008 // 6009 // We use the following heuristics to select the interleave count: 6010 // 1. If the code has reductions, then we interleave to break the cross 6011 // iteration dependency. 6012 // 2. If the loop is really small, then we interleave to reduce the loop 6013 // overhead. 6014 // 3. We don't interleave if we think that we will spill registers to memory 6015 // due to the increased register pressure. 6016 6017 if (!isScalarEpilogueAllowed()) 6018 return 1; 6019 6020 // We used the distance for the interleave count. 6021 if (Legal->getMaxSafeDepDistBytes() != -1U) 6022 return 1; 6023 6024 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6025 const bool HasReductions = !Legal->getReductionVars().empty(); 6026 // Do not interleave loops with a relatively small known or estimated trip 6027 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6028 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6029 // because with the above conditions interleaving can expose ILP and break 6030 // cross iteration dependences for reductions. 6031 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6032 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6033 return 1; 6034 6035 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6036 // We divide by these constants so assume that we have at least one 6037 // instruction that uses at least one register. 
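// Illustrative register-pressure arithmetic for the formula documented below
// (assumed values): 32 target registers, 2 loop-invariant values and 4 max local
// users give PowerOf2Floor((32 - 2) / 4) = 4; with the induction-variable
// heuristic the estimate becomes PowerOf2Floor((32 - 2 - 1) / max(1, 4 - 1)) = 8.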
6038 for (auto& pair : R.MaxLocalUsers) { 6039 pair.second = std::max(pair.second, 1U); 6040 } 6041 6042 // We calculate the interleave count using the following formula. 6043 // Subtract the number of loop invariants from the number of available 6044 // registers. These registers are used by all of the interleaved instances. 6045 // Next, divide the remaining registers by the number of registers that is 6046 // required by the loop, in order to estimate how many parallel instances 6047 // fit without causing spills. All of this is rounded down if necessary to be 6048 // a power of two. We want power of two interleave count to simplify any 6049 // addressing operations or alignment considerations. 6050 // We also want power of two interleave counts to ensure that the induction 6051 // variable of the vector loop wraps to zero, when tail is folded by masking; 6052 // this currently happens when OptForSize, in which case IC is set to 1 above. 6053 unsigned IC = UINT_MAX; 6054 6055 for (auto& pair : R.MaxLocalUsers) { 6056 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6057 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6058 << " registers of " 6059 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6060 if (VF.isScalar()) { 6061 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6062 TargetNumRegisters = ForceTargetNumScalarRegs; 6063 } else { 6064 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6065 TargetNumRegisters = ForceTargetNumVectorRegs; 6066 } 6067 unsigned MaxLocalUsers = pair.second; 6068 unsigned LoopInvariantRegs = 0; 6069 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6070 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6071 6072 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6073 // Don't count the induction variable as interleaved. 6074 if (EnableIndVarRegisterHeur) { 6075 TmpIC = 6076 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6077 std::max(1U, (MaxLocalUsers - 1))); 6078 } 6079 6080 IC = std::min(IC, TmpIC); 6081 } 6082 6083 // Clamp the interleave ranges to reasonable counts. 6084 unsigned MaxInterleaveCount = 6085 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6086 6087 // Check if the user has overridden the max. 6088 if (VF.isScalar()) { 6089 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6090 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6091 } else { 6092 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6093 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6094 } 6095 6096 // If trip count is known or estimated compile time constant, limit the 6097 // interleave count to be less than the trip count divided by VF, provided it 6098 // is at least 1. 6099 // 6100 // For scalable vectors we can't know if interleaving is beneficial. It may 6101 // not be beneficial for small loops if none of the lanes in the second vector 6102 // iterations is enabled. However, for larger loops, there is likely to be a 6103 // similar benefit as for fixed-width vectors. For now, we choose to leave 6104 // the InterleaveCount as if vscale is '1', although if some information about 6105 // the vector is known (e.g. min vector size), we can make a better decision. 6106 if (BestKnownTC) { 6107 MaxInterleaveCount = 6108 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6109 // Make sure MaxInterleaveCount is greater than 0. 
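// Illustrative clamp (assumed values): an estimated trip count of 24 with VF = 8
// limits MaxInterleaveCount to 24 / 8 = 3 (if the target allowed more); a trip
// count smaller than VF would give 0, which the std::max below turns back into 1.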
6110 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6111 } 6112 6113 assert(MaxInterleaveCount > 0 && 6114 "Maximum interleave count must be greater than 0"); 6115 6116 // Clamp the calculated IC to be between the 1 and the max interleave count 6117 // that the target and trip count allows. 6118 if (IC > MaxInterleaveCount) 6119 IC = MaxInterleaveCount; 6120 else 6121 // Make sure IC is greater than 0. 6122 IC = std::max(1u, IC); 6123 6124 assert(IC > 0 && "Interleave count must be greater than 0."); 6125 6126 // If we did not calculate the cost for VF (because the user selected the VF) 6127 // then we calculate the cost of VF here. 6128 if (LoopCost == 0) { 6129 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6130 LoopCost = *expectedCost(VF).first.getValue(); 6131 } 6132 6133 assert(LoopCost && "Non-zero loop cost expected"); 6134 6135 // Interleave if we vectorized this loop and there is a reduction that could 6136 // benefit from interleaving. 6137 if (VF.isVector() && HasReductions) { 6138 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6139 return IC; 6140 } 6141 6142 // Note that if we've already vectorized the loop we will have done the 6143 // runtime check and so interleaving won't require further checks. 6144 bool InterleavingRequiresRuntimePointerCheck = 6145 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6146 6147 // We want to interleave small loops in order to reduce the loop overhead and 6148 // potentially expose ILP opportunities. 6149 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6150 << "LV: IC is " << IC << '\n' 6151 << "LV: VF is " << VF << '\n'); 6152 const bool AggressivelyInterleaveReductions = 6153 TTI.enableAggressiveInterleaving(HasReductions); 6154 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6155 // We assume that the cost overhead is 1 and we use the cost model 6156 // to estimate the cost of the loop and interleave until the cost of the 6157 // loop overhead is about 5% of the cost of the loop. 6158 unsigned SmallIC = 6159 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6160 6161 // Interleave until store/load ports (estimated by max interleave count) are 6162 // saturated. 6163 unsigned NumStores = Legal->getNumStores(); 6164 unsigned NumLoads = Legal->getNumLoads(); 6165 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6166 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6167 6168 // If we have a scalar reduction (vector reductions are already dealt with 6169 // by this point), we can increase the critical path length if the loop 6170 // we're interleaving is inside another loop. Limit, by default to 2, so the 6171 // critical path only gets increased by one reduction operation. 6172 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6173 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6174 SmallIC = std::min(SmallIC, F); 6175 StoresIC = std::min(StoresIC, F); 6176 LoadsIC = std::min(LoadsIC, F); 6177 } 6178 6179 if (EnableLoadStoreRuntimeInterleave && 6180 std::max(StoresIC, LoadsIC) > SmallIC) { 6181 LLVM_DEBUG( 6182 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6183 return std::max(StoresIC, LoadsIC); 6184 } 6185 6186 // If there are scalar reductions and TTI has enabled aggressive 6187 // interleaving for reductions, we will interleave to expose ILP. 
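// Illustrative small-loop walk-through (assumed costs, loop depth 1): IC = 4,
// LoopCost = 6 and SmallLoopCost = 20 give SmallIC = min(4, PowerOf2Floor(20 / 6))
// = 2; with one store and two loads, StoresIC = 4 and LoadsIC = 2, so saturating
// load/store ports (when enabled) returned max(4, 2) = 4 above, and otherwise the
// branch below returns either max(IC / 2, SmallIC) = 2 for aggressively
// interleaved scalar reductions or simply SmallIC = 2.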
6188 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6189 AggressivelyInterleaveReductions) { 6190 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6191 // Interleave no less than SmallIC but not as aggressive as the normal IC 6192 // to satisfy the rare situation when resources are too limited. 6193 return std::max(IC / 2, SmallIC); 6194 } else { 6195 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6196 return SmallIC; 6197 } 6198 } 6199 6200 // Interleave if this is a large loop (small loops are already dealt with by 6201 // this point) that could benefit from interleaving. 6202 if (AggressivelyInterleaveReductions) { 6203 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6204 return IC; 6205 } 6206 6207 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6208 return 1; 6209 } 6210 6211 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6212 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6213 // This function calculates the register usage by measuring the highest number 6214 // of values that are alive at a single location. Obviously, this is a very 6215 // rough estimation. We scan the loop in a topological order and 6216 // assign a number to each instruction. We use RPO to ensure that defs are 6217 // met before their users. We assume that each instruction that has in-loop 6218 // users starts an interval. We record every time that an in-loop value is 6219 // used, so we have a list of the first and last occurrences of each 6220 // instruction. Next, we transpose this data structure into a multi map that 6221 // holds the list of intervals that *end* at a specific location. This multi 6222 // map allows us to perform a linear search. We scan the instructions linearly 6223 // and record each time that a new interval starts, by placing it in a set. 6224 // If we find this value in the multi-map then we remove it from the set. 6225 // The max register usage is the maximum size of the set. 6226 // We also search for instructions that are defined outside the loop, but are 6227 // used inside the loop. We need this number separately from the max-interval 6228 // usage number because when we unroll, loop-invariant values do not take 6229 // more registers. 6230 LoopBlocksDFS DFS(TheLoop); 6231 DFS.perform(LI); 6232 6233 RegisterUsage RU; 6234 6235 // Each 'key' in the map opens a new interval. The values 6236 // of the map are the index of the 'last seen' usage of the 6237 // instruction that is the key. 6238 using IntervalMap = DenseMap<Instruction *, unsigned>; 6239 6240 // Maps instruction to its index. 6241 SmallVector<Instruction *, 64> IdxToInstr; 6242 // Marks the end of each interval. 6243 IntervalMap EndPoint; 6244 // Saves the set of instructions that are used inside the loop. 6245 SmallPtrSet<Instruction *, 8> Ends; 6246 // Saves the list of values that are used in the loop but are 6247 // defined outside the loop, such as arguments and constants. 6248 SmallPtrSet<Value *, 8> LoopInvariants; 6249 6250 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6251 for (Instruction &I : BB->instructionsWithoutDebug()) { 6252 IdxToInstr.push_back(&I); 6253 6254 // Save the end location of each USE. 6255 for (Value *U : I.operands()) { 6256 auto *Instr = dyn_cast<Instruction>(U); 6257 6258 // Ignore non-instruction values such as arguments, constants, etc. 6259 if (!Instr) 6260 continue; 6261 6262 // If this instruction is outside the loop then record it and continue.
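// Illustrative note: a value computed in the loop preheader (for example a base
// address) and used inside the loop is recorded once in LoopInvariants below
// rather than opening a per-iteration interval, since unrolling does not
// replicate loop-invariant values.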
6263 if (!TheLoop->contains(Instr)) { 6264 LoopInvariants.insert(Instr); 6265 continue; 6266 } 6267 6268 // Overwrite previous end points. 6269 EndPoint[Instr] = IdxToInstr.size(); 6270 Ends.insert(Instr); 6271 } 6272 } 6273 } 6274 6275 // Saves the list of intervals that end with the index in 'key'. 6276 using InstrList = SmallVector<Instruction *, 2>; 6277 DenseMap<unsigned, InstrList> TransposeEnds; 6278 6279 // Transpose the EndPoints to a list of values that end at each index. 6280 for (auto &Interval : EndPoint) 6281 TransposeEnds[Interval.second].push_back(Interval.first); 6282 6283 SmallPtrSet<Instruction *, 8> OpenIntervals; 6284 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6285 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6286 6287 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6288 6289 // A lambda that gets the register usage for the given type and VF. 6290 const auto &TTICapture = TTI; 6291 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6292 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6293 return 0U; 6294 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6295 }; 6296 6297 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6298 Instruction *I = IdxToInstr[i]; 6299 6300 // Remove all of the instructions that end at this location. 6301 InstrList &List = TransposeEnds[i]; 6302 for (Instruction *ToRemove : List) 6303 OpenIntervals.erase(ToRemove); 6304 6305 // Ignore instructions that are never used within the loop. 6306 if (!Ends.count(I)) 6307 continue; 6308 6309 // Skip ignored values. 6310 if (ValuesToIgnore.count(I)) 6311 continue; 6312 6313 // For each VF find the maximum usage of registers. 6314 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6315 // Count the number of live intervals. 6316 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6317 6318 if (VFs[j].isScalar()) { 6319 for (auto Inst : OpenIntervals) { 6320 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6321 if (RegUsage.find(ClassID) == RegUsage.end()) 6322 RegUsage[ClassID] = 1; 6323 else 6324 RegUsage[ClassID] += 1; 6325 } 6326 } else { 6327 collectUniformsAndScalars(VFs[j]); 6328 for (auto Inst : OpenIntervals) { 6329 // Skip ignored values for VF > 1. 6330 if (VecValuesToIgnore.count(Inst)) 6331 continue; 6332 if (isScalarAfterVectorization(Inst, VFs[j])) { 6333 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6334 if (RegUsage.find(ClassID) == RegUsage.end()) 6335 RegUsage[ClassID] = 1; 6336 else 6337 RegUsage[ClassID] += 1; 6338 } else { 6339 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6340 if (RegUsage.find(ClassID) == RegUsage.end()) 6341 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6342 else 6343 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6344 } 6345 } 6346 } 6347 6348 for (auto& pair : RegUsage) { 6349 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6350 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6351 else 6352 MaxUsages[j][pair.first] = pair.second; 6353 } 6354 } 6355 6356 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6357 << OpenIntervals.size() << '\n'); 6358 6359 // Add the current instruction to the list of open intervals. 
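// Illustrative interval scan (hypothetical body, addresses and induction ignored):
// for t1 = load a[i]; t2 = load b[i]; store (t1 + t2) to c[i], both loaded values
// are still open when the add is visited, so the maximum recorded usage for the
// vector register class is 2.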
6360 OpenIntervals.insert(I); 6361 } 6362 6363 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6364 SmallMapVector<unsigned, unsigned, 4> Invariant; 6365 6366 for (auto Inst : LoopInvariants) { 6367 unsigned Usage = 6368 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6369 unsigned ClassID = 6370 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6371 if (Invariant.find(ClassID) == Invariant.end()) 6372 Invariant[ClassID] = Usage; 6373 else 6374 Invariant[ClassID] += Usage; 6375 } 6376 6377 LLVM_DEBUG({ 6378 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6379 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6380 << " item\n"; 6381 for (const auto &pair : MaxUsages[i]) { 6382 dbgs() << "LV(REG): RegisterClass: " 6383 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6384 << " registers\n"; 6385 } 6386 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6387 << " item\n"; 6388 for (const auto &pair : Invariant) { 6389 dbgs() << "LV(REG): RegisterClass: " 6390 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6391 << " registers\n"; 6392 } 6393 }); 6394 6395 RU.LoopInvariantRegs = Invariant; 6396 RU.MaxLocalUsers = MaxUsages[i]; 6397 RUs[i] = RU; 6398 } 6399 6400 return RUs; 6401 } 6402 6403 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) { 6404 // TODO: Cost model for emulated masked load/store is completely 6405 // broken. This hack guides the cost model to use an artificially 6406 // high enough value to practically disable vectorization with such 6407 // operations, except where the previously deployed legality hack allowed 6408 // using very low cost values. This is to avoid regressions coming simply 6409 // from moving the "masked load/store" check from legality to the cost model. 6410 // Masked load/gather emulation was previously never allowed. 6411 // A limited amount of masked store/scatter emulation was allowed. 6412 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 6413 return isa<LoadInst>(I) || 6414 (isa<StoreInst>(I) && 6415 NumPredStores > NumberOfStoresToPredicate); 6416 } 6417 6418 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6419 // If we aren't vectorizing the loop, or if we've already collected the 6420 // instructions to scalarize, there's nothing to do. Collection may already 6421 // have occurred if we have a user-selected VF and are now computing the 6422 // expected cost for interleaving. 6423 if (VF.isScalar() || VF.isZero() || 6424 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6425 return; 6426 6427 // Initialize a mapping for VF in InstsToScalarize. If we find that it's 6428 // not profitable to scalarize any instructions, the presence of VF in the 6429 // map will indicate that we've analyzed it already. 6430 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6431 6432 // Find all the instructions that are scalar with predication in the loop and 6433 // determine if it would be better to not if-convert the blocks they are in. 6434 // If so, we also record the instructions to scalarize. 6435 for (BasicBlock *BB : TheLoop->blocks()) { 6436 if (!blockNeedsPredication(BB)) 6437 continue; 6438 for (Instruction &I : *BB) 6439 if (isScalarWithPredication(&I)) { 6440 ScalarCostsTy ScalarCosts; 6441 // Do not apply discount logic if hacked cost is needed 6442 // for emulated masked memrefs.
6443 if (!useEmulatedMaskMemRefHack(&I) && 6444 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6445 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6446 // Remember that BB will remain after vectorization. 6447 PredicatedBBsAfterVectorization.insert(BB); 6448 } 6449 } 6450 } 6451 6452 int LoopVectorizationCostModel::computePredInstDiscount( 6453 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6454 assert(!isUniformAfterVectorization(PredInst, VF) && 6455 "Instruction marked uniform-after-vectorization will be predicated"); 6456 6457 // Initialize the discount to zero, meaning that the scalar version and the 6458 // vector version cost the same. 6459 InstructionCost Discount = 0; 6460 6461 // Holds instructions to analyze. The instructions we visit are mapped in 6462 // ScalarCosts. Those instructions are the ones that would be scalarized if 6463 // we find that the scalar version costs less. 6464 SmallVector<Instruction *, 8> Worklist; 6465 6466 // Returns true if the given instruction can be scalarized. 6467 auto canBeScalarized = [&](Instruction *I) -> bool { 6468 // We only attempt to scalarize instructions forming a single-use chain 6469 // from the original predicated block that would otherwise be vectorized. 6470 // Although not strictly necessary, we give up on instructions we know will 6471 // already be scalar to avoid traversing chains that are unlikely to be 6472 // beneficial. 6473 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6474 isScalarAfterVectorization(I, VF)) 6475 return false; 6476 6477 // If the instruction is scalar with predication, it will be analyzed 6478 // separately. We ignore it within the context of PredInst. 6479 if (isScalarWithPredication(I)) 6480 return false; 6481 6482 // If any of the instruction's operands are uniform after vectorization, 6483 // the instruction cannot be scalarized. This prevents, for example, a 6484 // masked load from being scalarized. 6485 // 6486 // We assume we will only emit a value for lane zero of an instruction 6487 // marked uniform after vectorization, rather than VF identical values. 6488 // Thus, if we scalarize an instruction that uses a uniform, we would 6489 // create uses of values corresponding to the lanes we aren't emitting code 6490 // for. This behavior can be changed by allowing getScalarValue to clone 6491 // the lane zero values for uniforms rather than asserting. 6492 for (Use &U : I->operands()) 6493 if (auto *J = dyn_cast<Instruction>(U.get())) 6494 if (isUniformAfterVectorization(J, VF)) 6495 return false; 6496 6497 // Otherwise, we can scalarize the instruction. 6498 return true; 6499 }; 6500 6501 // Compute the expected cost discount from scalarizing the entire expression 6502 // feeding the predicated instruction. We currently only consider expressions 6503 // that are single-use instruction chains. 6504 Worklist.push_back(PredInst); 6505 while (!Worklist.empty()) { 6506 Instruction *I = Worklist.pop_back_val(); 6507 6508 // If we've already analyzed the instruction, there's nothing to do. 6509 if (ScalarCosts.find(I) != ScalarCosts.end()) 6510 continue; 6511 6512 // Compute the cost of the vector instruction. Note that this cost already 6513 // includes the scalarization overhead of the predicated instruction. 6514 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6515 6516 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6517 // the instruction as if it wasn't if-converted and instead remained in the 6518 // predicated block. We will scale this cost by block probability after 6519 // computing the scalarization overhead. 6520 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6521 InstructionCost ScalarCost = 6522 VF.getKnownMinValue() * 6523 getInstructionCost(I, ElementCount::getFixed(1)).first; 6524 6525 // Compute the scalarization overhead of needed insertelement instructions 6526 // and phi nodes. 6527 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6528 ScalarCost += TTI.getScalarizationOverhead( 6529 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6530 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6531 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6532 ScalarCost += 6533 VF.getKnownMinValue() * 6534 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6535 } 6536 6537 // Compute the scalarization overhead of needed extractelement 6538 // instructions. For each of the instruction's operands, if the operand can 6539 // be scalarized, add it to the worklist; otherwise, account for the 6540 // overhead. 6541 for (Use &U : I->operands()) 6542 if (auto *J = dyn_cast<Instruction>(U.get())) { 6543 assert(VectorType::isValidElementType(J->getType()) && 6544 "Instruction has non-scalar type"); 6545 if (canBeScalarized(J)) 6546 Worklist.push_back(J); 6547 else if (needsExtract(J, VF)) { 6548 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6549 ScalarCost += TTI.getScalarizationOverhead( 6550 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6551 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6552 } 6553 } 6554 6555 // Scale the total scalar cost by block probability. 6556 ScalarCost /= getReciprocalPredBlockProb(); 6557 6558 // Compute the discount. A non-negative discount means the vector version 6559 // of the instruction costs more, and scalarizing would be beneficial. 6560 Discount += VectorCost - ScalarCost; 6561 ScalarCosts[I] = ScalarCost; 6562 } 6563 6564 return *Discount.getValue(); 6565 } 6566 6567 LoopVectorizationCostModel::VectorizationCostTy 6568 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6569 VectorizationCostTy Cost; 6570 6571 // For each block. 6572 for (BasicBlock *BB : TheLoop->blocks()) { 6573 VectorizationCostTy BlockCost; 6574 6575 // For each instruction in the old loop. 6576 for (Instruction &I : BB->instructionsWithoutDebug()) { 6577 // Skip ignored values. 6578 if (ValuesToIgnore.count(&I) || 6579 (VF.isVector() && VecValuesToIgnore.count(&I))) 6580 continue; 6581 6582 VectorizationCostTy C = getInstructionCost(&I, VF); 6583 6584 // Check if we should override the cost. 6585 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6586 C.first = InstructionCost(ForceTargetInstructionCost); 6587 6588 BlockCost.first += C.first; 6589 BlockCost.second |= C.second; 6590 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6591 << " for VF " << VF << " For instruction: " << I 6592 << '\n'); 6593 } 6594 6595 // If we are vectorizing a predicated block, it will have been 6596 // if-converted. This means that the block's instructions (aside from 6597 // stores and instructions that may divide by zero) will now be 6598 // unconditionally executed. For the scalar case, we may not always execute 6599 // the predicated block, if it is an if-else block. Thus, scale the block's 6600 // cost by the probability of executing it. 
blockNeedsPredication from 6601 // Legal is used so as to not include all blocks in tail folded loops. 6602 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6603 BlockCost.first /= getReciprocalPredBlockProb(); 6604 6605 Cost.first += BlockCost.first; 6606 Cost.second |= BlockCost.second; 6607 } 6608 6609 return Cost; 6610 } 6611 6612 /// Gets Address Access SCEV after verifying that the access pattern 6613 /// is loop invariant except the induction variable dependence. 6614 /// 6615 /// This SCEV can be sent to the Target in order to estimate the address 6616 /// calculation cost. 6617 static const SCEV *getAddressAccessSCEV( 6618 Value *Ptr, 6619 LoopVectorizationLegality *Legal, 6620 PredicatedScalarEvolution &PSE, 6621 const Loop *TheLoop) { 6622 6623 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6624 if (!Gep) 6625 return nullptr; 6626 6627 // We are looking for a gep with all loop invariant indices except for one 6628 // which should be an induction variable. 6629 auto SE = PSE.getSE(); 6630 unsigned NumOperands = Gep->getNumOperands(); 6631 for (unsigned i = 1; i < NumOperands; ++i) { 6632 Value *Opd = Gep->getOperand(i); 6633 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6634 !Legal->isInductionVariable(Opd)) 6635 return nullptr; 6636 } 6637 6638 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6639 return PSE.getSCEV(Ptr); 6640 } 6641 6642 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6643 return Legal->hasStride(I->getOperand(0)) || 6644 Legal->hasStride(I->getOperand(1)); 6645 } 6646 6647 unsigned 6648 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6649 ElementCount VF) { 6650 assert(VF.isVector() && 6651 "Scalarization cost of instruction implies vectorization."); 6652 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6653 Type *ValTy = getMemInstValueType(I); 6654 auto SE = PSE.getSE(); 6655 6656 unsigned AS = getLoadStoreAddressSpace(I); 6657 Value *Ptr = getLoadStorePointerOperand(I); 6658 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6659 6660 // Figure out whether the access is strided and get the stride value 6661 // if it's known in compile time 6662 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6663 6664 // Get the cost of the scalar memory instruction and address computation. 6665 unsigned Cost = 6666 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6667 6668 // Don't pass *I here, since it is scalar but will actually be part of a 6669 // vectorized loop where the user of it is a vectorized instruction. 6670 const Align Alignment = getLoadStoreAlignment(I); 6671 Cost += VF.getKnownMinValue() * 6672 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6673 AS, TTI::TCK_RecipThroughput); 6674 6675 // Get the overhead of the extractelement and insertelement instructions 6676 // we might create due to scalarization. 6677 Cost += getScalarizationOverhead(I, VF); 6678 6679 // If we have a predicated store, it may not be executed for each vector 6680 // lane. Scale the cost by the probability of executing the predicated 6681 // block. 6682 if (isPredicatedInst(I)) { 6683 Cost /= getReciprocalPredBlockProb(); 6684 6685 if (useEmulatedMaskMemRefHack(I)) 6686 // Artificially setting to a high enough value to practically disable 6687 // vectorization with such operations. 
6688 Cost = 3000000; 6689 } 6690 6691 return Cost; 6692 } 6693 6694 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6695 ElementCount VF) { 6696 Type *ValTy = getMemInstValueType(I); 6697 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6698 Value *Ptr = getLoadStorePointerOperand(I); 6699 unsigned AS = getLoadStoreAddressSpace(I); 6700 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6701 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6702 6703 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6704 "Stride should be 1 or -1 for consecutive memory access"); 6705 const Align Alignment = getLoadStoreAlignment(I); 6706 unsigned Cost = 0; 6707 if (Legal->isMaskRequired(I)) 6708 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6709 CostKind); 6710 else 6711 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6712 CostKind, I); 6713 6714 bool Reverse = ConsecutiveStride < 0; 6715 if (Reverse) 6716 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6717 return Cost; 6718 } 6719 6720 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6721 ElementCount VF) { 6722 assert(Legal->isUniformMemOp(*I)); 6723 6724 Type *ValTy = getMemInstValueType(I); 6725 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6726 const Align Alignment = getLoadStoreAlignment(I); 6727 unsigned AS = getLoadStoreAddressSpace(I); 6728 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6729 if (isa<LoadInst>(I)) { 6730 return TTI.getAddressComputationCost(ValTy) + 6731 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6732 CostKind) + 6733 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6734 } 6735 StoreInst *SI = cast<StoreInst>(I); 6736 6737 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6738 return TTI.getAddressComputationCost(ValTy) + 6739 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6740 CostKind) + 6741 (isLoopInvariantStoreValue 6742 ? 0 6743 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6744 VF.getKnownMinValue() - 1)); 6745 } 6746 6747 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6748 ElementCount VF) { 6749 Type *ValTy = getMemInstValueType(I); 6750 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6751 const Align Alignment = getLoadStoreAlignment(I); 6752 const Value *Ptr = getLoadStorePointerOperand(I); 6753 6754 return TTI.getAddressComputationCost(VectorTy) + 6755 TTI.getGatherScatterOpCost( 6756 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6757 TargetTransformInfo::TCK_RecipThroughput, I); 6758 } 6759 6760 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6761 ElementCount VF) { 6762 Type *ValTy = getMemInstValueType(I); 6763 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6764 unsigned AS = getLoadStoreAddressSpace(I); 6765 6766 auto Group = getInterleavedAccessGroup(I); 6767 assert(Group && "Fail to get an interleaved access group."); 6768 6769 unsigned InterleaveFactor = Group->getFactor(); 6770 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6771 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6772 6773 // Holds the indices of existing members in an interleaved load group. 6774 // An interleaved store group doesn't need this as it doesn't allow gaps. 
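// For example (illustrative only): a factor-3 load group that accesses
// A[3*i] and A[3*i + 2] has no member at index 1, so Indices becomes
// {0, 2}; the target can then price the wide load plus only the shuffles
// that extract the live members.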
6775 SmallVector<unsigned, 4> Indices; 6776 if (isa<LoadInst>(I)) { 6777 for (unsigned i = 0; i < InterleaveFactor; i++) 6778 if (Group->getMember(i)) 6779 Indices.push_back(i); 6780 } 6781 6782 // Calculate the cost of the whole interleaved group. 6783 bool UseMaskForGaps = 6784 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6785 unsigned Cost = TTI.getInterleavedMemoryOpCost( 6786 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6787 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6788 6789 if (Group->isReverse()) { 6790 // TODO: Add support for reversed masked interleaved access. 6791 assert(!Legal->isMaskRequired(I) && 6792 "Reverse masked interleaved access not supported."); 6793 Cost += Group->getNumMembers() * 6794 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6795 } 6796 return Cost; 6797 } 6798 6799 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6800 ElementCount VF) { 6801 // Calculate scalar cost only. Vectorization cost should be ready at this 6802 // moment. 6803 if (VF.isScalar()) { 6804 Type *ValTy = getMemInstValueType(I); 6805 const Align Alignment = getLoadStoreAlignment(I); 6806 unsigned AS = getLoadStoreAddressSpace(I); 6807 6808 return TTI.getAddressComputationCost(ValTy) + 6809 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6810 TTI::TCK_RecipThroughput, I); 6811 } 6812 return getWideningCost(I, VF); 6813 } 6814 6815 LoopVectorizationCostModel::VectorizationCostTy 6816 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6817 ElementCount VF) { 6818 // If we know that this instruction will remain uniform, check the cost of 6819 // the scalar version. 6820 if (isUniformAfterVectorization(I, VF)) 6821 VF = ElementCount::getFixed(1); 6822 6823 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6824 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6825 6826 // Forced scalars do not have any scalarization overhead. 6827 auto ForcedScalar = ForcedScalars.find(VF); 6828 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6829 auto InstSet = ForcedScalar->second; 6830 if (InstSet.count(I)) 6831 return VectorizationCostTy( 6832 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6833 VF.getKnownMinValue()), 6834 false); 6835 } 6836 6837 Type *VectorTy; 6838 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6839 6840 bool TypeNotScalarized = 6841 VF.isVector() && VectorTy->isVectorTy() && 6842 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 6843 return VectorizationCostTy(C, TypeNotScalarized); 6844 } 6845 6846 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6847 ElementCount VF) { 6848 6849 assert(!VF.isScalable() && 6850 "cannot compute scalarization overhead for scalable vectorization"); 6851 if (VF.isScalar()) 6852 return 0; 6853 6854 unsigned Cost = 0; 6855 Type *RetTy = ToVectorTy(I->getType(), VF); 6856 if (!RetTy->isVoidTy() && 6857 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6858 Cost += TTI.getScalarizationOverhead( 6859 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 6860 true, false); 6861 6862 // Some targets keep addresses scalar. 6863 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6864 return Cost; 6865 6866 // Some targets support efficient element stores. 6867 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6868 return Cost; 6869 6870 // Collect operands to consider. 
6871 CallInst *CI = dyn_cast<CallInst>(I); 6872 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6873 6874 // Skip operands that do not require extraction/scalarization and do not incur 6875 // any overhead. 6876 return Cost + TTI.getOperandsScalarizationOverhead( 6877 filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); 6878 } 6879 6880 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6881 if (VF.isScalar()) 6882 return; 6883 NumPredStores = 0; 6884 for (BasicBlock *BB : TheLoop->blocks()) { 6885 // For each instruction in the old loop. 6886 for (Instruction &I : *BB) { 6887 Value *Ptr = getLoadStorePointerOperand(&I); 6888 if (!Ptr) 6889 continue; 6890 6891 // TODO: We should generate better code and update the cost model for 6892 // predicated uniform stores. Today they are treated as any other 6893 // predicated store (see added test cases in 6894 // invariant-store-vectorization.ll). 6895 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6896 NumPredStores++; 6897 6898 if (Legal->isUniformMemOp(I)) { 6899 // TODO: Avoid replicating loads and stores instead of 6900 // relying on instcombine to remove them. 6901 // Load: Scalar load + broadcast 6902 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6903 unsigned Cost = getUniformMemOpCost(&I, VF); 6904 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6905 continue; 6906 } 6907 6908 // We assume that widening is the best solution when possible. 6909 if (memoryInstructionCanBeWidened(&I, VF)) { 6910 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6911 int ConsecutiveStride = 6912 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6913 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6914 "Expected consecutive stride."); 6915 InstWidening Decision = 6916 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6917 setWideningDecision(&I, VF, Decision, Cost); 6918 continue; 6919 } 6920 6921 // Choose between Interleaving, Gather/Scatter or Scalarization. 6922 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6923 unsigned NumAccesses = 1; 6924 if (isAccessInterleaved(&I)) { 6925 auto Group = getInterleavedAccessGroup(&I); 6926 assert(Group && "Fail to get an interleaved access group."); 6927 6928 // Make one decision for the whole group. 6929 if (getWideningDecision(&I, VF) != CM_Unknown) 6930 continue; 6931 6932 NumAccesses = Group->getNumMembers(); 6933 if (interleavedAccessCanBeWidened(&I, VF)) 6934 InterleaveCost = getInterleaveGroupCost(&I, VF); 6935 } 6936 6937 unsigned GatherScatterCost = 6938 isLegalGatherOrScatter(&I) 6939 ? getGatherScatterCost(&I, VF) * NumAccesses 6940 : std::numeric_limits<unsigned>::max(); 6941 6942 unsigned ScalarizationCost = 6943 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6944 6945 // Choose better solution for the current VF, 6946 // write down this decision and use it during vectorization. 6947 unsigned Cost; 6948 InstWidening Decision; 6949 if (InterleaveCost <= GatherScatterCost && 6950 InterleaveCost < ScalarizationCost) { 6951 Decision = CM_Interleave; 6952 Cost = InterleaveCost; 6953 } else if (GatherScatterCost < ScalarizationCost) { 6954 Decision = CM_GatherScatter; 6955 Cost = GatherScatterCost; 6956 } else { 6957 Decision = CM_Scalarize; 6958 Cost = ScalarizationCost; 6959 } 6960 // If the instructions belongs to an interleave group, the whole group 6961 // receives the same decision. 
The whole group receives the cost, but 6962 // the cost will actually be assigned to one instruction. 6963 if (auto Group = getInterleavedAccessGroup(&I)) 6964 setWideningDecision(Group, VF, Decision, Cost); 6965 else 6966 setWideningDecision(&I, VF, Decision, Cost); 6967 } 6968 } 6969 6970 // Make sure that any load of address and any other address computation 6971 // remains scalar unless there is gather/scatter support. This avoids 6972 // inevitable extracts into address registers, and also has the benefit of 6973 // activating LSR more, since that pass can't optimize vectorized 6974 // addresses. 6975 if (TTI.prefersVectorizedAddressing()) 6976 return; 6977 6978 // Start with all scalar pointer uses. 6979 SmallPtrSet<Instruction *, 8> AddrDefs; 6980 for (BasicBlock *BB : TheLoop->blocks()) 6981 for (Instruction &I : *BB) { 6982 Instruction *PtrDef = 6983 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6984 if (PtrDef && TheLoop->contains(PtrDef) && 6985 getWideningDecision(&I, VF) != CM_GatherScatter) 6986 AddrDefs.insert(PtrDef); 6987 } 6988 6989 // Add all instructions used to generate the addresses. 6990 SmallVector<Instruction *, 4> Worklist; 6991 for (auto *I : AddrDefs) 6992 Worklist.push_back(I); 6993 while (!Worklist.empty()) { 6994 Instruction *I = Worklist.pop_back_val(); 6995 for (auto &Op : I->operands()) 6996 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6997 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6998 AddrDefs.insert(InstOp).second) 6999 Worklist.push_back(InstOp); 7000 } 7001 7002 for (auto *I : AddrDefs) { 7003 if (isa<LoadInst>(I)) { 7004 // Setting the desired widening decision should ideally be handled in 7005 // by cost functions, but since this involves the task of finding out 7006 // if the loaded register is involved in an address computation, it is 7007 // instead changed here when we know this is the case. 7008 InstWidening Decision = getWideningDecision(I, VF); 7009 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7010 // Scalarize a widened load of address. 7011 setWideningDecision( 7012 I, VF, CM_Scalarize, 7013 (VF.getKnownMinValue() * 7014 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7015 else if (auto Group = getInterleavedAccessGroup(I)) { 7016 // Scalarize an interleave group of address loads. 7017 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7018 if (Instruction *Member = Group->getMember(I)) 7019 setWideningDecision( 7020 Member, VF, CM_Scalarize, 7021 (VF.getKnownMinValue() * 7022 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7023 } 7024 } 7025 } else 7026 // Make sure I gets scalarized and a cost estimate without 7027 // scalarization overhead. 7028 ForcedScalars[VF].insert(I); 7029 } 7030 } 7031 7032 InstructionCost 7033 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7034 Type *&VectorTy) { 7035 Type *RetTy = I->getType(); 7036 if (canTruncateToMinimalBitwidth(I, VF)) 7037 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7038 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 7039 auto SE = PSE.getSE(); 7040 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7041 7042 // TODO: We need to estimate the cost of intrinsic calls. 7043 switch (I->getOpcode()) { 7044 case Instruction::GetElementPtr: 7045 // We mark this instruction as zero-cost because the cost of GEPs in 7046 // vectorized code depends on whether the corresponding memory instruction 7047 // is scalarized or not. 
Therefore, we handle GEPs with the memory 7048 // instruction cost. 7049 return 0; 7050 case Instruction::Br: { 7051 // In cases of scalarized and predicated instructions, there will be VF 7052 // predicated blocks in the vectorized loop. Each branch around these 7053 // blocks requires also an extract of its vector compare i1 element. 7054 bool ScalarPredicatedBB = false; 7055 BranchInst *BI = cast<BranchInst>(I); 7056 if (VF.isVector() && BI->isConditional() && 7057 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7058 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7059 ScalarPredicatedBB = true; 7060 7061 if (ScalarPredicatedBB) { 7062 // Return cost for branches around scalarized and predicated blocks. 7063 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7064 auto *Vec_i1Ty = 7065 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7066 return (TTI.getScalarizationOverhead( 7067 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7068 false, true) + 7069 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7070 VF.getKnownMinValue())); 7071 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7072 // The back-edge branch will remain, as will all scalar branches. 7073 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7074 else 7075 // This branch will be eliminated by if-conversion. 7076 return 0; 7077 // Note: We currently assume zero cost for an unconditional branch inside 7078 // a predicated block since it will become a fall-through, although we 7079 // may decide in the future to call TTI for all branches. 7080 } 7081 case Instruction::PHI: { 7082 auto *Phi = cast<PHINode>(I); 7083 7084 // First-order recurrences are replaced by vector shuffles inside the loop. 7085 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7086 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7087 return TTI.getShuffleCost( 7088 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7089 VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7090 7091 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7092 // converted into select instructions. We require N - 1 selects per phi 7093 // node, where N is the number of incoming values. 7094 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7095 return (Phi->getNumIncomingValues() - 1) * 7096 TTI.getCmpSelInstrCost( 7097 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7098 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7099 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7100 7101 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7102 } 7103 case Instruction::UDiv: 7104 case Instruction::SDiv: 7105 case Instruction::URem: 7106 case Instruction::SRem: 7107 // If we have a predicated instruction, it may not be executed for each 7108 // vector lane. Get the scalarization cost and scale this amount by the 7109 // probability of executing the predicated block. If the instruction is not 7110 // predicated, we fall through to the next case. 7111 if (VF.isVector() && isScalarWithPredication(I)) { 7112 unsigned Cost = 0; 7113 7114 // These instructions have a non-void type, so account for the phi nodes 7115 // that we will create. This cost is likely to be zero. The phi node 7116 // cost, if any, should be scaled by the block probability because it 7117 // models a copy at the end of each predicated block. 
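// Roughly, each lane of a predicated divide becomes (illustrative pseudo
// IR, names invented):
//   if (mask.lane) { %d = sdiv i32 %a, %b }   ... merged back via a phi
// so we charge one PHI and one divide per lane here, and divide the total
// by getReciprocalPredBlockProb() at the end to model lanes whose predicate
// is false.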
7118 Cost += VF.getKnownMinValue() * 7119 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7120 7121 // The cost of the non-predicated instruction. 7122 Cost += VF.getKnownMinValue() * 7123 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7124 7125 // The cost of insertelement and extractelement instructions needed for 7126 // scalarization. 7127 Cost += getScalarizationOverhead(I, VF); 7128 7129 // Scale the cost by the probability of executing the predicated blocks. 7130 // This assumes the predicated block for each vector lane is equally 7131 // likely. 7132 return Cost / getReciprocalPredBlockProb(); 7133 } 7134 LLVM_FALLTHROUGH; 7135 case Instruction::Add: 7136 case Instruction::FAdd: 7137 case Instruction::Sub: 7138 case Instruction::FSub: 7139 case Instruction::Mul: 7140 case Instruction::FMul: 7141 case Instruction::FDiv: 7142 case Instruction::FRem: 7143 case Instruction::Shl: 7144 case Instruction::LShr: 7145 case Instruction::AShr: 7146 case Instruction::And: 7147 case Instruction::Or: 7148 case Instruction::Xor: { 7149 // Since we will replace the stride by 1 the multiplication should go away. 7150 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7151 return 0; 7152 // Certain instructions can be cheaper to vectorize if they have a constant 7153 // second vector operand. One example of this are shifts on x86. 7154 Value *Op2 = I->getOperand(1); 7155 TargetTransformInfo::OperandValueProperties Op2VP; 7156 TargetTransformInfo::OperandValueKind Op2VK = 7157 TTI.getOperandInfo(Op2, Op2VP); 7158 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7159 Op2VK = TargetTransformInfo::OK_UniformValue; 7160 7161 SmallVector<const Value *, 4> Operands(I->operand_values()); 7162 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7163 return N * TTI.getArithmeticInstrCost( 7164 I->getOpcode(), VectorTy, CostKind, 7165 TargetTransformInfo::OK_AnyValue, 7166 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7167 } 7168 case Instruction::FNeg: { 7169 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7170 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7171 return N * TTI.getArithmeticInstrCost( 7172 I->getOpcode(), VectorTy, CostKind, 7173 TargetTransformInfo::OK_AnyValue, 7174 TargetTransformInfo::OK_AnyValue, 7175 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7176 I->getOperand(0), I); 7177 } 7178 case Instruction::Select: { 7179 SelectInst *SI = cast<SelectInst>(I); 7180 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7181 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7182 Type *CondTy = SI->getCondition()->getType(); 7183 if (!ScalarCond) { 7184 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7185 CondTy = VectorType::get(CondTy, VF); 7186 } 7187 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7188 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7189 } 7190 case Instruction::ICmp: 7191 case Instruction::FCmp: { 7192 Type *ValTy = I->getOperand(0)->getType(); 7193 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7194 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7195 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7196 VectorTy = ToVectorTy(ValTy, VF); 7197 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7198 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7199 } 7200 case Instruction::Store: 7201 case Instruction::Load: { 7202 ElementCount Width = VF; 7203 if (Width.isVector()) { 7204 InstWidening Decision = getWideningDecision(I, Width); 7205 assert(Decision != CM_Unknown && 7206 "CM decision should be taken at this point"); 7207 if (Decision == CM_Scalarize) 7208 Width = ElementCount::getFixed(1); 7209 } 7210 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7211 return getMemoryInstructionCost(I, VF); 7212 } 7213 case Instruction::ZExt: 7214 case Instruction::SExt: 7215 case Instruction::FPToUI: 7216 case Instruction::FPToSI: 7217 case Instruction::FPExt: 7218 case Instruction::PtrToInt: 7219 case Instruction::IntToPtr: 7220 case Instruction::SIToFP: 7221 case Instruction::UIToFP: 7222 case Instruction::Trunc: 7223 case Instruction::FPTrunc: 7224 case Instruction::BitCast: { 7225 // Computes the CastContextHint from a Load/Store instruction. 7226 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7227 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7228 "Expected a load or a store!"); 7229 7230 if (VF.isScalar() || !TheLoop->contains(I)) 7231 return TTI::CastContextHint::Normal; 7232 7233 switch (getWideningDecision(I, VF)) { 7234 case LoopVectorizationCostModel::CM_GatherScatter: 7235 return TTI::CastContextHint::GatherScatter; 7236 case LoopVectorizationCostModel::CM_Interleave: 7237 return TTI::CastContextHint::Interleave; 7238 case LoopVectorizationCostModel::CM_Scalarize: 7239 case LoopVectorizationCostModel::CM_Widen: 7240 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7241 : TTI::CastContextHint::Normal; 7242 case LoopVectorizationCostModel::CM_Widen_Reverse: 7243 return TTI::CastContextHint::Reversed; 7244 case LoopVectorizationCostModel::CM_Unknown: 7245 llvm_unreachable("Instr did not go through cost modelling?"); 7246 } 7247 7248 llvm_unreachable("Unhandled case!"); 7249 }; 7250 7251 unsigned Opcode = I->getOpcode(); 7252 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7253 // For Trunc, the context is the only user, which must be a StoreInst. 
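// For example (illustrative IR):
//   %t = trunc i32 %x to i16
//   store i16 %t, i16* %p
// The truncate takes its CastContextHint from the widening decision of that
// store, so a masked or reversed store is reflected in the cast's cost as
// well.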
7254 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7255 if (I->hasOneUse()) 7256 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7257 CCH = ComputeCCH(Store); 7258 } 7259 // For Z/Sext, the context is the operand, which must be a LoadInst. 7260 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7261 Opcode == Instruction::FPExt) { 7262 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7263 CCH = ComputeCCH(Load); 7264 } 7265 7266 // We optimize the truncation of induction variables having constant 7267 // integer steps. The cost of these truncations is the same as the scalar 7268 // operation. 7269 if (isOptimizableIVTruncate(I, VF)) { 7270 auto *Trunc = cast<TruncInst>(I); 7271 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7272 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7273 } 7274 7275 Type *SrcScalarTy = I->getOperand(0)->getType(); 7276 Type *SrcVecTy = 7277 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7278 if (canTruncateToMinimalBitwidth(I, VF)) { 7279 // This cast is going to be shrunk. This may remove the cast or it might 7280 // turn it into slightly different cast. For example, if MinBW == 16, 7281 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7282 // 7283 // Calculate the modified src and dest types. 7284 Type *MinVecTy = VectorTy; 7285 if (Opcode == Instruction::Trunc) { 7286 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7287 VectorTy = 7288 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7289 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7290 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7291 VectorTy = 7292 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7293 } 7294 } 7295 7296 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7297 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7298 return N * 7299 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7300 } 7301 case Instruction::Call: { 7302 bool NeedToScalarize; 7303 CallInst *CI = cast<CallInst>(I); 7304 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7305 if (getVectorIntrinsicIDForCall(CI, TLI)) 7306 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 7307 return CallCost; 7308 } 7309 case Instruction::ExtractValue: 7310 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7311 default: 7312 // The cost of executing VF copies of the scalar instruction. This opcode 7313 // is unknown. Assume that it is the same as 'mul'. 7314 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7315 Instruction::Mul, VectorTy, CostKind) + 7316 getScalarizationOverhead(I, VF); 7317 } // end of switch. 
7318 } 7319 7320 char LoopVectorize::ID = 0; 7321 7322 static const char lv_name[] = "Loop Vectorization"; 7323 7324 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7325 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7326 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7327 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7328 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7329 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7330 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7331 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7332 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7333 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7334 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7335 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7336 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7337 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7338 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7339 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7340 7341 namespace llvm { 7342 7343 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7344 7345 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7346 bool VectorizeOnlyWhenForced) { 7347 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7348 } 7349 7350 } // end namespace llvm 7351 7352 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7353 // Check if the pointer operand of a load or store instruction is 7354 // consecutive. 7355 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7356 return Legal->isConsecutivePtr(Ptr); 7357 return false; 7358 } 7359 7360 void LoopVectorizationCostModel::collectValuesToIgnore() { 7361 // Ignore ephemeral values. 7362 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7363 7364 // Ignore type-promoting instructions we identified during reduction 7365 // detection. 7366 for (auto &Reduction : Legal->getReductionVars()) { 7367 RecurrenceDescriptor &RedDes = Reduction.second; 7368 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7369 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7370 } 7371 // Ignore type-casting instructions we identified during induction 7372 // detection. 7373 for (auto &Induction : Legal->getInductionVars()) { 7374 InductionDescriptor &IndDes = Induction.second; 7375 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7376 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7377 } 7378 } 7379 7380 void LoopVectorizationCostModel::collectInLoopReductions() { 7381 for (auto &Reduction : Legal->getReductionVars()) { 7382 PHINode *Phi = Reduction.first; 7383 RecurrenceDescriptor &RdxDesc = Reduction.second; 7384 7385 // We don't collect reductions that are type promoted (yet). 7386 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7387 continue; 7388 7389 // If the target would prefer this reduction to happen "in-loop", then we 7390 // want to record it as such. 7391 unsigned Opcode = RdxDesc.getOpcode(); 7392 if (!PreferInLoopReductions && 7393 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7394 TargetTransformInfo::ReductionFlags())) 7395 continue; 7396 7397 // Check that we can correctly put the reductions into the loop, by 7398 // finding the chain of operations that leads from the phi to the loop 7399 // exit value. 
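// For example (illustrative IR, names invented):
//   %sum = phi i32 [ 0, %preheader ], [ %sum.next, %latch ]
//   %val = load i32, i32* %gep
//   %sum.next = add i32 %sum, %val
// Here the chain is roughly just the add. If no such chain can be found,
// the reduction simply stays out-of-loop (wide vector accumulator plus a
// final horizontal reduction after the loop).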
    SmallVector<Instruction *, 4> ReductionOperations =
        RdxDesc.getReductionOpChain(Phi, TheLoop);
    bool InLoop = !ReductionOperations.empty();
    if (InLoop)
      InLoopReductionChains[Phi] = ReductionOperations;
    LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
                      << " reduction for phi: " << *Phi << "\n");
  }
}

// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}

VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  assert(!UserVF.isScalable() && "scalable vectors not yet supported");
  ElementCount VF = UserVF;
  // Outer loop handling: they may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = ElementCount::getFixed(
          determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(4);
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
  if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
    return None;

  // Invalidate interleave groups if all blocks of the loop will be predicated.
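// The loop header only needs predication when the tail is folded by
// masking, so this check effectively asks: "are we tail folding on a target
// without masked-interleaved support?" If so, the groups are dropped and
// their members are costed as ordinary (masked) accesses.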
7473 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7474 !useMaskedInterleavedAccesses(*TTI)) { 7475 LLVM_DEBUG( 7476 dbgs() 7477 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7478 "which requires masked-interleaved support.\n"); 7479 if (CM.InterleaveInfo.invalidateGroups()) 7480 // Invalidating interleave groups also requires invalidating all decisions 7481 // based on them, which includes widening decisions and uniform and scalar 7482 // values. 7483 CM.invalidateCostModelingDecisions(); 7484 } 7485 7486 ElementCount MaxVF = MaybeMaxVF.getValue(); 7487 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7488 7489 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); 7490 if (!UserVF.isZero() && 7491 (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { 7492 // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable 7493 // VFs here, this should be reverted to only use legal UserVFs once the 7494 // loop below supports scalable VFs. 7495 ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; 7496 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") 7497 << " VF " << VF << ".\n"); 7498 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7499 "VF needs to be a power of two"); 7500 // Collect the instructions (and their associated costs) that will be more 7501 // profitable to scalarize. 7502 CM.selectUserVectorizationFactor(VF); 7503 CM.collectInLoopReductions(); 7504 buildVPlansWithVPRecipes(VF, VF); 7505 LLVM_DEBUG(printPlans(dbgs())); 7506 return {{VF, 0}}; 7507 } 7508 7509 assert(!MaxVF.isScalable() && 7510 "Scalable vectors not yet supported beyond this point"); 7511 7512 for (ElementCount VF = ElementCount::getFixed(1); 7513 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7514 // Collect Uniform and Scalar instructions after vectorization with VF. 7515 CM.collectUniformsAndScalars(VF); 7516 7517 // Collect the instructions (and their associated costs) that will be more 7518 // profitable to scalarize. 7519 if (VF.isVector()) 7520 CM.collectInstsToScalarize(VF); 7521 } 7522 7523 CM.collectInLoopReductions(); 7524 7525 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7526 LLVM_DEBUG(printPlans(dbgs())); 7527 if (MaxVF.isScalar()) 7528 return VectorizationFactor::Disabled(); 7529 7530 // Select the optimal vectorization factor. 7531 return CM.selectVectorizationFactor(MaxVF); 7532 } 7533 7534 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7535 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7536 << '\n'); 7537 BestVF = VF; 7538 BestUF = UF; 7539 7540 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7541 return !Plan->hasVF(VF); 7542 }); 7543 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7544 } 7545 7546 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7547 DominatorTree *DT) { 7548 // Perform the actual loop transformation. 7549 7550 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 
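// Roughly, the skeleton built below consists of the minimum-iteration and
// runtime checks, a vector loop containing only its canonical induction,
// the middle block that decides whether a scalar remainder loop must run,
// and the scalar preheader; step 2 then fills in the vector loop body by
// executing the VPlan.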
7551 VPCallbackILV CallbackILV(ILV); 7552 7553 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7554 7555 VPTransformState State{*BestVF, BestUF, LI, 7556 DT, ILV.Builder, ILV.VectorLoopValueMap, 7557 &ILV, CallbackILV}; 7558 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7559 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7560 State.CanonicalIV = ILV.Induction; 7561 7562 ILV.printDebugTracesAtStart(); 7563 7564 //===------------------------------------------------===// 7565 // 7566 // Notice: any optimization or new instruction that go 7567 // into the code below should also be implemented in 7568 // the cost-model. 7569 // 7570 //===------------------------------------------------===// 7571 7572 // 2. Copy and widen instructions from the old loop into the new loop. 7573 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7574 VPlans.front()->execute(&State); 7575 7576 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7577 // predication, updating analyses. 7578 ILV.fixVectorizedLoop(); 7579 7580 ILV.printDebugTracesAtEnd(); 7581 } 7582 7583 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7584 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7585 7586 // We create new control-flow for the vectorized loop, so the original exit 7587 // conditions will be dead after vectorization if it's only used by the 7588 // terminator 7589 SmallVector<BasicBlock*> ExitingBlocks; 7590 OrigLoop->getExitingBlocks(ExitingBlocks); 7591 for (auto *BB : ExitingBlocks) { 7592 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 7593 if (!Cmp || !Cmp->hasOneUse()) 7594 continue; 7595 7596 // TODO: we should introduce a getUniqueExitingBlocks on Loop 7597 if (!DeadInstructions.insert(Cmp).second) 7598 continue; 7599 7600 // The operands of the icmp is often a dead trunc, used by IndUpdate. 7601 // TODO: can recurse through operands in general 7602 for (Value *Op : Cmp->operands()) { 7603 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7604 DeadInstructions.insert(cast<Instruction>(Op)); 7605 } 7606 } 7607 7608 // We create new "steps" for induction variable updates to which the original 7609 // induction variables map. An original update instruction will be dead if 7610 // all its users except the induction variable are dead. 7611 auto *Latch = OrigLoop->getLoopLatch(); 7612 for (auto &Induction : Legal->getInductionVars()) { 7613 PHINode *Ind = Induction.first; 7614 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7615 7616 // If the tail is to be folded by masking, the primary induction variable, 7617 // if exists, isn't dead: it will be used for masking. Don't kill it. 7618 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7619 continue; 7620 7621 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7622 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7623 })) 7624 DeadInstructions.insert(IndUpdate); 7625 7626 // We record as "Dead" also the type-casting instructions we had identified 7627 // during induction analysis. We don't need any handling for them in the 7628 // vectorized loop because we have proven that, under a proper runtime 7629 // test guarding the vectorized loop, the value of the phi, and the casted 7630 // value of the phi, are the same. The last instruction in this casting chain 7631 // will get its scalar/vector/widened def from the scalar/vector/widened def 7632 // of the respective phi node. 
Any other casts in the induction def-use chain 7633 // have no other uses outside the phi update chain, and will be ignored. 7634 InductionDescriptor &IndDes = Induction.second; 7635 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7636 DeadInstructions.insert(Casts.begin(), Casts.end()); 7637 } 7638 } 7639 7640 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7641 7642 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7643 7644 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7645 Instruction::BinaryOps BinOp) { 7646 // When unrolling and the VF is 1, we only need to add a simple scalar. 7647 Type *Ty = Val->getType(); 7648 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7649 7650 if (Ty->isFloatingPointTy()) { 7651 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7652 7653 // Floating point operations had to be 'fast' to enable the unrolling. 7654 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 7655 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 7656 } 7657 Constant *C = ConstantInt::get(Ty, StartIdx); 7658 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7659 } 7660 7661 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7662 SmallVector<Metadata *, 4> MDs; 7663 // Reserve first location for self reference to the LoopID metadata node. 7664 MDs.push_back(nullptr); 7665 bool IsUnrollMetadata = false; 7666 MDNode *LoopID = L->getLoopID(); 7667 if (LoopID) { 7668 // First find existing loop unrolling disable metadata. 7669 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7670 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7671 if (MD) { 7672 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7673 IsUnrollMetadata = 7674 S && S->getString().startswith("llvm.loop.unroll.disable"); 7675 } 7676 MDs.push_back(LoopID->getOperand(i)); 7677 } 7678 } 7679 7680 if (!IsUnrollMetadata) { 7681 // Add runtime unroll disable metadata. 7682 LLVMContext &Context = L->getHeader()->getContext(); 7683 SmallVector<Metadata *, 1> DisableOperands; 7684 DisableOperands.push_back( 7685 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7686 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7687 MDs.push_back(DisableNode); 7688 MDNode *NewLoopID = MDNode::get(Context, MDs); 7689 // Set operand 0 to refer to the loop id itself. 7690 NewLoopID->replaceOperandWith(0, NewLoopID); 7691 L->setLoopID(NewLoopID); 7692 } 7693 } 7694 7695 //===--------------------------------------------------------------------===// 7696 // EpilogueVectorizerMainLoop 7697 //===--------------------------------------------------------------------===// 7698 7699 /// This function is partially responsible for generating the control flow 7700 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7701 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7702 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7703 Loop *Lp = createVectorLoopSkeleton(""); 7704 7705 // Generate the code to check the minimum iteration count of the vector 7706 // epilogue (see below). 7707 EPI.EpilogueIterationCountCheck = 7708 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 7709 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7710 7711 // Generate the code to check any assumptions that we've made for SCEV 7712 // expressions. 
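// emitSCEVChecks splits off a new preheader only when it actually emits a
// check, so comparing LoopVectorPreHeader before and after the call is how
// we detect that a check block was created; the saved pointer then names
// the check block itself so the second (epilogue) pass can reuse it.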
7713 BasicBlock *SavedPreHeader = LoopVectorPreHeader; 7714 emitSCEVChecks(Lp, LoopScalarPreHeader); 7715 7716 // If a safety check was generated save it. 7717 if (SavedPreHeader != LoopVectorPreHeader) 7718 EPI.SCEVSafetyCheck = SavedPreHeader; 7719 7720 // Generate the code that checks at runtime if arrays overlap. We put the 7721 // checks into a separate block to make the more common case of few elements 7722 // faster. 7723 SavedPreHeader = LoopVectorPreHeader; 7724 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 7725 7726 // If a safety check was generated save/overwite it. 7727 if (SavedPreHeader != LoopVectorPreHeader) 7728 EPI.MemSafetyCheck = SavedPreHeader; 7729 7730 // Generate the iteration count check for the main loop, *after* the check 7731 // for the epilogue loop, so that the path-length is shorter for the case 7732 // that goes directly through the vector epilogue. The longer-path length for 7733 // the main loop is compensated for, by the gain from vectorizing the larger 7734 // trip count. Note: the branch will get updated later on when we vectorize 7735 // the epilogue. 7736 EPI.MainLoopIterationCountCheck = 7737 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 7738 7739 // Generate the induction variable. 7740 OldInduction = Legal->getPrimaryInduction(); 7741 Type *IdxTy = Legal->getWidestInductionType(); 7742 Value *StartIdx = ConstantInt::get(IdxTy, 0); 7743 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7744 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7745 EPI.VectorTripCount = CountRoundDown; 7746 Induction = 7747 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7748 getDebugLocFromInstOrOperands(OldInduction)); 7749 7750 // Skip induction resume value creation here because they will be created in 7751 // the second pass. If we created them here, they wouldn't be used anyway, 7752 // because the vplan in the second pass still contains the inductions from the 7753 // original loop. 7754 7755 return completeLoopSkeleton(Lp, OrigLoopID); 7756 } 7757 7758 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7759 LLVM_DEBUG({ 7760 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7761 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7762 << ", Main Loop UF:" << EPI.MainLoopUF 7763 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7764 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7765 }); 7766 } 7767 7768 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7769 DEBUG_WITH_TYPE(VerboseDebug, { 7770 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 7771 }); 7772 } 7773 7774 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 7775 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 7776 assert(L && "Expected valid Loop."); 7777 assert(Bypass && "Expected valid bypass basic block."); 7778 unsigned VFactor = 7779 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 7780 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7781 Value *Count = getOrCreateTripCount(L); 7782 // Reuse existing vector loop preheader for TC checks. 7783 // Note that new preheader block is generated for vector loop. 7784 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7785 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7786 7787 // Generate code to check if the loop's trip count is less than VF * UF of the 7788 // main vector loop. 7789 auto P = 7790 Cost->requiresScalarEpilogue() ? 
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7791 7792 Value *CheckMinIters = Builder.CreateICmp( 7793 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 7794 "min.iters.check"); 7795 7796 if (!ForEpilogue) 7797 TCCheckBlock->setName("vector.main.loop.iter.check"); 7798 7799 // Create new preheader for vector loop. 7800 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7801 DT, LI, nullptr, "vector.ph"); 7802 7803 if (ForEpilogue) { 7804 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7805 DT->getNode(Bypass)->getIDom()) && 7806 "TC check is expected to dominate Bypass"); 7807 7808 // Update dominator for Bypass & LoopExit. 7809 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7810 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7811 7812 LoopBypassBlocks.push_back(TCCheckBlock); 7813 7814 // Save the trip count so we don't have to regenerate it in the 7815 // vec.epilog.iter.check. This is safe to do because the trip count 7816 // generated here dominates the vector epilog iter check. 7817 EPI.TripCount = Count; 7818 } 7819 7820 ReplaceInstWithInst( 7821 TCCheckBlock->getTerminator(), 7822 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7823 7824 return TCCheckBlock; 7825 } 7826 7827 //===--------------------------------------------------------------------===// 7828 // EpilogueVectorizerEpilogueLoop 7829 //===--------------------------------------------------------------------===// 7830 7831 /// This function is partially responsible for generating the control flow 7832 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7833 BasicBlock * 7834 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7835 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7836 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 7837 7838 // Now, compare the remaining count and if there aren't enough iterations to 7839 // execute the vectorized epilogue skip to the scalar part. 7840 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7841 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7842 LoopVectorPreHeader = 7843 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7844 LI, nullptr, "vec.epilog.ph"); 7845 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 7846 VecEpilogueIterationCountCheck); 7847 7848 // Adjust the control flow taking the state info from the main loop 7849 // vectorization into account. 
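// Roughly, after this rewiring: the first-pass "iter.check" bypasses
// straight to the scalar preheader when even the epilogue VF is not worth
// running, while the main-loop iteration-count check bypasses directly to
// the epilogue vector preheader, since the epilogue minimum was already
// established on that path.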
7850 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7851 "expected this to be saved from the previous pass."); 7852 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7853 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7854 7855 DT->changeImmediateDominator(LoopVectorPreHeader, 7856 EPI.MainLoopIterationCountCheck); 7857 7858 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7859 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7860 7861 if (EPI.SCEVSafetyCheck) 7862 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7863 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7864 if (EPI.MemSafetyCheck) 7865 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7866 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7867 7868 DT->changeImmediateDominator( 7869 VecEpilogueIterationCountCheck, 7870 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7871 7872 DT->changeImmediateDominator(LoopScalarPreHeader, 7873 EPI.EpilogueIterationCountCheck); 7874 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 7875 7876 // Keep track of bypass blocks, as they feed start values to the induction 7877 // phis in the scalar loop preheader. 7878 if (EPI.SCEVSafetyCheck) 7879 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7880 if (EPI.MemSafetyCheck) 7881 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7882 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7883 7884 // Generate a resume induction for the vector epilogue and put it in the 7885 // vector epilogue preheader 7886 Type *IdxTy = Legal->getWidestInductionType(); 7887 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 7888 LoopVectorPreHeader->getFirstNonPHI()); 7889 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 7890 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 7891 EPI.MainLoopIterationCountCheck); 7892 7893 // Generate the induction variable. 7894 OldInduction = Legal->getPrimaryInduction(); 7895 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7896 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7897 Value *StartIdx = EPResumeVal; 7898 Induction = 7899 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7900 getDebugLocFromInstOrOperands(OldInduction)); 7901 7902 // Generate induction resume values. These variables save the new starting 7903 // indexes for the scalar loop. They are used to test if there are any tail 7904 // iterations left once the vector loop has completed. 7905 // Note that when the vectorized epilogue is skipped due to iteration count 7906 // check, then the resume value for the induction variable comes from 7907 // the trip count of the main vector loop, hence passing the AdditionalBypass 7908 // argument. 
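// In other words, the {block, value} pair passed below says: when the
// scalar loop is entered from VecEpilogueIterationCountCheck (i.e. the
// epilogue vector loop was skipped), inductions resume from
// EPI.VectorTripCount, the point where the main vector loop stopped.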
7909 createInductionResumeValues(Lp, CountRoundDown, 7910 {VecEpilogueIterationCountCheck, 7911 EPI.VectorTripCount} /* AdditionalBypass */); 7912 7913 AddRuntimeUnrollDisableMetaData(Lp); 7914 return completeLoopSkeleton(Lp, OrigLoopID); 7915 } 7916 7917 BasicBlock * 7918 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 7919 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 7920 7921 assert(EPI.TripCount && 7922 "Expected trip count to have been safed in the first pass."); 7923 assert( 7924 (!isa<Instruction>(EPI.TripCount) || 7925 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 7926 "saved trip count does not dominate insertion point."); 7927 Value *TC = EPI.TripCount; 7928 IRBuilder<> Builder(Insert->getTerminator()); 7929 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 7930 7931 // Generate code to check if the loop's trip count is less than VF * UF of the 7932 // vector epilogue loop. 7933 auto P = 7934 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7935 7936 Value *CheckMinIters = Builder.CreateICmp( 7937 P, Count, 7938 ConstantInt::get(Count->getType(), 7939 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 7940 "min.epilog.iters.check"); 7941 7942 ReplaceInstWithInst( 7943 Insert->getTerminator(), 7944 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7945 7946 LoopBypassBlocks.push_back(Insert); 7947 return Insert; 7948 } 7949 7950 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7951 LLVM_DEBUG({ 7952 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 7953 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7954 << ", Main Loop UF:" << EPI.MainLoopUF 7955 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7956 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7957 }); 7958 } 7959 7960 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 7961 DEBUG_WITH_TYPE(VerboseDebug, { 7962 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 7963 }); 7964 } 7965 7966 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7967 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7968 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 7969 bool PredicateAtRangeStart = Predicate(Range.Start); 7970 7971 for (ElementCount TmpVF = Range.Start * 2; 7972 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 7973 if (Predicate(TmpVF) != PredicateAtRangeStart) { 7974 Range.End = TmpVF; 7975 break; 7976 } 7977 7978 return PredicateAtRangeStart; 7979 } 7980 7981 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7982 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7983 /// of VF's starting at a given VF and extending it as much as possible. Each 7984 /// vectorization decision can potentially shorten this sub-range during 7985 /// buildVPlan(). 7986 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 7987 ElementCount MaxVF) { 7988 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 7989 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 7990 VFRange SubRange = {VF, MaxVFPlusOne}; 7991 VPlans.push_back(buildVPlan(SubRange)); 7992 VF = SubRange.End; 7993 } 7994 } 7995 7996 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 7997 VPlanPtr &Plan) { 7998 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7999 8000 // Look for cached value. 
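// An edge mask is the source block's mask, ANDed with the branch condition
// (or its negation) that selects Dst. Caching it per (Src, Dst) pair avoids
// rebuilding the same mask expression for every phi blend or masked access
// that asks for it.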
8001 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8002 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8003 if (ECEntryIt != EdgeMaskCache.end()) 8004 return ECEntryIt->second; 8005 8006 VPValue *SrcMask = createBlockInMask(Src, Plan); 8007 8008 // The terminator has to be a branch inst! 8009 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8010 assert(BI && "Unexpected terminator found"); 8011 8012 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8013 return EdgeMaskCache[Edge] = SrcMask; 8014 8015 // If source is an exiting block, we know the exit edge is dynamically dead 8016 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8017 // adding uses of an otherwise potentially dead instruction. 8018 if (OrigLoop->isLoopExiting(Src)) 8019 return EdgeMaskCache[Edge] = SrcMask; 8020 8021 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8022 assert(EdgeMask && "No Edge Mask found for condition"); 8023 8024 if (BI->getSuccessor(0) != Dst) 8025 EdgeMask = Builder.createNot(EdgeMask); 8026 8027 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 8028 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 8029 8030 return EdgeMaskCache[Edge] = EdgeMask; 8031 } 8032 8033 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8034 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8035 8036 // Look for cached value. 8037 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8038 if (BCEntryIt != BlockMaskCache.end()) 8039 return BCEntryIt->second; 8040 8041 // All-one mask is modelled as no-mask following the convention for masked 8042 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8043 VPValue *BlockMask = nullptr; 8044 8045 if (OrigLoop->getHeader() == BB) { 8046 if (!CM.blockNeedsPredication(BB)) 8047 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8048 8049 // Create the block in mask as the first non-phi instruction in the block. 8050 VPBuilder::InsertPointGuard Guard(Builder); 8051 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8052 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8053 8054 // Introduce the early-exit compare IV <= BTC to form header block mask. 8055 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8056 // Start by constructing the desired canonical IV. 8057 VPValue *IV = nullptr; 8058 if (Legal->getPrimaryInduction()) 8059 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8060 else { 8061 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8062 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8063 IV = IVRecipe->getVPValue(); 8064 } 8065 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8066 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8067 8068 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8069 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8070 // as a second argument, we only pass the IV here and extract the 8071 // tripcount from the transform state where codegen of the VP instructions 8072 // happen. 8073 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8074 } else { 8075 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8076 } 8077 return BlockMaskCache[BB] = BlockMask; 8078 } 8079 8080 // This is the block mask. We OR all incoming edges. 
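// For example, a block reached from both the "then" and the "else" side of
// a diamond gets (edge.then | edge.else). If any incoming edge turns out to
// be unmasked (nullptr), the block executes unconditionally and no mask is
// recorded for it either.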
8081 for (auto *Predecessor : predecessors(BB)) { 8082 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8083 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8084 return BlockMaskCache[BB] = EdgeMask; 8085 8086 if (!BlockMask) { // BlockMask has its initialized nullptr value. 8087 BlockMask = EdgeMask; 8088 continue; 8089 } 8090 8091 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8092 } 8093 8094 return BlockMaskCache[BB] = BlockMask; 8095 } 8096 8097 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 8098 VPlanPtr &Plan) { 8099 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8100 "Must be called with either a load or store"); 8101 8102 auto willWiden = [&](ElementCount VF) -> bool { 8103 if (VF.isScalar()) 8104 return false; 8105 LoopVectorizationCostModel::InstWidening Decision = 8106 CM.getWideningDecision(I, VF); 8107 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8108 "CM decision should be taken at this point."); 8109 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8110 return true; 8111 if (CM.isScalarAfterVectorization(I, VF) || 8112 CM.isProfitableToScalarize(I, VF)) 8113 return false; 8114 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8115 }; 8116 8117 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8118 return nullptr; 8119 8120 VPValue *Mask = nullptr; 8121 if (Legal->isMaskRequired(I)) 8122 Mask = createBlockInMask(I->getParent(), Plan); 8123 8124 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 8125 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8126 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 8127 8128 StoreInst *Store = cast<StoreInst>(I); 8129 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 8130 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 8131 } 8132 8133 VPWidenIntOrFpInductionRecipe * 8134 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { 8135 // Check if this is an integer or fp induction. If so, build the recipe that 8136 // produces its scalar and vector values. 8137 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8138 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8139 II.getKind() == InductionDescriptor::IK_FpInduction) { 8140 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8141 return new VPWidenIntOrFpInductionRecipe(Phi, Start); 8142 } 8143 8144 return nullptr; 8145 } 8146 8147 VPWidenIntOrFpInductionRecipe * 8148 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, 8149 VPlan &Plan) const { 8150 // Optimize the special case where the source is a constant integer 8151 // induction variable. Notice that we can only optimize the 'trunc' case 8152 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8153 // (c) other casts depend on pointer size. 8154 8155 // Determine whether \p K is a truncation based on an induction variable that 8156 // can be optimized. 
8157 auto isOptimizableIVTruncate = 8158 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8159 return [=](ElementCount VF) -> bool { 8160 return CM.isOptimizableIVTruncate(K, VF); 8161 }; 8162 }; 8163 8164 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8165 isOptimizableIVTruncate(I), Range)) { 8166 8167 InductionDescriptor II = 8168 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8169 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8170 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8171 Start, I); 8172 } 8173 return nullptr; 8174 } 8175 8176 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 8177 // We know that all PHIs in non-header blocks are converted into selects, so 8178 // we don't have to worry about the insertion order and we can just use the 8179 // builder. At this point we generate the predication tree. There may be 8180 // duplications since this is a simple recursive scan, but future 8181 // optimizations will clean it up. 8182 8183 SmallVector<VPValue *, 2> Operands; 8184 unsigned NumIncoming = Phi->getNumIncomingValues(); 8185 for (unsigned In = 0; In < NumIncoming; In++) { 8186 VPValue *EdgeMask = 8187 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8188 assert((EdgeMask || NumIncoming == 1) && 8189 "Multiple predecessors with one having a full mask"); 8190 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 8191 if (EdgeMask) 8192 Operands.push_back(EdgeMask); 8193 } 8194 return new VPBlendRecipe(Phi, Operands); 8195 } 8196 8197 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 8198 VPlan &Plan) const { 8199 8200 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8201 [this, CI](ElementCount VF) { 8202 return CM.isScalarWithPredication(CI, VF); 8203 }, 8204 Range); 8205 8206 if (IsPredicated) 8207 return nullptr; 8208 8209 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8210 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8211 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8212 ID == Intrinsic::pseudoprobe)) 8213 return nullptr; 8214 8215 auto willWiden = [&](ElementCount VF) -> bool { 8216 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8217 // The following case may be scalarized depending on the VF. 8218 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8219 // version of the instruction. 8220 // Is it beneficial to perform intrinsic call compared to lib call? 8221 bool NeedToScalarize = false; 8222 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8223 bool UseVectorIntrinsic = 8224 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 8225 return UseVectorIntrinsic || !NeedToScalarize; 8226 }; 8227 8228 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8229 return nullptr; 8230 8231 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 8232 } 8233 8234 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8235 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8236 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8237 // Instruction should be widened, unless it is scalar after vectorization, 8238 // scalarization is profitable or it is predicated. 
8239 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8240 return CM.isScalarAfterVectorization(I, VF) || 8241 CM.isProfitableToScalarize(I, VF) || 8242 CM.isScalarWithPredication(I, VF); 8243 }; 8244 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8245 Range); 8246 } 8247 8248 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 8249 auto IsVectorizableOpcode = [](unsigned Opcode) { 8250 switch (Opcode) { 8251 case Instruction::Add: 8252 case Instruction::And: 8253 case Instruction::AShr: 8254 case Instruction::BitCast: 8255 case Instruction::FAdd: 8256 case Instruction::FCmp: 8257 case Instruction::FDiv: 8258 case Instruction::FMul: 8259 case Instruction::FNeg: 8260 case Instruction::FPExt: 8261 case Instruction::FPToSI: 8262 case Instruction::FPToUI: 8263 case Instruction::FPTrunc: 8264 case Instruction::FRem: 8265 case Instruction::FSub: 8266 case Instruction::ICmp: 8267 case Instruction::IntToPtr: 8268 case Instruction::LShr: 8269 case Instruction::Mul: 8270 case Instruction::Or: 8271 case Instruction::PtrToInt: 8272 case Instruction::SDiv: 8273 case Instruction::Select: 8274 case Instruction::SExt: 8275 case Instruction::Shl: 8276 case Instruction::SIToFP: 8277 case Instruction::SRem: 8278 case Instruction::Sub: 8279 case Instruction::Trunc: 8280 case Instruction::UDiv: 8281 case Instruction::UIToFP: 8282 case Instruction::URem: 8283 case Instruction::Xor: 8284 case Instruction::ZExt: 8285 return true; 8286 } 8287 return false; 8288 }; 8289 8290 if (!IsVectorizableOpcode(I->getOpcode())) 8291 return nullptr; 8292 8293 // Success: widen this instruction. 8294 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 8295 } 8296 8297 VPBasicBlock *VPRecipeBuilder::handleReplication( 8298 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8299 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 8300 VPlanPtr &Plan) { 8301 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8302 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8303 Range); 8304 8305 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8306 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 8307 Range); 8308 8309 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8310 IsUniform, IsPredicated); 8311 setRecipe(I, Recipe); 8312 Plan->addVPValue(I, Recipe); 8313 8314 // Find if I uses a predicated instruction. If so, it will use its scalar 8315 // value. Avoid hoisting the insert-element which packs the scalar value into 8316 // a vector value, as that happens iff all users use the vector value. 8317 for (auto &Op : I->operands()) 8318 if (auto *PredInst = dyn_cast<Instruction>(Op)) 8319 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 8320 PredInst2Recipe[PredInst]->setAlsoPack(false); 8321 8322 // Finalize the recipe for Instr, first if it is not predicated. 8323 if (!IsPredicated) { 8324 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8325 VPBB->appendRecipe(Recipe); 8326 return VPBB; 8327 } 8328 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8329 assert(VPBB->getSuccessors().empty() && 8330 "VPBB has successors when handling predicated replication."); 8331 // Record predicated instructions for above packing optimizations. 
8332 PredInst2Recipe[I] = Recipe; 8333 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8334 VPBlockUtils::insertBlockAfter(Region, VPBB); 8335 auto *RegSucc = new VPBasicBlock(); 8336 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8337 return RegSucc; 8338 } 8339 8340 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8341 VPRecipeBase *PredRecipe, 8342 VPlanPtr &Plan) { 8343 // Instructions marked for predication are replicated and placed under an 8344 // if-then construct to prevent side-effects. 8345 8346 // Generate recipes to compute the block mask for this region. 8347 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8348 8349 // Build the triangular if-then region. 8350 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8351 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8352 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8353 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8354 auto *PHIRecipe = Instr->getType()->isVoidTy() 8355 ? nullptr 8356 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8357 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8358 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8359 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8360 8361 // Note: first set Entry as region entry and then connect successors starting 8362 // from it in order, to propagate the "parent" of each VPBasicBlock. 8363 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8364 VPBlockUtils::connectBlocks(Pred, Exit); 8365 8366 return Region; 8367 } 8368 8369 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8370 VFRange &Range, 8371 VPlanPtr &Plan) { 8372 // First, check for specific widening recipes that deal with calls, memory 8373 // operations, inductions and Phi nodes. 
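  // The checks below are ordered from most to least specific; if none of them
  // produces a recipe, the instruction falls through to the generic widening
  // attempt at the end, and a nullptr result tells the caller to replicate it
  // instead.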
  if (auto *CI = dyn_cast<CallInst>(Instr))
    return tryToWidenCall(CI, Range, *Plan);

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return tryToWidenMemory(Instr, Range, Plan);

  VPRecipeBase *Recipe;
  if (auto Phi = dyn_cast<PHINode>(Instr)) {
    if (Phi->getParent() != OrigLoop->getHeader())
      return tryToBlend(Phi, Plan);
    if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan)))
      return Recipe;

    if (Legal->isReductionVariable(Phi)) {
      RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
      VPValue *StartV =
          Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue());
      return new VPWidenPHIRecipe(Phi, RdxDesc, *StartV);
    }

    return new VPWidenPHIRecipe(Phi);
  }

  if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
                                    cast<TruncInst>(Instr), Range, *Plan)))
    return Recipe;

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
                                OrigLoop);

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
                                   InvariantCond);
  }

  return tryToWiden(Instr, *Plan);
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const DenseMap<Instruction *, Instruction *> &SinkAfter) {

  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of vector value.
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto applyIG = [IG, this](ElementCount VF) -> bool {
      return (VF.isVector() && // Query is illegal for VF == 1
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
    if (!getDecisionAndClampRange(applyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
    for (unsigned i = 0; i < IG->getFactor(); i++)
      if (Instruction *Member = IG->getMember(i))
        RecipeBuilder.recordRecipeOf(Member);
  }

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
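  // The pre-entry block only anchors insertBlockAfter while the plan is being
  // populated; it is disconnected and deleted again once the real entry block
  // is known (see "Discard the empty dummy pre-entry VPBasicBlock" below).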
  auto Plan = std::make_unique<VPlan>();
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  Plan->setEntry(VPBB);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
      if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
        continue;

      if (auto Recipe =
              RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
        for (auto *Def : Recipe->definedValues()) {
          auto *UV = Def->getUnderlyingValue();
          Plan->addVPValue(UV, Def);
        }

        RecipeBuilder.setRecipe(Instr, Recipe);
        VPBB->appendRecipe(Recipe);
        continue;
      }

      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
      VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
          Instr, Range, VPBB, PredInst2Recipe, Plan);
      if (NextVPBB != VPBB) {
        VPBB = NextVPBB;
        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                    : "");
      }
    }
  }

  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one, VPBB, reflecting
  // original basic blocks with no recipes.
  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
  VPBlockUtils::disconnectBlocks(PreEntry, Entry);
  delete PreEntry;

  // ---------------------------------------------------------------------------
  // Transform initial VPlan: Apply previously taken decisions, in order, to
  // bring the VPlan to its final state.
  // ---------------------------------------------------------------------------

  // Apply Sink-After legal constraints.
  for (auto &Entry : SinkAfter) {
    VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
    VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
    // If the target is in a replication region, make sure to move Sink to the
    // block after it, not into the replication region itself.
8587 if (auto *Region = 8588 dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { 8589 if (Region->isReplicator()) { 8590 assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); 8591 VPBasicBlock *NextBlock = 8592 cast<VPBasicBlock>(Region->getSuccessors().front()); 8593 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8594 continue; 8595 } 8596 } 8597 Sink->moveAfter(Target); 8598 } 8599 8600 // Interleave memory: for each Interleave Group we marked earlier as relevant 8601 // for this VPlan, replace the Recipes widening its memory instructions with a 8602 // single VPInterleaveRecipe at its insertion point. 8603 for (auto IG : InterleaveGroups) { 8604 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8605 RecipeBuilder.getRecipe(IG->getInsertPos())); 8606 SmallVector<VPValue *, 4> StoredValues; 8607 for (unsigned i = 0; i < IG->getFactor(); ++i) 8608 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 8609 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 8610 8611 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8612 Recipe->getMask()); 8613 VPIG->insertBefore(Recipe); 8614 unsigned J = 0; 8615 for (unsigned i = 0; i < IG->getFactor(); ++i) 8616 if (Instruction *Member = IG->getMember(i)) { 8617 if (!Member->getType()->isVoidTy()) { 8618 VPValue *OriginalV = Plan->getVPValue(Member); 8619 Plan->removeVPValueFor(Member); 8620 Plan->addVPValue(Member, VPIG->getVPValue(J)); 8621 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8622 J++; 8623 } 8624 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8625 } 8626 } 8627 8628 // Adjust the recipes for any inloop reductions. 8629 if (Range.Start.isVector()) 8630 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 8631 8632 // Finally, if tail is folded by masking, introduce selects between the phi 8633 // and the live-out instruction of each reduction, at the end of the latch. 8634 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 8635 Builder.setInsertPoint(VPBB); 8636 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 8637 for (auto &Reduction : Legal->getReductionVars()) { 8638 if (CM.isInLoopReduction(Reduction.first)) 8639 continue; 8640 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 8641 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 8642 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 8643 } 8644 } 8645 8646 std::string PlanName; 8647 raw_string_ostream RSO(PlanName); 8648 ElementCount VF = Range.Start; 8649 Plan->addVF(VF); 8650 RSO << "Initial VPlan for VF={" << VF; 8651 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 8652 Plan->addVF(VF); 8653 RSO << "," << VF; 8654 } 8655 RSO << "},UF>=1"; 8656 RSO.flush(); 8657 Plan->setName(PlanName); 8658 8659 return Plan; 8660 } 8661 8662 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8663 // Outer loop handling: They may require CFG and instruction level 8664 // transformations before even evaluating whether vectorization is profitable. 8665 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8666 // the vectorization pipeline. 
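  // Note that no cost modelling is done here: every VF in the range is simply
  // recorded in the plan, and the factor actually used is the one returned by
  // planInVPlanNativePath.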
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan.
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG.
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
       VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running the transformation to recipes until masked code generation
    // in the VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanTransforms::VPInstructionsToVPRecipes(
      OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
  return Plan;
}

// Adjust the recipes for any in-loop reductions. The chain of instructions
// leading from the loop exit instruction back to the phi needs to be converted
// to reductions, with one operand being vector and the other being the scalar
// reduction chain.
void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For min/max the chain will be the select instructions.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSelectSC");
        FirstOpId = 1;
      } else {
        assert(isa<VPWidenRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSC");
        FirstOpId = 0;
      }
      unsigned VecOpId =
          R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
      VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));

      auto *CondOp = CM.foldTailByMasking()
                         ?
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 8733 : nullptr; 8734 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 8735 &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI); 8736 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 8737 Plan->removeVPValueFor(R); 8738 Plan->addVPValue(R, RedRecipe); 8739 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 8740 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 8741 WidenRecipe->eraseFromParent(); 8742 8743 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 8744 VPRecipeBase *CompareRecipe = 8745 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 8746 assert(isa<VPWidenRecipe>(CompareRecipe) && 8747 "Expected to replace a VPWidenSC"); 8748 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 8749 "Expected no remaining users"); 8750 CompareRecipe->eraseFromParent(); 8751 } 8752 Chain = R; 8753 } 8754 } 8755 } 8756 8757 Value* LoopVectorizationPlanner::VPCallbackILV:: 8758 getOrCreateVectorValues(Value *V, unsigned Part) { 8759 return ILV.getOrCreateVectorValue(V, Part); 8760 } 8761 8762 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 8763 Value *V, const VPIteration &Instance) { 8764 return ILV.getOrCreateScalarValue(V, Instance); 8765 } 8766 8767 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 8768 VPSlotTracker &SlotTracker) const { 8769 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 8770 IG->getInsertPos()->printAsOperand(O, false); 8771 O << ", "; 8772 getAddr()->printAsOperand(O, SlotTracker); 8773 VPValue *Mask = getMask(); 8774 if (Mask) { 8775 O << ", "; 8776 Mask->printAsOperand(O, SlotTracker); 8777 } 8778 for (unsigned i = 0; i < IG->getFactor(); ++i) 8779 if (Instruction *I = IG->getMember(i)) 8780 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 8781 } 8782 8783 void VPWidenCallRecipe::execute(VPTransformState &State) { 8784 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 8785 *this, State); 8786 } 8787 8788 void VPWidenSelectRecipe::execute(VPTransformState &State) { 8789 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 8790 this, *this, InvariantCond, State); 8791 } 8792 8793 void VPWidenRecipe::execute(VPTransformState &State) { 8794 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 8795 } 8796 8797 void VPWidenGEPRecipe::execute(VPTransformState &State) { 8798 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 8799 *this, State.UF, State.VF, IsPtrLoopInvariant, 8800 IsIndexLoopInvariant, State); 8801 } 8802 8803 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 8804 assert(!State.Instance && "Int or FP induction being replicated."); 8805 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 8806 Trunc); 8807 } 8808 8809 void VPWidenPHIRecipe::execute(VPTransformState &State) { 8810 Value *StartV = 8811 getStartValue() ? getStartValue()->getLiveInIRValue() : nullptr; 8812 State.ILV->widenPHIInstruction(Phi, RdxDesc, StartV, State.UF, State.VF); 8813 } 8814 8815 void VPBlendRecipe::execute(VPTransformState &State) { 8816 State.ILV->setDebugLocFromInst(State.Builder, Phi); 8817 // We know that all PHIs in non-header blocks are converted into 8818 // selects, so we don't have to worry about the insertion order and we 8819 // can just use the builder. 8820 // At this point we generate the predication tree. 
There may be 8821 // duplications since this is a simple recursive scan, but future 8822 // optimizations will clean it up. 8823 8824 unsigned NumIncoming = getNumIncomingValues(); 8825 8826 // Generate a sequence of selects of the form: 8827 // SELECT(Mask3, In3, 8828 // SELECT(Mask2, In2, 8829 // SELECT(Mask1, In1, 8830 // In0))) 8831 // Note that Mask0 is never used: lanes for which no path reaches this phi and 8832 // are essentially undef are taken from In0. 8833 InnerLoopVectorizer::VectorParts Entry(State.UF); 8834 for (unsigned In = 0; In < NumIncoming; ++In) { 8835 for (unsigned Part = 0; Part < State.UF; ++Part) { 8836 // We might have single edge PHIs (blocks) - use an identity 8837 // 'select' for the first PHI operand. 8838 Value *In0 = State.get(getIncomingValue(In), Part); 8839 if (In == 0) 8840 Entry[Part] = In0; // Initialize with the first incoming value. 8841 else { 8842 // Select between the current value and the previous incoming edge 8843 // based on the incoming mask. 8844 Value *Cond = State.get(getMask(In), Part); 8845 Entry[Part] = 8846 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 8847 } 8848 } 8849 } 8850 for (unsigned Part = 0; Part < State.UF; ++Part) 8851 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 8852 } 8853 8854 void VPInterleaveRecipe::execute(VPTransformState &State) { 8855 assert(!State.Instance && "Interleave group being replicated."); 8856 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 8857 getStoredValues(), getMask()); 8858 } 8859 8860 void VPReductionRecipe::execute(VPTransformState &State) { 8861 assert(!State.Instance && "Reduction being replicated."); 8862 for (unsigned Part = 0; Part < State.UF; ++Part) { 8863 RecurKind Kind = RdxDesc->getRecurrenceKind(); 8864 Value *NewVecOp = State.get(getVecOp(), Part); 8865 if (VPValue *Cond = getCondOp()) { 8866 Value *NewCond = State.get(Cond, Part); 8867 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 8868 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 8869 Kind, VecTy->getElementType()); 8870 Constant *IdenVec = 8871 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 8872 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 8873 NewVecOp = Select; 8874 } 8875 Value *NewRed = 8876 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 8877 Value *PrevInChain = State.get(getChainOp(), Part); 8878 Value *NextInChain; 8879 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 8880 NextInChain = 8881 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 8882 NewRed, PrevInChain); 8883 } else { 8884 NextInChain = State.Builder.CreateBinOp( 8885 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 8886 PrevInChain); 8887 } 8888 State.set(this, getUnderlyingInstr(), NextInChain, Part); 8889 } 8890 } 8891 8892 void VPReplicateRecipe::execute(VPTransformState &State) { 8893 if (State.Instance) { // Generate a single instance. 8894 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 8895 State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, 8896 *State.Instance, IsPredicated, State); 8897 // Insert scalar instance packing it into a vector. 8898 if (AlsoPack && State.VF.isVector()) { 8899 // If we're constructing lane 0, initialize to start from poison. 
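      // Packing then overwrites the poison vector one scalar lane at a time.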
      if (State.Instance->Lane == 0) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.ValueMap.setVectorValue(getUnderlyingInstr(),
                                      State.Instance->Part, Poison);
      }
      State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
                                           *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
                                      IsPredicated, State);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");

  // By current pack/unpack logic we need to generate only a single phi node:
  // if a vector value for the predicated instruction exists at this point it
  // means the instruction has vector users only, and a phi for the vector
  // value is needed. In this case the recipe of the predicated instruction is
  // marked to also do that packing, thereby "hoisting" the insert-element
  // sequence. Otherwise, a phi node for the scalar value is needed.
8965 unsigned Part = State.Instance->Part; 8966 Instruction *PredInst = 8967 cast<Instruction>(getOperand(0)->getUnderlyingValue()); 8968 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 8969 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 8970 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 8971 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 8972 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 8973 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 8974 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 8975 } else { 8976 Type *PredInstType = PredInst->getType(); 8977 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 8978 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB); 8979 Phi->addIncoming(ScalarPredInst, PredicatedBB); 8980 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 8981 } 8982 } 8983 8984 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 8985 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 8986 State.ILV->vectorizeMemoryInstruction(&Ingredient, State, 8987 StoredValue ? nullptr : getVPValue(), 8988 getAddr(), StoredValue, getMask()); 8989 } 8990 8991 // Determine how to lower the scalar epilogue, which depends on 1) optimising 8992 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 8993 // predication, and 4) a TTI hook that analyses whether the loop is suitable 8994 // for predication. 8995 static ScalarEpilogueLowering getScalarEpilogueLowering( 8996 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 8997 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 8998 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 8999 LoopVectorizationLegality &LVL) { 9000 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9001 // don't look at hints or options, and don't request a scalar epilogue. 9002 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9003 // LoopAccessInfo (due to code dependency and not being able to reliably get 9004 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9005 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9006 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9007 // back to the old way and vectorize with versioning when forced. See D81345.) 
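  // In short: hasOptSize always suppresses the scalar epilogue, while the
  // profile-guided size heuristic only does so when vectorization has not been
  // explicitly forced.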
9008 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9009 PGSOQueryType::IRPass) && 9010 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9011 return CM_ScalarEpilogueNotAllowedOptSize; 9012 9013 // 2) If set, obey the directives 9014 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9015 switch (PreferPredicateOverEpilogue) { 9016 case PreferPredicateTy::ScalarEpilogue: 9017 return CM_ScalarEpilogueAllowed; 9018 case PreferPredicateTy::PredicateElseScalarEpilogue: 9019 return CM_ScalarEpilogueNotNeededUsePredicate; 9020 case PreferPredicateTy::PredicateOrDontVectorize: 9021 return CM_ScalarEpilogueNotAllowedUsePredicate; 9022 }; 9023 } 9024 9025 // 3) If set, obey the hints 9026 switch (Hints.getPredicate()) { 9027 case LoopVectorizeHints::FK_Enabled: 9028 return CM_ScalarEpilogueNotNeededUsePredicate; 9029 case LoopVectorizeHints::FK_Disabled: 9030 return CM_ScalarEpilogueAllowed; 9031 }; 9032 9033 // 4) if the TTI hook indicates this is profitable, request predication. 9034 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9035 LVL.getLAI())) 9036 return CM_ScalarEpilogueNotNeededUsePredicate; 9037 9038 return CM_ScalarEpilogueAllowed; 9039 } 9040 9041 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V, 9042 unsigned Part) { 9043 set(Def, V, Part); 9044 ILV->setVectorValue(IRDef, Part, V); 9045 } 9046 9047 // Process the loop in the VPlan-native vectorization path. This path builds 9048 // VPlan upfront in the vectorization pipeline, which allows to apply 9049 // VPlan-to-VPlan transformations from the very beginning without modifying the 9050 // input LLVM IR. 9051 static bool processLoopInVPlanNativePath( 9052 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9053 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9054 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9055 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9056 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 9057 9058 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9059 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9060 return false; 9061 } 9062 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9063 Function *F = L->getHeader()->getParent(); 9064 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9065 9066 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9067 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 9068 9069 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9070 &Hints, IAI); 9071 // Use the planner for outer loop vectorization. 9072 // TODO: CM is not used at this point inside the planner. Turn CM into an 9073 // optional argument if we don't need it in the future. 9074 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 9075 9076 // Get user vectorization factor. 9077 ElementCount UserVF = Hints.getWidth(); 9078 9079 // Plan how to best vectorize, return the best VF and its cost. 9080 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9081 9082 // If we are stress testing VPlan builds, do not attempt to generate vector 9083 // code. Masked vector code generation support will follow soon. 9084 // Also, do not attempt to vectorize if no vector code will be produced. 
9085 if (VPlanBuildStressTest || EnableVPlanPredication || 9086 VectorizationFactor::Disabled() == VF) 9087 return false; 9088 9089 LVP.setBestPlan(VF.Width, 1); 9090 9091 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 9092 &CM, BFI, PSI); 9093 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9094 << L->getHeader()->getParent()->getName() << "\"\n"); 9095 LVP.executePlan(LB, DT); 9096 9097 // Mark the loop as already vectorized to avoid vectorizing again. 9098 Hints.setAlreadyVectorized(); 9099 9100 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9101 return true; 9102 } 9103 9104 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9105 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9106 !EnableLoopInterleaving), 9107 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9108 !EnableLoopVectorization) {} 9109 9110 bool LoopVectorizePass::processLoop(Loop *L) { 9111 assert((EnableVPlanNativePath || L->isInnermost()) && 9112 "VPlan-native path is not enabled. Only process inner loops."); 9113 9114 #ifndef NDEBUG 9115 const std::string DebugLocStr = getDebugLocString(L); 9116 #endif /* NDEBUG */ 9117 9118 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 9119 << L->getHeader()->getParent()->getName() << "\" from " 9120 << DebugLocStr << "\n"); 9121 9122 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 9123 9124 LLVM_DEBUG( 9125 dbgs() << "LV: Loop hints:" 9126 << " force=" 9127 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9128 ? "disabled" 9129 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9130 ? "enabled" 9131 : "?")) 9132 << " width=" << Hints.getWidth() 9133 << " unroll=" << Hints.getInterleave() << "\n"); 9134 9135 // Function containing loop 9136 Function *F = L->getHeader()->getParent(); 9137 9138 // Looking at the diagnostic output is the only way to determine if a loop 9139 // was vectorized (other than looking at the IR or machine code), so it 9140 // is important to generate an optimization remark for each loop. Most of 9141 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9142 // generated as OptimizationRemark and OptimizationRemarkMissed are 9143 // less verbose reporting vectorized loops and unvectorized loops that may 9144 // benefit from vectorization, respectively. 9145 9146 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9147 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9148 return false; 9149 } 9150 9151 PredicatedScalarEvolution PSE(*SE, *L); 9152 9153 // Check if it is legal to vectorize the loop. 9154 LoopVectorizationRequirements Requirements(*ORE); 9155 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 9156 &Requirements, &Hints, DB, AC, BFI, PSI); 9157 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9158 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9159 Hints.emitRemarkWithHints(); 9160 return false; 9161 } 9162 9163 // Check the function attributes and profiles to find out if this function 9164 // should be optimized for size. 9165 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9166 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 9167 9168 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9169 // here. They may require CFG and instruction level transformations before 9170 // even evaluating whether vectorization is profitable. 
Since we cannot modify 9171 // the incoming IR, we need to build VPlan upfront in the vectorization 9172 // pipeline. 9173 if (!L->isInnermost()) 9174 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9175 ORE, BFI, PSI, Hints); 9176 9177 assert(L->isInnermost() && "Inner loop expected."); 9178 9179 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9180 // count by optimizing for size, to minimize overheads. 9181 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9182 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9183 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9184 << "This loop is worth vectorizing only if no scalar " 9185 << "iteration overheads are incurred."); 9186 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9187 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9188 else { 9189 LLVM_DEBUG(dbgs() << "\n"); 9190 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9191 } 9192 } 9193 9194 // Check the function attributes to see if implicit floats are allowed. 9195 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9196 // an integer loop and the vector instructions selected are purely integer 9197 // vector instructions? 9198 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9199 reportVectorizationFailure( 9200 "Can't vectorize when the NoImplicitFloat attribute is used", 9201 "loop not vectorized due to NoImplicitFloat attribute", 9202 "NoImplicitFloat", ORE, L); 9203 Hints.emitRemarkWithHints(); 9204 return false; 9205 } 9206 9207 // Check if the target supports potentially unsafe FP vectorization. 9208 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9209 // for the target we're vectorizing for, to make sure none of the 9210 // additional fp-math flags can help. 9211 if (Hints.isPotentiallyUnsafe() && 9212 TTI->isFPVectorizationPotentiallyUnsafe()) { 9213 reportVectorizationFailure( 9214 "Potentially unsafe FP op prevents vectorization", 9215 "loop not vectorized due to unsafe FP support.", 9216 "UnsafeFP", ORE, L); 9217 Hints.emitRemarkWithHints(); 9218 return false; 9219 } 9220 9221 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 9222 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 9223 9224 // If an override option has been passed in for interleaved accesses, use it. 9225 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 9226 UseInterleaved = EnableInterleavedMemAccesses; 9227 9228 // Analyze interleaved memory accesses. 9229 if (UseInterleaved) { 9230 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 9231 } 9232 9233 // Use the cost model. 9234 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 9235 F, &Hints, IAI); 9236 CM.collectValuesToIgnore(); 9237 9238 // Use the planner for vectorization. 9239 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); 9240 9241 // Get user vectorization factor and interleave count. 9242 ElementCount UserVF = Hints.getWidth(); 9243 unsigned UserIC = Hints.getInterleave(); 9244 9245 // Plan how to best vectorize, return the best VF and its cost. 9246 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 9247 9248 VectorizationFactor VF = VectorizationFactor::Disabled(); 9249 unsigned IC = 1; 9250 9251 if (MaybeVF) { 9252 VF = *MaybeVF; 9253 // Select the interleave count. 
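    // Note that a user-provided interleave count still overrides this choice
    // further down.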
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being
    // explicitly requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if the user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
9314 ORE->emit([&]() { 9315 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 9316 L->getStartLoc(), L->getHeader()) 9317 << VecDiagMsg.second; 9318 }); 9319 ORE->emit([&]() { 9320 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 9321 L->getStartLoc(), L->getHeader()) 9322 << IntDiagMsg.second; 9323 }); 9324 return false; 9325 } else if (!VectorizeLoop && InterleaveLoop) { 9326 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9327 ORE->emit([&]() { 9328 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 9329 L->getStartLoc(), L->getHeader()) 9330 << VecDiagMsg.second; 9331 }); 9332 } else if (VectorizeLoop && !InterleaveLoop) { 9333 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9334 << ") in " << DebugLocStr << '\n'); 9335 ORE->emit([&]() { 9336 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 9337 L->getStartLoc(), L->getHeader()) 9338 << IntDiagMsg.second; 9339 }); 9340 } else if (VectorizeLoop && InterleaveLoop) { 9341 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9342 << ") in " << DebugLocStr << '\n'); 9343 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9344 } 9345 9346 LVP.setBestPlan(VF.Width, IC); 9347 9348 using namespace ore; 9349 bool DisableRuntimeUnroll = false; 9350 MDNode *OrigLoopID = L->getLoopID(); 9351 9352 if (!VectorizeLoop) { 9353 assert(IC > 1 && "interleave count should not be 1 or 0"); 9354 // If we decided that it is not legal to vectorize the loop, then 9355 // interleave it. 9356 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, 9357 BFI, PSI); 9358 LVP.executePlan(Unroller, DT); 9359 9360 ORE->emit([&]() { 9361 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 9362 L->getHeader()) 9363 << "interleaved loop (interleaved count: " 9364 << NV("InterleaveCount", IC) << ")"; 9365 }); 9366 } else { 9367 // If we decided that it is *legal* to vectorize the loop, then do it. 9368 9369 // Consider vectorizing the epilogue too if it's profitable. 9370 VectorizationFactor EpilogueVF = 9371 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 9372 if (EpilogueVF.Width.isVector()) { 9373 9374 // The first pass vectorizes the main loop and creates a scalar epilogue 9375 // to be vectorized by executing the plan (potentially with a different 9376 // factor) again shortly afterwards. 9377 EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, 9378 EpilogueVF.Width.getKnownMinValue(), 1); 9379 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, 9380 &LVL, &CM, BFI, PSI); 9381 9382 LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); 9383 LVP.executePlan(MainILV, DT); 9384 ++LoopsVectorized; 9385 9386 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 9387 formLCSSARecursively(*L, *DT, LI, SE); 9388 9389 // Second pass vectorizes the epilogue and adjusts the control flow 9390 // edges from the first pass. 
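      // Overwriting MainLoopVF/UF with the epilogue factors below means that
      // any code in the second pass that still reads the "main" fields of EPI
      // sees the epilogue's VF and UF.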
9391 LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); 9392 EPI.MainLoopVF = EPI.EpilogueVF; 9393 EPI.MainLoopUF = EPI.EpilogueUF; 9394 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 9395 ORE, EPI, &LVL, &CM, BFI, PSI); 9396 LVP.executePlan(EpilogILV, DT); 9397 ++LoopsEpilogueVectorized; 9398 9399 if (!MainILV.areSafetyChecksAdded()) 9400 DisableRuntimeUnroll = true; 9401 } else { 9402 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 9403 &LVL, &CM, BFI, PSI); 9404 LVP.executePlan(LB, DT); 9405 ++LoopsVectorized; 9406 9407 // Add metadata to disable runtime unrolling a scalar loop when there are 9408 // no runtime checks about strides and memory. A scalar loop that is 9409 // rarely used is not worth unrolling. 9410 if (!LB.areSafetyChecksAdded()) 9411 DisableRuntimeUnroll = true; 9412 } 9413 9414 // Report the vectorization decision. 9415 ORE->emit([&]() { 9416 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 9417 L->getHeader()) 9418 << "vectorized loop (vectorization width: " 9419 << NV("VectorizationFactor", VF.Width) 9420 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 9421 }); 9422 } 9423 9424 Optional<MDNode *> RemainderLoopID = 9425 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 9426 LLVMLoopVectorizeFollowupEpilogue}); 9427 if (RemainderLoopID.hasValue()) { 9428 L->setLoopID(RemainderLoopID.getValue()); 9429 } else { 9430 if (DisableRuntimeUnroll) 9431 AddRuntimeUnrollDisableMetaData(L); 9432 9433 // Mark the loop as already vectorized to avoid vectorizing again. 9434 Hints.setAlreadyVectorized(); 9435 } 9436 9437 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9438 return true; 9439 } 9440 9441 LoopVectorizeResult LoopVectorizePass::runImpl( 9442 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 9443 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 9444 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 9445 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 9446 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 9447 SE = &SE_; 9448 LI = &LI_; 9449 TTI = &TTI_; 9450 DT = &DT_; 9451 BFI = &BFI_; 9452 TLI = TLI_; 9453 AA = &AA_; 9454 AC = &AC_; 9455 GetLAA = &GetLAA_; 9456 DB = &DB_; 9457 ORE = &ORE_; 9458 PSI = PSI_; 9459 9460 // Don't attempt if 9461 // 1. the target claims to have no vector registers, and 9462 // 2. interleaving won't help ILP. 9463 // 9464 // The second condition is necessary because, even if the target has no 9465 // vector registers, loop vectorization may still enable scalar 9466 // interleaving. 9467 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 9468 TTI->getMaxInterleaveFactor(1) < 2) 9469 return LoopVectorizeResult(false, false); 9470 9471 bool Changed = false, CFGChanged = false; 9472 9473 // The vectorizer requires loops to be in simplified form. 9474 // Since simplification may add new inner loops, it has to run before the 9475 // legality and profitability checks. This means running the loop vectorizer 9476 // will simplify all loops, regardless of whether anything end up being 9477 // vectorized. 9478 for (auto &L : *LI) 9479 Changed |= CFGChanged |= 9480 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 9481 9482 // Build up a worklist of inner-loops to vectorize. This is necessary as 9483 // the act of vectorizing or partially unrolling a loop creates new loops 9484 // and can invalidate iterators across the loops. 
9485 SmallVector<Loop *, 8> Worklist; 9486 9487 for (Loop *L : *LI) 9488 collectSupportedLoops(*L, LI, ORE, Worklist); 9489 9490 LoopsAnalyzed += Worklist.size(); 9491 9492 // Now walk the identified inner loops. 9493 while (!Worklist.empty()) { 9494 Loop *L = Worklist.pop_back_val(); 9495 9496 // For the inner loops we actually process, form LCSSA to simplify the 9497 // transform. 9498 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 9499 9500 Changed |= CFGChanged |= processLoop(L); 9501 } 9502 9503 // Process each loop nest in the function. 9504 return LoopVectorizeResult(Changed, CFGChanged); 9505 } 9506 9507 PreservedAnalyses LoopVectorizePass::run(Function &F, 9508 FunctionAnalysisManager &AM) { 9509 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 9510 auto &LI = AM.getResult<LoopAnalysis>(F); 9511 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 9512 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 9513 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 9514 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 9515 auto &AA = AM.getResult<AAManager>(F); 9516 auto &AC = AM.getResult<AssumptionAnalysis>(F); 9517 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 9518 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 9519 MemorySSA *MSSA = EnableMSSALoopDependency 9520 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() 9521 : nullptr; 9522 9523 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 9524 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 9525 [&](Loop &L) -> const LoopAccessInfo & { 9526 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 9527 TLI, TTI, nullptr, MSSA}; 9528 return LAM.getResult<LoopAccessAnalysis>(L, AR); 9529 }; 9530 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 9531 ProfileSummaryInfo *PSI = 9532 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 9533 LoopVectorizeResult Result = 9534 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 9535 if (!Result.MadeAnyChange) 9536 return PreservedAnalyses::all(); 9537 PreservedAnalyses PA; 9538 9539 // We currently do not preserve loopinfo/dominator analyses with outer loop 9540 // vectorization. Until this is addressed, mark these analyses as preserved 9541 // only for non-VPlan-native path. 9542 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 9543 if (!EnableVPlanNativePath) { 9544 PA.preserve<LoopAnalysis>(); 9545 PA.preserve<DominatorTreeAnalysis>(); 9546 } 9547 PA.preserve<BasicAA>(); 9548 PA.preserve<GlobalsAA>(); 9549 if (!Result.MadeCFGChange) 9550 PA.preserveSet<CFGAnalyses>(); 9551 return PA; 9552 } 9553