1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops 10 // and generates target-independent LLVM-IR. 11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs 12 // of instructions in order to estimate the profitability of vectorization. 13 // 14 // The loop vectorizer combines consecutive loop iterations into a single 15 // 'wide' iteration. After this transformation the index is incremented 16 // by the SIMD vector width, and not by one. 17 // 18 // This pass has three parts: 19 // 1. The main loop pass that drives the different parts. 20 // 2. LoopVectorizationLegality - A unit that checks for the legality 21 // of the vectorization. 22 // 3. InnerLoopVectorizer - A unit that performs the actual 23 // widening of instructions. 24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability 25 // of vectorization. It decides on the optimal vector width, which 26 // can be one, if vectorization is not profitable. 27 // 28 // There is a development effort going on to migrate loop vectorizer to the 29 // VPlan infrastructure and to introduce outer loop vectorization support (see 30 // docs/Proposal/VectorizationPlan.rst and 31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this 32 // purpose, we temporarily introduced the VPlan-native vectorization path: an 33 // alternative vectorization path that is natively implemented on top of the 34 // VPlan infrastructure. See EnableVPlanNativePath for enabling. 35 // 36 //===----------------------------------------------------------------------===// 37 // 38 // The reduction-variable vectorization is based on the paper: 39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. 40 // 41 // Variable uniformity checks are inspired by: 42 // Karrenberg, R. and Hack, S. Whole Function Vectorization. 43 // 44 // The interleaved access vectorization is based on the paper: 45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved 46 // Data for SIMD 47 // 48 // Other ideas/concepts are from: 49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. 50 // 51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of 52 // Vectorizing Compilers. 
53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanHCFGBuilder.h" 61 #include "VPlanPredicator.h" 62 #include "VPlanTransforms.h" 63 #include "llvm/ADT/APInt.h" 64 #include "llvm/ADT/ArrayRef.h" 65 #include "llvm/ADT/DenseMap.h" 66 #include "llvm/ADT/DenseMapInfo.h" 67 #include "llvm/ADT/Hashing.h" 68 #include "llvm/ADT/MapVector.h" 69 #include "llvm/ADT/None.h" 70 #include "llvm/ADT/Optional.h" 71 #include "llvm/ADT/STLExtras.h" 72 #include "llvm/ADT/SetVector.h" 73 #include "llvm/ADT/SmallPtrSet.h" 74 #include "llvm/ADT/SmallVector.h" 75 #include "llvm/ADT/Statistic.h" 76 #include "llvm/ADT/StringRef.h" 77 #include "llvm/ADT/Twine.h" 78 #include "llvm/ADT/iterator_range.h" 79 #include "llvm/Analysis/AssumptionCache.h" 80 #include "llvm/Analysis/BasicAliasAnalysis.h" 81 #include "llvm/Analysis/BlockFrequencyInfo.h" 82 #include "llvm/Analysis/CFG.h" 83 #include "llvm/Analysis/CodeMetrics.h" 84 #include "llvm/Analysis/DemandedBits.h" 85 #include "llvm/Analysis/GlobalsModRef.h" 86 #include "llvm/Analysis/LoopAccessAnalysis.h" 87 #include "llvm/Analysis/LoopAnalysisManager.h" 88 #include "llvm/Analysis/LoopInfo.h" 89 #include "llvm/Analysis/LoopIterator.h" 90 #include "llvm/Analysis/MemorySSA.h" 91 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 92 #include "llvm/Analysis/ProfileSummaryInfo.h" 93 #include "llvm/Analysis/ScalarEvolution.h" 94 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 95 #include "llvm/Analysis/TargetLibraryInfo.h" 96 #include "llvm/Analysis/TargetTransformInfo.h" 97 #include "llvm/Analysis/VectorUtils.h" 98 #include "llvm/IR/Attributes.h" 99 #include "llvm/IR/BasicBlock.h" 100 #include "llvm/IR/CFG.h" 101 #include "llvm/IR/Constant.h" 102 #include "llvm/IR/Constants.h" 103 #include "llvm/IR/DataLayout.h" 104 #include "llvm/IR/DebugInfoMetadata.h" 105 #include "llvm/IR/DebugLoc.h" 106 #include "llvm/IR/DerivedTypes.h" 107 #include "llvm/IR/DiagnosticInfo.h" 108 #include "llvm/IR/Dominators.h" 109 #include "llvm/IR/Function.h" 110 #include "llvm/IR/IRBuilder.h" 111 #include "llvm/IR/InstrTypes.h" 112 #include "llvm/IR/Instruction.h" 113 #include "llvm/IR/Instructions.h" 114 #include "llvm/IR/IntrinsicInst.h" 115 #include "llvm/IR/Intrinsics.h" 116 #include "llvm/IR/LLVMContext.h" 117 #include "llvm/IR/Metadata.h" 118 #include "llvm/IR/Module.h" 119 #include "llvm/IR/Operator.h" 120 #include "llvm/IR/Type.h" 121 #include "llvm/IR/Use.h" 122 #include "llvm/IR/User.h" 123 #include "llvm/IR/Value.h" 124 #include "llvm/IR/ValueHandle.h" 125 #include "llvm/IR/Verifier.h" 126 #include "llvm/InitializePasses.h" 127 #include "llvm/Pass.h" 128 #include "llvm/Support/Casting.h" 129 #include "llvm/Support/CommandLine.h" 130 #include "llvm/Support/Compiler.h" 131 #include "llvm/Support/Debug.h" 132 #include "llvm/Support/ErrorHandling.h" 133 #include "llvm/Support/InstructionCost.h" 134 #include "llvm/Support/MathExtras.h" 135 #include "llvm/Support/raw_ostream.h" 136 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 137 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 138 #include "llvm/Transforms/Utils/LoopSimplify.h" 139 #include "llvm/Transforms/Utils/LoopUtils.h" 140 #include "llvm/Transforms/Utils/LoopVersioning.h" 141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 142 #include "llvm/Transforms/Utils/SizeOpts.h" 143 #include 
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 202 // that predication is preferred, and this lists all options. I.e., the 203 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 204 // and predicate the instructions accordingly. 
If tail-folding fails, there are 205 // different fallback strategies depending on these values: 206 namespace PreferPredicateTy { 207 enum Option { 208 ScalarEpilogue = 0, 209 PredicateElseScalarEpilogue, 210 PredicateOrDontVectorize 211 }; 212 } // namespace PreferPredicateTy 213 214 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( 215 "prefer-predicate-over-epilogue", 216 cl::init(PreferPredicateTy::ScalarEpilogue), 217 cl::Hidden, 218 cl::desc("Tail-folding and predication preferences over creating a scalar " 219 "epilogue loop."), 220 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, 221 "scalar-epilogue", 222 "Don't tail-predicate loops, create scalar epilogue"), 223 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, 224 "predicate-else-scalar-epilogue", 225 "prefer tail-folding, create scalar epilogue if tail " 226 "folding fails."), 227 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, 228 "predicate-dont-vectorize", 229 "prefers tail-folding, don't attempt vectorization if " 230 "tail-folding fails."))); 231 232 static cl::opt<bool> MaximizeBandwidth( 233 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, 234 cl::desc("Maximize bandwidth when selecting vectorization factor which " 235 "will be determined by the smallest type in loop.")); 236 237 static cl::opt<bool> EnableInterleavedMemAccesses( 238 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, 239 cl::desc("Enable vectorization on interleaved memory accesses in a loop")); 240 241 /// An interleave-group may need masking if it resides in a block that needs 242 /// predication, or in order to mask away gaps. 243 static cl::opt<bool> EnableMaskedInterleavedMemAccesses( 244 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, 245 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); 246 247 static cl::opt<unsigned> TinyTripCountInterleaveThreshold( 248 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, 249 cl::desc("We don't interleave loops with a estimated constant trip count " 250 "below this number")); 251 252 static cl::opt<unsigned> ForceTargetNumScalarRegs( 253 "force-target-num-scalar-regs", cl::init(0), cl::Hidden, 254 cl::desc("A flag that overrides the target's number of scalar registers.")); 255 256 static cl::opt<unsigned> ForceTargetNumVectorRegs( 257 "force-target-num-vector-regs", cl::init(0), cl::Hidden, 258 cl::desc("A flag that overrides the target's number of vector registers.")); 259 260 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( 261 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, 262 cl::desc("A flag that overrides the target's max interleave factor for " 263 "scalar loops.")); 264 265 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( 266 "force-target-max-vector-interleave", cl::init(0), cl::Hidden, 267 cl::desc("A flag that overrides the target's max interleave factor for " 268 "vectorized loops.")); 269 270 static cl::opt<unsigned> ForceTargetInstructionCost( 271 "force-target-instruction-cost", cl::init(0), cl::Hidden, 272 cl::desc("A flag that overrides the target's expected cost for " 273 "an instruction to a single constant value. 
Mostly " 274 "useful for getting consistent testing.")); 275 276 static cl::opt<bool> ForceTargetSupportsScalableVectors( 277 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 278 cl::desc( 279 "Pretend that scalable vectors are supported, even if the target does " 280 "not support them. This flag should only be used for testing.")); 281 282 static cl::opt<unsigned> SmallLoopCost( 283 "small-loop-cost", cl::init(20), cl::Hidden, 284 cl::desc( 285 "The cost of a loop that is considered 'small' by the interleaver.")); 286 287 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 288 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 289 cl::desc("Enable the use of the block frequency analysis to access PGO " 290 "heuristics minimizing code growth in cold regions and being more " 291 "aggressive in hot regions.")); 292 293 // Runtime interleave loops for load/store throughput. 294 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 295 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 296 cl::desc( 297 "Enable runtime interleaving until load/store ports are saturated")); 298 299 /// Interleave small loops with scalar reductions. 300 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 301 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 302 cl::desc("Enable interleaving for loops with small iteration counts that " 303 "contain scalar reductions to expose ILP.")); 304 305 /// The number of stores in a loop that are allowed to need predication. 306 static cl::opt<unsigned> NumberOfStoresToPredicate( 307 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 308 cl::desc("Max number of stores to be predicated behind an if.")); 309 310 static cl::opt<bool> EnableIndVarRegisterHeur( 311 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 312 cl::desc("Count the induction variable only once when interleaving")); 313 314 static cl::opt<bool> EnableCondStoresVectorization( 315 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 316 cl::desc("Enable if predication of stores during vectorization.")); 317 318 static cl::opt<unsigned> MaxNestedScalarReductionIC( 319 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 320 cl::desc("The maximum interleave count to use when interleaving a scalar " 321 "reduction in a nested loop.")); 322 323 static cl::opt<bool> 324 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 325 cl::Hidden, 326 cl::desc("Prefer in-loop vector reductions, " 327 "overriding the targets preference.")); 328 329 static cl::opt<bool> PreferPredicatedReductionSelect( 330 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 331 cl::desc( 332 "Prefer predicating a reduction operation over an after loop select.")); 333 334 cl::opt<bool> EnableVPlanNativePath( 335 "enable-vplan-native-path", cl::init(false), cl::Hidden, 336 cl::desc("Enable VPlan-native vectorization path with " 337 "support for outer loop vectorization.")); 338 339 // FIXME: Remove this switch once we have divergence analysis. Currently we 340 // assume divergent non-backedge branches when this switch is true. 341 cl::opt<bool> EnableVPlanPredication( 342 "enable-vplan-predication", cl::init(false), cl::Hidden, 343 cl::desc("Enable VPlan-native vectorization path predicator with " 344 "support for outer loop vectorization.")); 345 346 // This flag enables the stress testing of the VPlan H-CFG construction in the 347 // VPlan-native vectorization path. 
It must be used in conjuction with 348 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the 349 // verification of the H-CFGs built. 350 static cl::opt<bool> VPlanBuildStressTest( 351 "vplan-build-stress-test", cl::init(false), cl::Hidden, 352 cl::desc( 353 "Build VPlan for every supported loop nest in the function and bail " 354 "out right after the build (stress test the VPlan H-CFG construction " 355 "in the VPlan-native vectorization path).")); 356 357 cl::opt<bool> llvm::EnableLoopInterleaving( 358 "interleave-loops", cl::init(true), cl::Hidden, 359 cl::desc("Enable loop interleaving in Loop vectorization passes")); 360 cl::opt<bool> llvm::EnableLoopVectorization( 361 "vectorize-loops", cl::init(true), cl::Hidden, 362 cl::desc("Run the Loop vectorization passes")); 363 364 /// A helper function that returns the type of loaded or stored value. 365 static Type *getMemInstValueType(Value *I) { 366 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 367 "Expected Load or Store instruction"); 368 if (auto *LI = dyn_cast<LoadInst>(I)) 369 return LI->getType(); 370 return cast<StoreInst>(I)->getValueOperand()->getType(); 371 } 372 373 /// A helper function that returns true if the given type is irregular. The 374 /// type is irregular if its allocated size doesn't equal the store size of an 375 /// element of the corresponding vector type at the given vectorization factor. 376 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) { 377 // Determine if an array of VF elements of type Ty is "bitcast compatible" 378 // with a <VF x Ty> vector. 379 if (VF.isVector()) { 380 auto *VectorTy = VectorType::get(Ty, VF); 381 return TypeSize::get(VF.getKnownMinValue() * 382 DL.getTypeAllocSize(Ty).getFixedValue(), 383 VF.isScalable()) != DL.getTypeStoreSize(VectorTy); 384 } 385 386 // If the vectorization factor is one, we just check if an array of type Ty 387 // requires padding between elements. 388 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); 389 } 390 391 /// A helper function that returns the reciprocal of the block probability of 392 /// predicated blocks. If we return X, we are assuming the predicated block 393 /// will execute once for every X iterations of the loop header. 394 /// 395 /// TODO: We should use actual block probability here, if available. Currently, 396 /// we always assume predicated blocks have a 50% chance of executing. 397 static unsigned getReciprocalPredBlockProb() { return 2; } 398 399 /// A helper function that adds a 'fast' flag to floating-point operations. 400 static Value *addFastMathFlag(Value *V) { 401 if (isa<FPMathOperator>(V)) 402 cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast()); 403 return V; 404 } 405 406 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) { 407 if (isa<FPMathOperator>(V)) 408 cast<Instruction>(V)->setFastMathFlags(FMF); 409 return V; 410 } 411 412 /// A helper function that returns an integer or floating-point constant with 413 /// value C. 414 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { 415 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) 416 : ConstantFP::get(Ty, C); 417 } 418 419 /// Returns "best known" trip count for the specified loop \p L as defined by 420 /// the following procedure: 421 /// 1) Returns exact trip count if it is known. 422 /// 2) Returns expected trip count according to profile data if any. 423 /// 3) Returns upper bound estimate if it is known. 424 /// 4) Returns None if all of the above failed. 
425 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { 426 // Check if exact trip count is known. 427 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) 428 return ExpectedTC; 429 430 // Check if there is an expected trip count available from profile data. 431 if (LoopVectorizeWithBlockFrequency) 432 if (auto EstimatedTC = getLoopEstimatedTripCount(L)) 433 return EstimatedTC; 434 435 // Check if upper bound estimate is known. 436 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) 437 return ExpectedTC; 438 439 return None; 440 } 441 442 namespace llvm { 443 444 /// InnerLoopVectorizer vectorizes loops which contain only one basic 445 /// block to a specified vectorization factor (VF). 446 /// This class performs the widening of scalars into vectors, or multiple 447 /// scalars. This class also implements the following features: 448 /// * It inserts an epilogue loop for handling loops that don't have iteration 449 /// counts that are known to be a multiple of the vectorization factor. 450 /// * It handles the code generation for reduction variables. 451 /// * Scalarization (implementation using scalars) of un-vectorizable 452 /// instructions. 453 /// InnerLoopVectorizer does not perform any vectorization-legality 454 /// checks, and relies on the caller to check for the different legality 455 /// aspects. The InnerLoopVectorizer relies on the 456 /// LoopVectorizationLegality class to provide information about the induction 457 /// and reduction variables that were found to a given vectorization factor. 458 class InnerLoopVectorizer { 459 public: 460 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 461 LoopInfo *LI, DominatorTree *DT, 462 const TargetLibraryInfo *TLI, 463 const TargetTransformInfo *TTI, AssumptionCache *AC, 464 OptimizationRemarkEmitter *ORE, ElementCount VecWidth, 465 unsigned UnrollFactor, LoopVectorizationLegality *LVL, 466 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 467 ProfileSummaryInfo *PSI) 468 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), 469 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), 470 Builder(PSE.getSE()->getContext()), 471 VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM), 472 BFI(BFI), PSI(PSI) { 473 // Query this against the original loop and save it here because the profile 474 // of the original loop header may change as the transformation happens. 475 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( 476 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); 477 } 478 479 virtual ~InnerLoopVectorizer() = default; 480 481 /// Create a new empty loop that will contain vectorized instructions later 482 /// on, while the old loop will be used as the scalar remainder. Control flow 483 /// is generated around the vectorized (and scalar epilogue) loops consisting 484 /// of various checks and bypasses. Return the pre-header block of the new 485 /// loop. 486 /// In the case of epilogue vectorization, this function is overriden to 487 /// handle the more complex control flow around the loops. 488 virtual BasicBlock *createVectorizedLoopSkeleton(); 489 490 /// Widen a single instruction within the innermost loop. 491 void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands, 492 VPTransformState &State); 493 494 /// Widen a single call instruction within the innermost loop. 
495 void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, 496 VPTransformState &State); 497 498 /// Widen a single select instruction within the innermost loop. 499 void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, 500 bool InvariantCond, VPTransformState &State); 501 502 /// Fix the vectorized code, taking care of header phi's, live-outs, and more. 503 void fixVectorizedLoop(); 504 505 // Return true if any runtime check is added. 506 bool areSafetyChecksAdded() { return AddedSafetyChecks; } 507 508 /// A type for vectorized values in the new loop. Each value from the 509 /// original loop, when vectorized, is represented by UF vector values in the 510 /// new unrolled loop, where UF is the unroll factor. 511 using VectorParts = SmallVector<Value *, 2>; 512 513 /// Vectorize a single GetElementPtrInst based on information gathered and 514 /// decisions taken during planning. 515 void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, 516 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, 517 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); 518 519 /// Vectorize a single PHINode in a block. This method handles the induction 520 /// variable canonicalization. It supports both VF = 1 for unrolled loops and 521 /// arbitrary length vectors. 522 void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc, 523 Value *StartV, unsigned UF, ElementCount VF); 524 525 /// A helper function to scalarize a single Instruction in the innermost loop. 526 /// Generates a sequence of scalar instances for each lane between \p MinLane 527 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 528 /// inclusive. Uses the VPValue operands from \p Operands instead of \p 529 /// Instr's operands. 530 void scalarizeInstruction(Instruction *Instr, VPUser &Operands, 531 const VPIteration &Instance, bool IfPredicateInstr, 532 VPTransformState &State); 533 534 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc 535 /// is provided, the integer induction variable will first be truncated to 536 /// the corresponding type. 537 void widenIntOrFpInduction(PHINode *IV, Value *Start, 538 TruncInst *Trunc = nullptr); 539 540 /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a 541 /// vector or scalar value on-demand if one is not yet available. When 542 /// vectorizing a loop, we visit the definition of an instruction before its 543 /// uses. When visiting the definition, we either vectorize or scalarize the 544 /// instruction, creating an entry for it in the corresponding map. (In some 545 /// cases, such as induction variables, we will create both vector and scalar 546 /// entries.) Then, as we encounter uses of the definition, we derive values 547 /// for each scalar or vector use unless such a value is already available. 548 /// For example, if we scalarize a definition and one of its uses is vector, 549 /// we build the required vector on-demand with an insertelement sequence 550 /// when visiting the use. Otherwise, if the use is scalar, we can use the 551 /// existing scalar definition. 552 /// 553 /// Return a value in the new loop corresponding to \p V from the original 554 /// loop at unroll index \p Part. If the value has already been vectorized, 555 /// the corresponding vector entry in VectorLoopValueMap is returned. 
If, 556 /// however, the value has a scalar entry in VectorLoopValueMap, we construct 557 /// a new vector value on-demand by inserting the scalar values into a vector 558 /// with an insertelement sequence. If the value has been neither vectorized 559 /// nor scalarized, it must be loop invariant, so we simply broadcast the 560 /// value into a vector. 561 Value *getOrCreateVectorValue(Value *V, unsigned Part); 562 563 void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) { 564 VectorLoopValueMap.setVectorValue(Scalar, Part, Vector); 565 } 566 567 /// Return a value in the new loop corresponding to \p V from the original 568 /// loop at unroll and vector indices \p Instance. If the value has been 569 /// vectorized but not scalarized, the necessary extractelement instruction 570 /// will be generated. 571 Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance); 572 573 /// Construct the vector value of a scalarized value \p V one lane at a time. 574 void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); 575 576 /// Try to vectorize interleaved access group \p Group with the base address 577 /// given in \p Addr, optionally masking the vector operations if \p 578 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 579 /// values in the vectorized loop. 580 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 581 ArrayRef<VPValue *> VPDefs, 582 VPTransformState &State, VPValue *Addr, 583 ArrayRef<VPValue *> StoredValues, 584 VPValue *BlockInMask = nullptr); 585 586 /// Vectorize Load and Store instructions with the base address given in \p 587 /// Addr, optionally masking the vector operations if \p BlockInMask is 588 /// non-null. Use \p State to translate given VPValues to IR values in the 589 /// vectorized loop. 590 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, 591 VPValue *Def, VPValue *Addr, 592 VPValue *StoredValue, VPValue *BlockInMask); 593 594 /// Set the debug location in the builder using the debug location in 595 /// the instruction. 596 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); 597 598 /// Fix the non-induction PHIs in the OrigPHIsToFix vector. 599 void fixNonInductionPHIs(void); 600 601 protected: 602 friend class LoopVectorizationPlanner; 603 604 /// A small list of PHINodes. 605 using PhiVector = SmallVector<PHINode *, 4>; 606 607 /// A type for scalarized values in the new loop. Each value from the 608 /// original loop, when scalarized, is represented by UF x VF scalar values 609 /// in the new unrolled loop, where UF is the unroll factor and VF is the 610 /// vectorization factor. 611 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 612 613 /// Set up the values of the IVs correctly when exiting the vector loop. 614 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 615 Value *CountRoundDown, Value *EndValue, 616 BasicBlock *MiddleBlock); 617 618 /// Create a new induction variable inside L. 619 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, 620 Value *Step, Instruction *DL); 621 622 /// Handle all cross-iteration phis in the header. 623 void fixCrossIterationPHIs(); 624 625 /// Fix a first-order recurrence. This is the second phase of vectorizing 626 /// this phi node. 627 void fixFirstOrderRecurrence(PHINode *Phi); 628 629 /// Fix a reduction cross-iteration phi. This is the second phase of 630 /// vectorizing this phi node. 
631 void fixReduction(PHINode *Phi); 632 633 /// Clear NSW/NUW flags from reduction instructions if necessary. 634 void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc); 635 636 /// Fixup the LCSSA phi nodes in the unique exit block. This simply 637 /// means we need to add the appropriate incoming value from the middle 638 /// block as exiting edges from the scalar epilogue loop (if present) are 639 /// already in place, and we exit the vector loop exclusively to the middle 640 /// block. 641 void fixLCSSAPHIs(); 642 643 /// Iteratively sink the scalarized operands of a predicated instruction into 644 /// the block that was created for it. 645 void sinkScalarOperands(Instruction *PredInst); 646 647 /// Shrinks vector element sizes to the smallest bitwidth they can be legally 648 /// represented as. 649 void truncateToMinimalBitwidths(); 650 651 /// Create a broadcast instruction. This method generates a broadcast 652 /// instruction (shuffle) for loop invariant values and for the induction 653 /// value. If this is the induction variable then we extend it to N, N+1, ... 654 /// this is needed because each iteration in the loop corresponds to a SIMD 655 /// element. 656 virtual Value *getBroadcastInstrs(Value *V); 657 658 /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) 659 /// to each vector element of Val. The sequence starts at StartIndex. 660 /// \p Opcode is relevant for FP induction variable. 661 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step, 662 Instruction::BinaryOps Opcode = 663 Instruction::BinaryOpsEnd); 664 665 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 666 /// variable on which to base the steps, \p Step is the size of the step, and 667 /// \p EntryVal is the value from the original loop that maps to the steps. 668 /// Note that \p EntryVal doesn't have to be an induction variable - it 669 /// can also be a truncate instruction. 670 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, 671 const InductionDescriptor &ID); 672 673 /// Create a vector induction phi node based on an existing scalar one. \p 674 /// EntryVal is the value from the original loop that maps to the vector phi 675 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a 676 /// truncate instruction, instead of widening the original IV, we widen a 677 /// version of the IV truncated to \p EntryVal's type. 678 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, 679 Value *Step, Value *Start, 680 Instruction *EntryVal); 681 682 /// Returns true if an instruction \p I should be scalarized instead of 683 /// vectorized for the chosen vectorization factor. 684 bool shouldScalarizeInstruction(Instruction *I) const; 685 686 /// Returns true if we should generate a scalar version of \p IV. 687 bool needsScalarInduction(Instruction *IV) const; 688 689 /// If there is a cast involved in the induction variable \p ID, which should 690 /// be ignored in the vectorized loop body, this function records the 691 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the 692 /// cast. We had already proved that the casted Phi is equal to the uncasted 693 /// Phi in the vectorized loop (under a runtime guard), and therefore 694 /// there is no need to vectorize the cast - the same value can be used in the 695 /// vector loop for both the Phi and the cast. 
696 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, 697 /// Otherwise, \p VectorLoopValue is a widened/vectorized value. 698 /// 699 /// \p EntryVal is the value from the original loop that maps to the vector 700 /// phi node and is used to distinguish what is the IV currently being 701 /// processed - original one (if \p EntryVal is a phi corresponding to the 702 /// original IV) or the "newly-created" one based on the proof mentioned above 703 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the 704 /// latter case \p EntryVal is a TruncInst and we must not record anything for 705 /// that IV, but it's error-prone to expect callers of this routine to care 706 /// about that, hence this explicit parameter. 707 void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID, 708 const Instruction *EntryVal, 709 Value *VectorLoopValue, 710 unsigned Part, 711 unsigned Lane = UINT_MAX); 712 713 /// Generate a shuffle sequence that will reverse the vector Vec. 714 virtual Value *reverseVector(Value *Vec); 715 716 /// Returns (and creates if needed) the original loop trip count. 717 Value *getOrCreateTripCount(Loop *NewLoop); 718 719 /// Returns (and creates if needed) the trip count of the widened loop. 720 Value *getOrCreateVectorTripCount(Loop *NewLoop); 721 722 /// Returns a bitcasted value to the requested vector type. 723 /// Also handles bitcasts of vector<float> <-> vector<pointer> types. 724 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, 725 const DataLayout &DL); 726 727 /// Emit a bypass check to see if the vector trip count is zero, including if 728 /// it overflows. 729 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); 730 731 /// Emit a bypass check to see if all of the SCEV assumptions we've 732 /// had to make are correct. 733 void emitSCEVChecks(Loop *L, BasicBlock *Bypass); 734 735 /// Emit bypass checks to check any memory assumptions we may have made. 736 void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); 737 738 /// Compute the transformed value of Index at offset StartValue using step 739 /// StepValue. 740 /// For integer induction, returns StartValue + Index * StepValue. 741 /// For pointer induction, returns StartValue[Index * StepValue]. 742 /// FIXME: The newly created binary instructions should contain nsw/nuw 743 /// flags, which can be found from the original scalar operations. 744 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, 745 const DataLayout &DL, 746 const InductionDescriptor &ID) const; 747 748 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 749 /// vector loop preheader, middle block and scalar preheader. Also 750 /// allocate a loop object for the new vector loop and return it. 751 Loop *createVectorLoopSkeleton(StringRef Prefix); 752 753 /// Create new phi nodes for the induction variables to resume iteration count 754 /// in the scalar epilogue, from where the vectorized loop left off (given by 755 /// \p VectorTripCount). 756 /// In cases where the loop skeleton is more complicated (eg. epilogue 757 /// vectorization) and the resume values can come from an additional bypass 758 /// block, the \p AdditionalBypass pair provides information about the bypass 759 /// block and the end value on the edge from bypass to this loop. 
760 void createInductionResumeValues( 761 Loop *L, Value *VectorTripCount, 762 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); 763 764 /// Complete the loop skeleton by adding debug MDs, creating appropriate 765 /// conditional branches in the middle block, preparing the builder and 766 /// running the verifier. Take in the vector loop \p L as argument, and return 767 /// the preheader of the completed vector loop. 768 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); 769 770 /// Add additional metadata to \p To that was not present on \p Orig. 771 /// 772 /// Currently this is used to add the noalias annotations based on the 773 /// inserted memchecks. Use this for instructions that are *cloned* into the 774 /// vector loop. 775 void addNewMetadata(Instruction *To, const Instruction *Orig); 776 777 /// Add metadata from one instruction to another. 778 /// 779 /// This includes both the original MDs from \p From and additional ones (\see 780 /// addNewMetadata). Use this for *newly created* instructions in the vector 781 /// loop. 782 void addMetadata(Instruction *To, Instruction *From); 783 784 /// Similar to the previous function but it adds the metadata to a 785 /// vector of instructions. 786 void addMetadata(ArrayRef<Value *> To, Instruction *From); 787 788 /// Allow subclasses to override and print debug traces before/after vplan 789 /// execution, when trace information is requested. 790 virtual void printDebugTracesAtStart(){}; 791 virtual void printDebugTracesAtEnd(){}; 792 793 /// The original loop. 794 Loop *OrigLoop; 795 796 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies 797 /// dynamic knowledge to simplify SCEV expressions and converts them to a 798 /// more usable form. 799 PredicatedScalarEvolution &PSE; 800 801 /// Loop Info. 802 LoopInfo *LI; 803 804 /// Dominator Tree. 805 DominatorTree *DT; 806 807 /// Alias Analysis. 808 AAResults *AA; 809 810 /// Target Library Info. 811 const TargetLibraryInfo *TLI; 812 813 /// Target Transform Info. 814 const TargetTransformInfo *TTI; 815 816 /// Assumption Cache. 817 AssumptionCache *AC; 818 819 /// Interface to emit optimization remarks. 820 OptimizationRemarkEmitter *ORE; 821 822 /// LoopVersioning. It's only set up (non-null) if memchecks were 823 /// used. 824 /// 825 /// This is currently only used to add no-alias metadata based on the 826 /// memchecks. The actually versioning is performed manually. 827 std::unique_ptr<LoopVersioning> LVer; 828 829 /// The vectorization SIMD factor to use. Each vector will have this many 830 /// vector elements. 831 ElementCount VF; 832 833 /// The vectorization unroll factor to use. Each scalar is vectorized to this 834 /// many different vector instructions. 835 unsigned UF; 836 837 /// The builder that we use 838 IRBuilder<> Builder; 839 840 // --- Vectorization state --- 841 842 /// The vector-loop preheader. 843 BasicBlock *LoopVectorPreHeader; 844 845 /// The scalar-loop preheader. 846 BasicBlock *LoopScalarPreHeader; 847 848 /// Middle Block between the vector and the scalar. 849 BasicBlock *LoopMiddleBlock; 850 851 /// The (unique) ExitBlock of the scalar loop. Note that 852 /// there can be multiple exiting edges reaching this block. 853 BasicBlock *LoopExitBlock; 854 855 /// The vector loop body. 856 BasicBlock *LoopVectorBody; 857 858 /// The scalar loop body. 859 BasicBlock *LoopScalarBody; 860 861 /// A list of all bypass blocks. The first block is the entry of the loop. 
862 SmallVector<BasicBlock *, 4> LoopBypassBlocks; 863 864 /// The new Induction variable which was added to the new block. 865 PHINode *Induction = nullptr; 866 867 /// The induction variable of the old basic block. 868 PHINode *OldInduction = nullptr; 869 870 /// Maps values from the original loop to their corresponding values in the 871 /// vectorized loop. A key value can map to either vector values, scalar 872 /// values or both kinds of values, depending on whether the key was 873 /// vectorized and scalarized. 874 VectorizerValueMap VectorLoopValueMap; 875 876 /// Store instructions that were predicated. 877 SmallVector<Instruction *, 4> PredicatedInstructions; 878 879 /// Trip count of the original loop. 880 Value *TripCount = nullptr; 881 882 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) 883 Value *VectorTripCount = nullptr; 884 885 /// The legality analysis. 886 LoopVectorizationLegality *Legal; 887 888 /// The profitablity analysis. 889 LoopVectorizationCostModel *Cost; 890 891 // Record whether runtime checks are added. 892 bool AddedSafetyChecks = false; 893 894 // Holds the end values for each induction variable. We save the end values 895 // so we can later fix-up the external users of the induction variables. 896 DenseMap<PHINode *, Value *> IVEndValues; 897 898 // Vector of original scalar PHIs whose corresponding widened PHIs need to be 899 // fixed up at the end of vector code generation. 900 SmallVector<PHINode *, 8> OrigPHIsToFix; 901 902 /// BFI and PSI are used to check for profile guided size optimizations. 903 BlockFrequencyInfo *BFI; 904 ProfileSummaryInfo *PSI; 905 906 // Whether this loop should be optimized for size based on profile guided size 907 // optimizatios. 908 bool OptForSizeBasedOnProfile; 909 }; 910 911 class InnerLoopUnroller : public InnerLoopVectorizer { 912 public: 913 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 914 LoopInfo *LI, DominatorTree *DT, 915 const TargetLibraryInfo *TLI, 916 const TargetTransformInfo *TTI, AssumptionCache *AC, 917 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, 918 LoopVectorizationLegality *LVL, 919 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 920 ProfileSummaryInfo *PSI) 921 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 922 ElementCount::getFixed(1), UnrollFactor, LVL, CM, 923 BFI, PSI) {} 924 925 private: 926 Value *getBroadcastInstrs(Value *V) override; 927 Value *getStepVector(Value *Val, int StartIdx, Value *Step, 928 Instruction::BinaryOps Opcode = 929 Instruction::BinaryOpsEnd) override; 930 Value *reverseVector(Value *Vec) override; 931 }; 932 933 /// Encapsulate information regarding vectorization of a loop and its epilogue. 934 /// This information is meant to be updated and used across two stages of 935 /// epilogue vectorization. 
936 struct EpilogueLoopVectorizationInfo { 937 ElementCount MainLoopVF = ElementCount::getFixed(0); 938 unsigned MainLoopUF = 0; 939 ElementCount EpilogueVF = ElementCount::getFixed(0); 940 unsigned EpilogueUF = 0; 941 BasicBlock *MainLoopIterationCountCheck = nullptr; 942 BasicBlock *EpilogueIterationCountCheck = nullptr; 943 BasicBlock *SCEVSafetyCheck = nullptr; 944 BasicBlock *MemSafetyCheck = nullptr; 945 Value *TripCount = nullptr; 946 Value *VectorTripCount = nullptr; 947 948 EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF, 949 unsigned EUF) 950 : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF), 951 EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) { 952 assert(EUF == 1 && 953 "A high UF for the epilogue loop is likely not beneficial."); 954 } 955 }; 956 957 /// An extension of the inner loop vectorizer that creates a skeleton for a 958 /// vectorized loop that has its epilogue (residual) also vectorized. 959 /// The idea is to run the vplan on a given loop twice, firstly to setup the 960 /// skeleton and vectorize the main loop, and secondly to complete the skeleton 961 /// from the first step and vectorize the epilogue. This is achieved by 962 /// deriving two concrete strategy classes from this base class and invoking 963 /// them in succession from the loop vectorizer planner. 964 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { 965 public: 966 InnerLoopAndEpilogueVectorizer( 967 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 968 DominatorTree *DT, const TargetLibraryInfo *TLI, 969 const TargetTransformInfo *TTI, AssumptionCache *AC, 970 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 971 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 972 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) 973 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 974 EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI), 975 EPI(EPI) {} 976 977 // Override this function to handle the more complex control flow around the 978 // three loops. 979 BasicBlock *createVectorizedLoopSkeleton() final override { 980 return createEpilogueVectorizedLoopSkeleton(); 981 } 982 983 /// The interface for creating a vectorized skeleton using one of two 984 /// different strategies, each corresponding to one execution of the vplan 985 /// as described above. 986 virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; 987 988 /// Holds and updates state information required to vectorize the main loop 989 /// and its epilogue in two separate passes. This setup helps us avoid 990 /// regenerating and recomputing runtime safety checks. It also helps us to 991 /// shorten the iteration-count-check path length for the cases where the 992 /// iteration count of the loop is so small that the main vector loop is 993 /// completely skipped. 994 EpilogueLoopVectorizationInfo &EPI; 995 }; 996 997 /// A specialized derived class of inner loop vectorizer that performs 998 /// vectorization of *main* loops in the process of vectorizing loops and their 999 /// epilogues. 
1000 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { 1001 public: 1002 EpilogueVectorizerMainLoop( 1003 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 1004 DominatorTree *DT, const TargetLibraryInfo *TLI, 1005 const TargetTransformInfo *TTI, AssumptionCache *AC, 1006 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 1007 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 1008 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) 1009 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1010 EPI, LVL, CM, BFI, PSI) {} 1011 /// Implements the interface for creating a vectorized skeleton using the 1012 /// *main loop* strategy (ie the first pass of vplan execution). 1013 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; 1014 1015 protected: 1016 /// Emits an iteration count bypass check once for the main loop (when \p 1017 /// ForEpilogue is false) and once for the epilogue loop (when \p 1018 /// ForEpilogue is true). 1019 BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, 1020 bool ForEpilogue); 1021 void printDebugTracesAtStart() override; 1022 void printDebugTracesAtEnd() override; 1023 }; 1024 1025 // A specialized derived class of inner loop vectorizer that performs 1026 // vectorization of *epilogue* loops in the process of vectorizing loops and 1027 // their epilogues. 1028 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { 1029 public: 1030 EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 1031 LoopInfo *LI, DominatorTree *DT, 1032 const TargetLibraryInfo *TLI, 1033 const TargetTransformInfo *TTI, AssumptionCache *AC, 1034 OptimizationRemarkEmitter *ORE, 1035 EpilogueLoopVectorizationInfo &EPI, 1036 LoopVectorizationLegality *LVL, 1037 llvm::LoopVectorizationCostModel *CM, 1038 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) 1039 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1040 EPI, LVL, CM, BFI, PSI) {} 1041 /// Implements the interface for creating a vectorized skeleton using the 1042 /// *epilogue loop* strategy (ie the second pass of vplan execution). 1043 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; 1044 1045 protected: 1046 /// Emits an iteration count bypass check after the main vector loop has 1047 /// finished to see if there are any iterations left to execute by either 1048 /// the vector epilogue or the scalar epilogue. 1049 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L, 1050 BasicBlock *Bypass, 1051 BasicBlock *Insert); 1052 void printDebugTracesAtStart() override; 1053 void printDebugTracesAtEnd() override; 1054 }; 1055 } // end namespace llvm 1056 1057 /// Look for a meaningful debug location on the instruction or it's 1058 /// operands. 
1059 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { 1060 if (!I) 1061 return I; 1062 1063 DebugLoc Empty; 1064 if (I->getDebugLoc() != Empty) 1065 return I; 1066 1067 for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) { 1068 if (Instruction *OpInst = dyn_cast<Instruction>(*OI)) 1069 if (OpInst->getDebugLoc() != Empty) 1070 return OpInst; 1071 } 1072 1073 return I; 1074 } 1075 1076 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { 1077 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) { 1078 const DILocation *DIL = Inst->getDebugLoc(); 1079 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && 1080 !isa<DbgInfoIntrinsic>(Inst)) { 1081 assert(!VF.isScalable() && "scalable vectors not yet supported."); 1082 auto NewDIL = 1083 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); 1084 if (NewDIL) 1085 B.SetCurrentDebugLocation(NewDIL.getValue()); 1086 else 1087 LLVM_DEBUG(dbgs() 1088 << "Failed to create new discriminator: " 1089 << DIL->getFilename() << " Line: " << DIL->getLine()); 1090 } 1091 else 1092 B.SetCurrentDebugLocation(DIL); 1093 } else 1094 B.SetCurrentDebugLocation(DebugLoc()); 1095 } 1096 1097 /// Write a record \p DebugMsg about vectorization failure to the debug 1098 /// output stream. If \p I is passed, it is an instruction that prevents 1099 /// vectorization. 1100 #ifndef NDEBUG 1101 static void debugVectorizationFailure(const StringRef DebugMsg, 1102 Instruction *I) { 1103 dbgs() << "LV: Not vectorizing: " << DebugMsg; 1104 if (I != nullptr) 1105 dbgs() << " " << *I; 1106 else 1107 dbgs() << '.'; 1108 dbgs() << '\n'; 1109 } 1110 #endif 1111 1112 /// Create an analysis remark that explains why vectorization failed 1113 /// 1114 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 1115 /// RemarkName is the identifier for the remark. If \p I is passed it is an 1116 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for 1117 /// the location of the remark. \return the remark object that can be 1118 /// streamed to. 1119 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, 1120 StringRef RemarkName, Loop *TheLoop, Instruction *I) { 1121 Value *CodeRegion = TheLoop->getHeader(); 1122 DebugLoc DL = TheLoop->getStartLoc(); 1123 1124 if (I) { 1125 CodeRegion = I->getParent(); 1126 // If there is no debug location attached to the instruction, revert back to 1127 // using the loop's. 1128 if (I->getDebugLoc()) 1129 DL = I->getDebugLoc(); 1130 } 1131 1132 OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); 1133 R << "loop not vectorized: "; 1134 return R; 1135 } 1136 1137 /// Return a value for Step multiplied by VF. 1138 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) { 1139 assert(isa<ConstantInt>(Step) && "Expected an integer step"); 1140 Constant *StepVal = ConstantInt::get( 1141 Step->getType(), 1142 cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue()); 1143 return VF.isScalable() ? 
B.CreateVScale(StepVal) : StepVal; 1144 } 1145 1146 namespace llvm { 1147 1148 void reportVectorizationFailure(const StringRef DebugMsg, 1149 const StringRef OREMsg, const StringRef ORETag, 1150 OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { 1151 LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); 1152 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1153 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), 1154 ORETag, TheLoop, I) << OREMsg); 1155 } 1156 1157 } // end namespace llvm 1158 1159 #ifndef NDEBUG 1160 /// \return string containing a file name and a line # for the given loop. 1161 static std::string getDebugLocString(const Loop *L) { 1162 std::string Result; 1163 if (L) { 1164 raw_string_ostream OS(Result); 1165 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 1166 LoopDbgLoc.print(OS); 1167 else 1168 // Just print the module name. 1169 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 1170 OS.flush(); 1171 } 1172 return Result; 1173 } 1174 #endif 1175 1176 void InnerLoopVectorizer::addNewMetadata(Instruction *To, 1177 const Instruction *Orig) { 1178 // If the loop was versioned with memchecks, add the corresponding no-alias 1179 // metadata. 1180 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) 1181 LVer->annotateInstWithNoAlias(To, Orig); 1182 } 1183 1184 void InnerLoopVectorizer::addMetadata(Instruction *To, 1185 Instruction *From) { 1186 propagateMetadata(To, From); 1187 addNewMetadata(To, From); 1188 } 1189 1190 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, 1191 Instruction *From) { 1192 for (Value *V : To) { 1193 if (Instruction *I = dyn_cast<Instruction>(V)) 1194 addMetadata(I, From); 1195 } 1196 } 1197 1198 namespace llvm { 1199 1200 // Loop vectorization cost-model hints how the scalar epilogue loop should be 1201 // lowered. 1202 enum ScalarEpilogueLowering { 1203 1204 // The default: allowing scalar epilogues. 1205 CM_ScalarEpilogueAllowed, 1206 1207 // Vectorization with OptForSize: don't allow epilogues. 1208 CM_ScalarEpilogueNotAllowedOptSize, 1209 1210 // A special case of vectorisation with OptForSize: loops with a very small 1211 // trip count are considered for vectorization under OptForSize, thereby 1212 // making sure the cost of their loop body is dominant, free of runtime 1213 // guards and scalar iteration overheads. 1214 CM_ScalarEpilogueNotAllowedLowTripLoop, 1215 1216 // Loop hint predicate indicating an epilogue is undesired. 1217 CM_ScalarEpilogueNotNeededUsePredicate, 1218 1219 // Directive indicating we must either tail fold or not vectorize 1220 CM_ScalarEpilogueNotAllowedUsePredicate 1221 }; 1222 1223 /// LoopVectorizationCostModel - estimates the expected speedups due to 1224 /// vectorization. 1225 /// In many cases vectorization is not profitable. This can happen because of 1226 /// a number of reasons. In this class we mainly attempt to predict the 1227 /// expected speedup/slowdowns due to the supported instruction set. We use the 1228 /// TargetTransformInfo to query the different backends for the cost of 1229 /// different operations. 
1230 class LoopVectorizationCostModel { 1231 public: 1232 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 1233 PredicatedScalarEvolution &PSE, LoopInfo *LI, 1234 LoopVectorizationLegality *Legal, 1235 const TargetTransformInfo &TTI, 1236 const TargetLibraryInfo *TLI, DemandedBits *DB, 1237 AssumptionCache *AC, 1238 OptimizationRemarkEmitter *ORE, const Function *F, 1239 const LoopVectorizeHints *Hints, 1240 InterleavedAccessInfo &IAI) 1241 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 1242 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 1243 Hints(Hints), InterleaveInfo(IAI) {} 1244 1245 /// \return An upper bound for the vectorization factor, or None if 1246 /// vectorization and interleaving should be avoided up front. 1247 Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC); 1248 1249 /// \return True if runtime checks are required for vectorization, and false 1250 /// otherwise. 1251 bool runtimeChecksRequired(); 1252 1253 /// \return The most profitable vectorization factor and the cost of that VF. 1254 /// This method checks every power of two up to MaxVF. If UserVF is not ZERO 1255 /// then this vectorization factor will be selected if vectorization is 1256 /// possible. 1257 VectorizationFactor selectVectorizationFactor(ElementCount MaxVF); 1258 VectorizationFactor 1259 selectEpilogueVectorizationFactor(const ElementCount MaxVF, 1260 const LoopVectorizationPlanner &LVP); 1261 1262 /// Setup cost-based decisions for user vectorization factor. 1263 void selectUserVectorizationFactor(ElementCount UserVF) { 1264 collectUniformsAndScalars(UserVF); 1265 collectInstsToScalarize(UserVF); 1266 } 1267 1268 /// \return The size (in bits) of the smallest and widest types in the code 1269 /// that needs to be vectorized. We ignore values that remain scalar such as 1270 /// 64 bit loop indices. 1271 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1272 1273 /// \return The desired interleave count. 1274 /// If interleave count has been specified by metadata it will be returned. 1275 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1276 /// are the selected vectorization factor and the cost of the selected VF. 1277 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); 1278 1279 /// Memory access instruction may be vectorized in more than one way. 1280 /// Form of instruction after vectorization depends on cost. 1281 /// This function takes cost-based decisions for Load/Store instructions 1282 /// and collects them in a map. This decisions map is used for building 1283 /// the lists of loop-uniform and loop-scalar instructions. 1284 /// The calculated cost is saved with widening decision in order to 1285 /// avoid redundant calculations. 1286 void setCostBasedWideningDecision(ElementCount VF); 1287 1288 /// A struct that represents some properties of the register usage 1289 /// of a loop. 1290 struct RegisterUsage { 1291 /// Holds the number of loop invariant values that are used in the loop. 1292 /// The key is ClassID of target-provided register class. 1293 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1294 /// Holds the maximum number of concurrent live intervals in the loop. 1295 /// The key is ClassID of target-provided register class. 1296 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1297 }; 1298 1299 /// \return Returns information about the register usages of the loop for the 1300 /// given vectorization factors. 
1301 SmallVector<RegisterUsage, 8> 1302 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1303 1304 /// Collect values we want to ignore in the cost model. 1305 void collectValuesToIgnore(); 1306 1307 /// Split reductions into those that happen in the loop, and those that happen 1308 /// outside. In loop reductions are collected into InLoopReductionChains. 1309 void collectInLoopReductions(); 1310 1311 /// \returns The smallest bitwidth each instruction can be represented with. 1312 /// The vector equivalents of these instructions should be truncated to this 1313 /// type. 1314 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1315 return MinBWs; 1316 } 1317 1318 /// \returns True if it is more profitable to scalarize instruction \p I for 1319 /// vectorization factor \p VF. 1320 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1321 assert(VF.isVector() && 1322 "Profitable to scalarize relevant only for VF > 1."); 1323 1324 // Cost model is not run in the VPlan-native path - return conservative 1325 // result until this changes. 1326 if (EnableVPlanNativePath) 1327 return false; 1328 1329 auto Scalars = InstsToScalarize.find(VF); 1330 assert(Scalars != InstsToScalarize.end() && 1331 "VF not yet analyzed for scalarization profitability"); 1332 return Scalars->second.find(I) != Scalars->second.end(); 1333 } 1334 1335 /// Returns true if \p I is known to be uniform after vectorization. 1336 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1337 if (VF.isScalar()) 1338 return true; 1339 1340 // Cost model is not run in the VPlan-native path - return conservative 1341 // result until this changes. 1342 if (EnableVPlanNativePath) 1343 return false; 1344 1345 auto UniformsPerVF = Uniforms.find(VF); 1346 assert(UniformsPerVF != Uniforms.end() && 1347 "VF not yet analyzed for uniformity"); 1348 return UniformsPerVF->second.count(I); 1349 } 1350 1351 /// Returns true if \p I is known to be scalar after vectorization. 1352 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1353 if (VF.isScalar()) 1354 return true; 1355 1356 // Cost model is not run in the VPlan-native path - return conservative 1357 // result until this changes. 1358 if (EnableVPlanNativePath) 1359 return false; 1360 1361 auto ScalarsPerVF = Scalars.find(VF); 1362 assert(ScalarsPerVF != Scalars.end() && 1363 "Scalar values are not calculated for VF"); 1364 return ScalarsPerVF->second.count(I); 1365 } 1366 1367 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1368 /// for vectorization factor \p VF. 1369 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1370 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1371 !isProfitableToScalarize(I, VF) && 1372 !isScalarAfterVectorization(I, VF); 1373 } 1374 1375 /// Decision that was taken during cost calculation for memory instruction. 1376 enum InstWidening { 1377 CM_Unknown, 1378 CM_Widen, // For consecutive accesses with stride +1. 1379 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1380 CM_Interleave, 1381 CM_GatherScatter, 1382 CM_Scalarize 1383 }; 1384 1385 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1386 /// instruction \p I and vector width \p VF. 
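  ///
  /// For example, a consecutive access with stride +1 is typically recorded
  /// as CM_Widen together with (roughly) the cost of one wide load or store,
  /// whereas a non-consecutive access may be recorded as CM_GatherScatter or
  /// CM_Scalarize, whichever the cost model finds cheaper.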
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W, unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
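  ///
  /// For example, a udiv that only executes under a loop-varying condition
  /// cannot simply be widened, since the masked-off lanes could trap; it is
  /// therefore scalarized and predicated, and a cheap single-use chain of
  /// instructions feeding it may be scalarized along with it when the
  /// discount computed by computePredInstDiscount makes that profitable.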
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I,
                               ElementCount VF = ElementCount::getFixed(1));

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
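  ///
  /// E.g. a load of A[i] whose address advances by exactly one element per
  /// iteration can be emitted as a single wide vector load per vector
  /// iteration (CM_Widen), or as a wide load followed by a reverse shuffle
  /// when the stride is -1 (CM_Widen_Reverse).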
1541 bool 1542 memoryInstructionCanBeWidened(Instruction *I, 1543 ElementCount VF = ElementCount::getFixed(1)); 1544 1545 /// Returns true if \p I is a memory instruction in an interleaved-group 1546 /// of memory accesses that can be vectorized with wide vector loads/stores 1547 /// and shuffles. 1548 bool 1549 interleavedAccessCanBeWidened(Instruction *I, 1550 ElementCount VF = ElementCount::getFixed(1)); 1551 1552 /// Check if \p Instr belongs to any interleaved access group. 1553 bool isAccessInterleaved(Instruction *Instr) { 1554 return InterleaveInfo.isInterleaved(Instr); 1555 } 1556 1557 /// Get the interleaved access group that \p Instr belongs to. 1558 const InterleaveGroup<Instruction> * 1559 getInterleavedAccessGroup(Instruction *Instr) { 1560 return InterleaveInfo.getInterleaveGroup(Instr); 1561 } 1562 1563 /// Returns true if we're required to use a scalar epilogue for at least 1564 /// the final iteration of the original loop. 1565 bool requiresScalarEpilogue() const { 1566 if (!isScalarEpilogueAllowed()) 1567 return false; 1568 // If we might exit from anywhere but the latch, must run the exiting 1569 // iteration in scalar form. 1570 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1571 return true; 1572 return InterleaveInfo.requiresScalarEpilogue(); 1573 } 1574 1575 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1576 /// loop hint annotation. 1577 bool isScalarEpilogueAllowed() const { 1578 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1579 } 1580 1581 /// Returns true if all loop blocks should be masked to fold tail loop. 1582 bool foldTailByMasking() const { return FoldTailByMasking; } 1583 1584 bool blockNeedsPredication(BasicBlock *BB) { 1585 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1586 } 1587 1588 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1589 /// nodes to the chain of instructions representing the reductions. Uses a 1590 /// MapVector to ensure deterministic iteration order. 1591 using ReductionChainMap = 1592 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1593 1594 /// Return the chain of instructions representing an inloop reduction. 1595 const ReductionChainMap &getInLoopReductionChains() const { 1596 return InLoopReductionChains; 1597 } 1598 1599 /// Returns true if the Phi is part of an inloop reduction. 1600 bool isInLoopReduction(PHINode *Phi) const { 1601 return InLoopReductionChains.count(Phi); 1602 } 1603 1604 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1605 /// with factor VF. Return the cost of the instruction, including 1606 /// scalarization overhead if it's needed. 1607 unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF); 1608 1609 /// Estimate cost of a call instruction CI if it were vectorized with factor 1610 /// VF. Return the cost of the instruction, including scalarization overhead 1611 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1612 /// scalarized - 1613 /// i.e. either vector version isn't available, or is too expensive. 1614 unsigned getVectorCallCost(CallInst *CI, ElementCount VF, 1615 bool &NeedToScalarize); 1616 1617 /// Invalidates decisions already taken by the cost model. 
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, a power-of-2 larger
  /// than zero. One is returned if vectorization should best be avoided due
  /// to cost.
  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
                                    ElementCount UserVF);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(ElementCount VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with a uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop-invariant value stored ? 0 : extract of last
  /// element).
  unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
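  ///
  /// For example, an i32 value that demanded-bits analysis proves is only
  /// ever used through its low 8 bits could be recorded here with a bitwidth
  /// of 8, so that its vector form is computed as <VF x i8> and only extended
  /// back where the wider type is actually required.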
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked in order to fold the tail of the
  /// scalar iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
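  ///
  /// For example, the pointer operand of a consecutive load is needed only
  /// once per vector iteration, since it feeds a single wide load rather than
  /// VF scalar loads, so keeping one scalar copy of it per unroll part is
  /// sufficient.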
1757 void collectLoopUniforms(ElementCount VF); 1758 1759 /// Collect the instructions that are scalar after vectorization. An 1760 /// instruction is scalar if it is known to be uniform or will be scalarized 1761 /// during vectorization. Non-uniform scalarized instructions will be 1762 /// represented by VF values in the vectorized loop, each corresponding to an 1763 /// iteration of the original scalar loop. 1764 void collectLoopScalars(ElementCount VF); 1765 1766 /// Keeps cost model vectorization decision and cost for instructions. 1767 /// Right now it is used for memory instructions only. 1768 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1769 std::pair<InstWidening, unsigned>>; 1770 1771 DecisionList WideningDecisions; 1772 1773 /// Returns true if \p V is expected to be vectorized and it needs to be 1774 /// extracted. 1775 bool needsExtract(Value *V, ElementCount VF) const { 1776 Instruction *I = dyn_cast<Instruction>(V); 1777 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1778 TheLoop->isLoopInvariant(I)) 1779 return false; 1780 1781 // Assume we can vectorize V (and hence we need extraction) if the 1782 // scalars are not computed yet. This can happen, because it is called 1783 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1784 // the scalars are collected. That should be a safe assumption in most 1785 // cases, because we check if the operands have vectorizable types 1786 // beforehand in LoopVectorizationLegality. 1787 return Scalars.find(VF) == Scalars.end() || 1788 !isScalarAfterVectorization(I, VF); 1789 }; 1790 1791 /// Returns a range containing only operands needing to be extracted. 1792 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1793 ElementCount VF) { 1794 return SmallVector<Value *, 4>(make_filter_range( 1795 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1796 } 1797 1798 /// Determines if we have the infrastructure to vectorize loop \p L and its 1799 /// epilogue, assuming the main loop is vectorized by \p VF. 1800 bool isCandidateForEpilogueVectorization(const Loop &L, 1801 const ElementCount VF) const; 1802 1803 /// Returns true if epilogue vectorization is considered profitable, and 1804 /// false otherwise. 1805 /// \p VF is the vectorization factor chosen for the original loop. 1806 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1807 1808 public: 1809 /// The loop that we evaluate. 1810 Loop *TheLoop; 1811 1812 /// Predicated scalar evolution analysis. 1813 PredicatedScalarEvolution &PSE; 1814 1815 /// Loop Info analysis. 1816 LoopInfo *LI; 1817 1818 /// Vectorization legality. 1819 LoopVectorizationLegality *Legal; 1820 1821 /// Vector target information. 1822 const TargetTransformInfo &TTI; 1823 1824 /// Target Library Info. 1825 const TargetLibraryInfo *TLI; 1826 1827 /// Demanded bits analysis. 1828 DemandedBits *DB; 1829 1830 /// Assumption cache. 1831 AssumptionCache *AC; 1832 1833 /// Interface to emit optimization remarks. 1834 OptimizationRemarkEmitter *ORE; 1835 1836 const Function *TheFunction; 1837 1838 /// Loop Vectorize Hint. 1839 const LoopVectorizeHints *Hints; 1840 1841 /// The interleave access information contains groups of interleaved accesses 1842 /// with the same stride and close to each other. 1843 InterleavedAccessInfo &InterleaveInfo; 1844 1845 /// Values to ignore in the cost model. 1846 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1847 1848 /// Values to ignore in the cost model when VF > 1. 
1849 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1850 1851 /// Profitable vector factors. 1852 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1853 }; 1854 1855 } // end namespace llvm 1856 1857 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1858 // vectorization. The loop needs to be annotated with #pragma omp simd 1859 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1860 // vector length information is not provided, vectorization is not considered 1861 // explicit. Interleave hints are not allowed either. These limitations will be 1862 // relaxed in the future. 1863 // Please, note that we are currently forced to abuse the pragma 'clang 1864 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1865 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1866 // provides *explicit vectorization hints* (LV can bypass legal checks and 1867 // assume that vectorization is legal). However, both hints are implemented 1868 // using the same metadata (llvm.loop.vectorize, processed by 1869 // LoopVectorizeHints). This will be fixed in the future when the native IR 1870 // representation for pragma 'omp simd' is introduced. 1871 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1872 OptimizationRemarkEmitter *ORE) { 1873 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 1874 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1875 1876 // Only outer loops with an explicit vectorization hint are supported. 1877 // Unannotated outer loops are ignored. 1878 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1879 return false; 1880 1881 Function *Fn = OuterLp->getHeader()->getParent(); 1882 if (!Hints.allowVectorization(Fn, OuterLp, 1883 true /*VectorizeOnlyWhenForced*/)) { 1884 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1885 return false; 1886 } 1887 1888 if (Hints.getInterleave() > 1) { 1889 // TODO: Interleave support is future work. 1890 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1891 "outer loops.\n"); 1892 Hints.emitRemarkWithHints(); 1893 return false; 1894 } 1895 1896 return true; 1897 } 1898 1899 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1900 OptimizationRemarkEmitter *ORE, 1901 SmallVectorImpl<Loop *> &V) { 1902 // Collect inner loops and outer loops without irreducible control flow. For 1903 // now, only collect outer loops that have explicit vectorization hints. If we 1904 // are stress testing the VPlan H-CFG construction, we collect the outermost 1905 // loop of every loop nest. 1906 if (L.isInnermost() || VPlanBuildStressTest || 1907 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1908 LoopBlocksRPO RPOT(&L); 1909 RPOT.perform(LI); 1910 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1911 V.push_back(&L); 1912 // TODO: Collect inner loops inside marked outer loops in case 1913 // vectorization fails for the outer loop. Do not invoke 1914 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1915 // already known to be reducible. We can use an inherited attribute for 1916 // that. 1917 return; 1918 } 1919 } 1920 for (Loop *InnerL : L) 1921 collectSupportedLoops(*InnerL, LI, ORE, V); 1922 } 1923 1924 namespace { 1925 1926 /// The LoopVectorize Pass. 
1927 struct LoopVectorize : public FunctionPass { 1928 /// Pass identification, replacement for typeid 1929 static char ID; 1930 1931 LoopVectorizePass Impl; 1932 1933 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1934 bool VectorizeOnlyWhenForced = false) 1935 : FunctionPass(ID), 1936 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1937 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1938 } 1939 1940 bool runOnFunction(Function &F) override { 1941 if (skipFunction(F)) 1942 return false; 1943 1944 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1945 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1946 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1947 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1948 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1949 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1950 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1951 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1952 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1953 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1954 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1955 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1956 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1957 1958 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1959 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1960 1961 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1962 GetLAA, *ORE, PSI).MadeAnyChange; 1963 } 1964 1965 void getAnalysisUsage(AnalysisUsage &AU) const override { 1966 AU.addRequired<AssumptionCacheTracker>(); 1967 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1968 AU.addRequired<DominatorTreeWrapperPass>(); 1969 AU.addRequired<LoopInfoWrapperPass>(); 1970 AU.addRequired<ScalarEvolutionWrapperPass>(); 1971 AU.addRequired<TargetTransformInfoWrapperPass>(); 1972 AU.addRequired<AAResultsWrapperPass>(); 1973 AU.addRequired<LoopAccessLegacyAnalysis>(); 1974 AU.addRequired<DemandedBitsWrapperPass>(); 1975 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1976 AU.addRequired<InjectTLIMappingsLegacy>(); 1977 1978 // We currently do not preserve loopinfo/dominator analyses with outer loop 1979 // vectorization. Until this is addressed, mark these analyses as preserved 1980 // only for non-VPlan-native path. 1981 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1982 if (!EnableVPlanNativePath) { 1983 AU.addPreserved<LoopInfoWrapperPass>(); 1984 AU.addPreserved<DominatorTreeWrapperPass>(); 1985 } 1986 1987 AU.addPreserved<BasicAAWrapperPass>(); 1988 AU.addPreserved<GlobalsAAWrapperPass>(); 1989 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1990 } 1991 }; 1992 1993 } // end anonymous namespace 1994 1995 //===----------------------------------------------------------------------===// 1996 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1997 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1998 //===----------------------------------------------------------------------===// 1999 2000 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2001 // We need to place the broadcast of invariant variables outside the loop, 2002 // but only if it's proven safe to do so. Else, broadcast will be inside 2003 // vector loop body. 
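  // For instance, a loop-invariant value defined in a block that dominates
  // the new preheader can be splatted once there and reused by every vector
  // iteration, whereas an invariant value whose definition does not dominate
  // the preheader must still be splatted inside the vector loop body.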
2004 Instruction *Instr = dyn_cast<Instruction>(V); 2005 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2006 (!Instr || 2007 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2008 // Place the code for broadcasting invariant variables in the new preheader. 2009 IRBuilder<>::InsertPointGuard Guard(Builder); 2010 if (SafeToHoist) 2011 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2012 2013 // Broadcast the scalar into all locations in the vector. 2014 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2015 2016 return Shuf; 2017 } 2018 2019 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2020 const InductionDescriptor &II, Value *Step, Value *Start, 2021 Instruction *EntryVal) { 2022 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2023 "Expected either an induction phi-node or a truncate of it!"); 2024 2025 // Construct the initial value of the vector IV in the vector loop preheader 2026 auto CurrIP = Builder.saveIP(); 2027 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2028 if (isa<TruncInst>(EntryVal)) { 2029 assert(Start->getType()->isIntegerTy() && 2030 "Truncation requires an integer type"); 2031 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2032 Step = Builder.CreateTrunc(Step, TruncType); 2033 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2034 } 2035 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2036 Value *SteppedStart = 2037 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2038 2039 // We create vector phi nodes for both integer and floating-point induction 2040 // variables. Here, we determine the kind of arithmetic we will perform. 2041 Instruction::BinaryOps AddOp; 2042 Instruction::BinaryOps MulOp; 2043 if (Step->getType()->isIntegerTy()) { 2044 AddOp = Instruction::Add; 2045 MulOp = Instruction::Mul; 2046 } else { 2047 AddOp = II.getInductionOpcode(); 2048 MulOp = Instruction::FMul; 2049 } 2050 2051 // Multiply the vectorization factor by the step using integer or 2052 // floating-point arithmetic as appropriate. 2053 Value *ConstVF = 2054 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 2055 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 2056 2057 // Create a vector splat to use in the induction update. 2058 // 2059 // FIXME: If the step is non-constant, we create the vector splat with 2060 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2061 // handle a constant vector splat. 2062 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2063 Value *SplatVF = isa<Constant>(Mul) 2064 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2065 : Builder.CreateVectorSplat(VF, Mul); 2066 Builder.restoreIP(CurrIP); 2067 2068 // We may need to add the step a number of times, depending on the unroll 2069 // factor. The last of those goes into the PHI. 
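  // E.g. with a fixed VF of 4, UF of 2 and step S: part 0 uses the phi value
  // <i, i+S, i+2S, i+3S>, part 1 uses that value plus the splat of 4*S, and
  // the value advanced by 8*S in total is what feeds back into the phi.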
2070 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2071 &*LoopVectorBody->getFirstInsertionPt()); 2072 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2073 Instruction *LastInduction = VecInd; 2074 for (unsigned Part = 0; Part < UF; ++Part) { 2075 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 2076 2077 if (isa<TruncInst>(EntryVal)) 2078 addMetadata(LastInduction, EntryVal); 2079 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 2080 2081 LastInduction = cast<Instruction>(addFastMathFlag( 2082 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 2083 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2084 } 2085 2086 // Move the last step to the end of the latch block. This ensures consistent 2087 // placement of all induction updates. 2088 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2089 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2090 auto *ICmp = cast<Instruction>(Br->getCondition()); 2091 LastInduction->moveBefore(ICmp); 2092 LastInduction->setName("vec.ind.next"); 2093 2094 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2095 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2096 } 2097 2098 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2099 return Cost->isScalarAfterVectorization(I, VF) || 2100 Cost->isProfitableToScalarize(I, VF); 2101 } 2102 2103 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2104 if (shouldScalarizeInstruction(IV)) 2105 return true; 2106 auto isScalarInst = [&](User *U) -> bool { 2107 auto *I = cast<Instruction>(U); 2108 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2109 }; 2110 return llvm::any_of(IV->users(), isScalarInst); 2111 } 2112 2113 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2114 const InductionDescriptor &ID, const Instruction *EntryVal, 2115 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 2116 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2117 "Expected either an induction phi-node or a truncate of it!"); 2118 2119 // This induction variable is not the phi from the original loop but the 2120 // newly-created IV based on the proof that casted Phi is equal to the 2121 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2122 // re-uses the same InductionDescriptor that original IV uses but we don't 2123 // have to do any recording in this case - that is done when original IV is 2124 // processed. 2125 if (isa<TruncInst>(EntryVal)) 2126 return; 2127 2128 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2129 if (Casts.empty()) 2130 return; 2131 // Only the first Cast instruction in the Casts vector is of interest. 2132 // The rest of the Casts (if exist) have no uses outside the 2133 // induction update chain itself. 
2134 Instruction *CastInst = *Casts.begin(); 2135 if (Lane < UINT_MAX) 2136 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 2137 else 2138 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 2139 } 2140 2141 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2142 TruncInst *Trunc) { 2143 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2144 "Primary induction variable must have an integer type"); 2145 2146 auto II = Legal->getInductionVars().find(IV); 2147 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2148 2149 auto ID = II->second; 2150 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2151 2152 // The value from the original loop to which we are mapping the new induction 2153 // variable. 2154 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2155 2156 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2157 2158 // Generate code for the induction step. Note that induction steps are 2159 // required to be loop-invariant 2160 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2161 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2162 "Induction step should be loop invariant"); 2163 if (PSE.getSE()->isSCEVable(IV->getType())) { 2164 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2165 return Exp.expandCodeFor(Step, Step->getType(), 2166 LoopVectorPreHeader->getTerminator()); 2167 } 2168 return cast<SCEVUnknown>(Step)->getValue(); 2169 }; 2170 2171 // The scalar value to broadcast. This is derived from the canonical 2172 // induction variable. If a truncation type is given, truncate the canonical 2173 // induction variable and step. Otherwise, derive these values from the 2174 // induction descriptor. 2175 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2176 Value *ScalarIV = Induction; 2177 if (IV != OldInduction) { 2178 ScalarIV = IV->getType()->isIntegerTy() 2179 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2180 : Builder.CreateCast(Instruction::SIToFP, Induction, 2181 IV->getType()); 2182 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2183 ScalarIV->setName("offset.idx"); 2184 } 2185 if (Trunc) { 2186 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2187 assert(Step->getType()->isIntegerTy() && 2188 "Truncation requires an integer step"); 2189 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2190 Step = Builder.CreateTrunc(Step, TruncType); 2191 } 2192 return ScalarIV; 2193 }; 2194 2195 // Create the vector values from the scalar IV, in the absence of creating a 2196 // vector IV. 2197 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2198 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2199 for (unsigned Part = 0; Part < UF; ++Part) { 2200 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2201 Value *EntryPart = 2202 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2203 ID.getInductionOpcode()); 2204 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 2205 if (Trunc) 2206 addMetadata(EntryPart, Trunc); 2207 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 2208 } 2209 }; 2210 2211 // Now do the actual transformations, and start with creating the step value. 
2212 Value *Step = CreateStepValue(ID.getStep()); 2213 if (VF.isZero() || VF.isScalar()) { 2214 Value *ScalarIV = CreateScalarIV(Step); 2215 CreateSplatIV(ScalarIV, Step); 2216 return; 2217 } 2218 2219 // Determine if we want a scalar version of the induction variable. This is 2220 // true if the induction variable itself is not widened, or if it has at 2221 // least one user in the loop that is not widened. 2222 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2223 if (!NeedsScalarIV) { 2224 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); 2225 return; 2226 } 2227 2228 // Try to create a new independent vector induction variable. If we can't 2229 // create the phi node, we will splat the scalar induction variable in each 2230 // loop iteration. 2231 if (!shouldScalarizeInstruction(EntryVal)) { 2232 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); 2233 Value *ScalarIV = CreateScalarIV(Step); 2234 // Create scalar steps that can be used by instructions we will later 2235 // scalarize. Note that the addition of the scalar steps will not increase 2236 // the number of instructions in the loop in the common case prior to 2237 // InstCombine. We will be trading one vector extract for each scalar step. 2238 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2239 return; 2240 } 2241 2242 // All IV users are scalar instructions, so only emit a scalar IV, not a 2243 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2244 // predicate used by the masked loads/stores. 2245 Value *ScalarIV = CreateScalarIV(Step); 2246 if (!Cost->isScalarEpilogueAllowed()) 2247 CreateSplatIV(ScalarIV, Step); 2248 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2249 } 2250 2251 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2252 Instruction::BinaryOps BinOp) { 2253 // Create and check the types. 2254 auto *ValVTy = cast<FixedVectorType>(Val->getType()); 2255 int VLen = ValVTy->getNumElements(); 2256 2257 Type *STy = Val->getType()->getScalarType(); 2258 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2259 "Induction Step must be an integer or FP"); 2260 assert(Step->getType() == STy && "Step has wrong type"); 2261 2262 SmallVector<Constant *, 8> Indices; 2263 2264 if (STy->isIntegerTy()) { 2265 // Create a vector of consecutive numbers from zero to VF. 2266 for (int i = 0; i < VLen; ++i) 2267 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2268 2269 // Add the consecutive indices to the vector value. 2270 Constant *Cv = ConstantVector::get(Indices); 2271 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2272 Step = Builder.CreateVectorSplat(VLen, Step); 2273 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2274 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2275 // which can be found from the original scalar operations. 2276 Step = Builder.CreateMul(Cv, Step); 2277 return Builder.CreateAdd(Val, Step, "induction"); 2278 } 2279 2280 // Floating point induction. 2281 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2282 "Binary Opcode should be specified for FP induction"); 2283 // Create a vector of consecutive numbers from zero to VF. 2284 for (int i = 0; i < VLen; ++i) 2285 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2286 2287 // Add the consecutive indices to the vector value. 
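  // E.g. for StartIdx == 0 and VLen == 4 the indices are <0.0, 1.0, 2.0, 3.0>
  // and the induction below is computed as
  //   Val <fadd|fsub> (<0.0, 1.0, 2.0, 3.0> fmul splat(Step)).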
2288 Constant *Cv = ConstantVector::get(Indices); 2289 2290 Step = Builder.CreateVectorSplat(VLen, Step); 2291 2292 // Floating point operations had to be 'fast' to enable the induction. 2293 FastMathFlags Flags; 2294 Flags.setFast(); 2295 2296 Value *MulOp = Builder.CreateFMul(Cv, Step); 2297 if (isa<Instruction>(MulOp)) 2298 // Have to check, MulOp may be a constant 2299 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2300 2301 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2302 if (isa<Instruction>(BOp)) 2303 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2304 return BOp; 2305 } 2306 2307 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2308 Instruction *EntryVal, 2309 const InductionDescriptor &ID) { 2310 // We shouldn't have to build scalar steps if we aren't vectorizing. 2311 assert(VF.isVector() && "VF should be greater than one"); 2312 // Get the value type and ensure it and the step have the same integer type. 2313 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2314 assert(ScalarIVTy == Step->getType() && 2315 "Val and Step should have the same type"); 2316 2317 // We build scalar steps for both integer and floating-point induction 2318 // variables. Here, we determine the kind of arithmetic we will perform. 2319 Instruction::BinaryOps AddOp; 2320 Instruction::BinaryOps MulOp; 2321 if (ScalarIVTy->isIntegerTy()) { 2322 AddOp = Instruction::Add; 2323 MulOp = Instruction::Mul; 2324 } else { 2325 AddOp = ID.getInductionOpcode(); 2326 MulOp = Instruction::FMul; 2327 } 2328 2329 // Determine the number of scalars we need to generate for each unroll 2330 // iteration. If EntryVal is uniform, we only need to generate the first 2331 // lane. Otherwise, we generate all VF values. 2332 unsigned Lanes = 2333 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2334 ? 1 2335 : VF.getKnownMinValue(); 2336 assert((!VF.isScalable() || Lanes == 1) && 2337 "Should never scalarize a scalable vector"); 2338 // Compute the scalar steps and save the results in VectorLoopValueMap. 2339 for (unsigned Part = 0; Part < UF; ++Part) { 2340 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2341 auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2342 ScalarIVTy->getScalarSizeInBits()); 2343 Value *StartIdx = 2344 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2345 if (ScalarIVTy->isFloatingPointTy()) 2346 StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); 2347 StartIdx = addFastMathFlag(Builder.CreateBinOp( 2348 AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane))); 2349 // The step returned by `createStepForVF` is a runtime-evaluated value 2350 // when VF is scalable. Otherwise, it should be folded into a Constant. 
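      // E.g. for a fixed VF of 4 and Part == 1 the start index folds to the
      // constant 4, whereas for a scalable <vscale x 4 x ...> VF it remains a
      // runtime multiple of vscale.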
2351 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2352 "Expected StartIdx to be folded to a constant when VF is not " 2353 "scalable"); 2354 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2355 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2356 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2357 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2358 } 2359 } 2360 } 2361 2362 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2363 assert(V != Induction && "The new induction variable should not be used."); 2364 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2365 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2366 2367 // If we have a stride that is replaced by one, do it here. Defer this for 2368 // the VPlan-native path until we start running Legal checks in that path. 2369 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2370 V = ConstantInt::get(V->getType(), 1); 2371 2372 // If we have a vector mapped to this value, return it. 2373 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2374 return VectorLoopValueMap.getVectorValue(V, Part); 2375 2376 // If the value has not been vectorized, check if it has been scalarized 2377 // instead. If it has been scalarized, and we actually need the value in 2378 // vector form, we will construct the vector values on demand. 2379 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2380 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2381 2382 // If we've scalarized a value, that value should be an instruction. 2383 auto *I = cast<Instruction>(V); 2384 2385 // If we aren't vectorizing, we can just copy the scalar map values over to 2386 // the vector map. 2387 if (VF.isScalar()) { 2388 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2389 return ScalarValue; 2390 } 2391 2392 // Get the last scalar instruction we generated for V and Part. If the value 2393 // is known to be uniform after vectorization, this corresponds to lane zero 2394 // of the Part unroll iteration. Otherwise, the last instruction is the one 2395 // we created for the last vector lane of the Part unroll iteration. 2396 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) 2397 ? 0 2398 : VF.getKnownMinValue() - 1; 2399 assert((!VF.isScalable() || LastLane == 0) && 2400 "Scalable vectorization can't lead to any scalarized values."); 2401 auto *LastInst = cast<Instruction>( 2402 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2403 2404 // Set the insert point after the last scalarized instruction. This ensures 2405 // the insertelement sequence will directly follow the scalar definitions. 2406 auto OldIP = Builder.saveIP(); 2407 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2408 Builder.SetInsertPoint(&*NewIP); 2409 2410 // However, if we are vectorizing, we need to construct the vector values. 2411 // If the value is known to be uniform after vectorization, we can just 2412 // broadcast the scalar value corresponding to lane zero for each unroll 2413 // iteration. Otherwise, we construct the vector values using insertelement 2414 // instructions. Since the resulting vectors are stored in 2415 // VectorLoopValueMap, we will only generate the insertelements once. 
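    // E.g. for a fixed VF of 4, a non-uniform scalarized value is rebuilt by
    // starting from a poison vector and inserting the four per-lane scalars
    // with one insertelement per lane.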
2416 Value *VectorValue = nullptr; 2417 if (Cost->isUniformAfterVectorization(I, VF)) { 2418 VectorValue = getBroadcastInstrs(ScalarValue); 2419 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2420 } else { 2421 // Initialize packing with insertelements to start from poison. 2422 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2423 Value *Poison = PoisonValue::get(VectorType::get(V->getType(), VF)); 2424 VectorLoopValueMap.setVectorValue(V, Part, Poison); 2425 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 2426 packScalarIntoVectorValue(V, {Part, Lane}); 2427 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2428 } 2429 Builder.restoreIP(OldIP); 2430 return VectorValue; 2431 } 2432 2433 // If this scalar is unknown, assume that it is a constant or that it is 2434 // loop invariant. Broadcast V and save the value for future uses. 2435 Value *B = getBroadcastInstrs(V); 2436 VectorLoopValueMap.setVectorValue(V, Part, B); 2437 return B; 2438 } 2439 2440 Value * 2441 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2442 const VPIteration &Instance) { 2443 // If the value is not an instruction contained in the loop, it should 2444 // already be scalar. 2445 if (OrigLoop->isLoopInvariant(V)) 2446 return V; 2447 2448 assert(Instance.Lane > 0 2449 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2450 : true && "Uniform values only have lane zero"); 2451 2452 // If the value from the original loop has not been vectorized, it is 2453 // represented by UF x VF scalar values in the new loop. Return the requested 2454 // scalar value. 2455 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2456 return VectorLoopValueMap.getScalarValue(V, Instance); 2457 2458 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2459 // for the given unroll part. If this entry is not a vector type (i.e., the 2460 // vectorization factor is one), there is no need to generate an 2461 // extractelement instruction. 2462 auto *U = getOrCreateVectorValue(V, Instance.Part); 2463 if (!U->getType()->isVectorTy()) { 2464 assert(VF.isScalar() && "Value not scalarized has non-vector type"); 2465 return U; 2466 } 2467 2468 // Otherwise, the value from the original loop has been vectorized and is 2469 // represented by UF vector values. Extract and return the requested scalar 2470 // value from the appropriate vector lane. 
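  // E.g. requesting {Part 1, Lane 2} of a value whose parts are <4 x i32>
  // vectors yields an 'extractelement <4 x i32> %part1, i32 2'.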
2471 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2472 } 2473 2474 void InnerLoopVectorizer::packScalarIntoVectorValue( 2475 Value *V, const VPIteration &Instance) { 2476 assert(V != Induction && "The new induction variable should not be used."); 2477 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2478 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2479 2480 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2481 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2482 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2483 Builder.getInt32(Instance.Lane)); 2484 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2485 } 2486 2487 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2488 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2489 assert(!VF.isScalable() && "Cannot reverse scalable vectors"); 2490 SmallVector<int, 8> ShuffleMask; 2491 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 2492 ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); 2493 2494 return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse"); 2495 } 2496 2497 // Return whether we allow using masked interleave-groups (for dealing with 2498 // strided loads/stores that reside in predicated blocks, or for dealing 2499 // with gaps). 2500 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2501 // If an override option has been passed in for interleaved accesses, use it. 2502 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2503 return EnableMaskedInterleavedMemAccesses; 2504 2505 return TTI.enableMaskedInterleavedAccessVectorization(); 2506 } 2507 2508 // Try to vectorize the interleave group that \p Instr belongs to. 2509 // 2510 // E.g. Translate following interleaved load group (factor = 3): 2511 // for (i = 0; i < N; i+=3) { 2512 // R = Pic[i]; // Member of index 0 2513 // G = Pic[i+1]; // Member of index 1 2514 // B = Pic[i+2]; // Member of index 2 2515 // ... // do something to R, G, B 2516 // } 2517 // To: 2518 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2519 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2520 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2521 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2522 // 2523 // Or translate following interleaved store group (factor = 3): 2524 // for (i = 0; i < N; i+=3) { 2525 // ... do something to R, G, B 2526 // Pic[i] = R; // Member of index 0 2527 // Pic[i+1] = G; // Member of index 1 2528 // Pic[i+2] = B; // Member of index 2 2529 // } 2530 // To: 2531 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2532 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2533 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2534 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2535 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2536 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2537 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2538 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2539 VPValue *BlockInMask) { 2540 Instruction *Instr = Group->getInsertPos(); 2541 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2542 2543 // Prepare for the vector type of the interleaved load/store. 
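  // E.g. an i32 group with interleave factor 3 and a fixed VF of 4 is
  // accessed through a single <12 x i32> wide vector, matching the example
  // in the comment above.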
2544 Type *ScalarTy = getMemInstValueType(Instr); 2545 unsigned InterleaveFactor = Group->getFactor(); 2546 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2547 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2548 2549 // Prepare for the new pointers. 2550 SmallVector<Value *, 2> AddrParts; 2551 unsigned Index = Group->getIndex(Instr); 2552 2553 // TODO: extend the masked interleaved-group support to reversed access. 2554 assert((!BlockInMask || !Group->isReverse()) && 2555 "Reversed masked interleave-group not supported."); 2556 2557 // If the group is reverse, adjust the index to refer to the last vector lane 2558 // instead of the first. We adjust the index from the first vector lane, 2559 // rather than directly getting the pointer for lane VF - 1, because the 2560 // pointer operand of the interleaved access is supposed to be uniform. For 2561 // uniform instructions, we're only required to generate a value for the 2562 // first vector lane in each unroll iteration. 2563 assert(!VF.isScalable() && 2564 "scalable vector reverse operation is not implemented"); 2565 if (Group->isReverse()) 2566 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2567 2568 for (unsigned Part = 0; Part < UF; Part++) { 2569 Value *AddrPart = State.get(Addr, {Part, 0}); 2570 setDebugLocFromInst(Builder, AddrPart); 2571 2572 // Notice current instruction could be any index. Need to adjust the address 2573 // to the member of index 0. 2574 // 2575 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2576 // b = A[i]; // Member of index 0 2577 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2578 // 2579 // E.g. A[i+1] = a; // Member of index 1 2580 // A[i] = b; // Member of index 0 2581 // A[i+2] = c; // Member of index 2 (Current instruction) 2582 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2583 2584 bool InBounds = false; 2585 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2586 InBounds = gep->isInBounds(); 2587 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2588 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2589 2590 // Cast to the vector pointer type. 2591 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2592 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2593 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2594 } 2595 2596 setDebugLocFromInst(Builder, Instr); 2597 Value *PoisonVec = PoisonValue::get(VecTy); 2598 2599 Value *MaskForGaps = nullptr; 2600 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2601 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2602 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2603 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2604 } 2605 2606 // Vectorize the interleaved load group. 2607 if (isa<LoadInst>(Instr)) { 2608 // For each unroll part, create a wide load for the group. 
2609 SmallVector<Value *, 2> NewLoads; 2610 for (unsigned Part = 0; Part < UF; Part++) { 2611 Instruction *NewLoad; 2612 if (BlockInMask || MaskForGaps) { 2613 assert(useMaskedInterleavedAccesses(*TTI) && 2614 "masked interleaved groups are not allowed."); 2615 Value *GroupMask = MaskForGaps; 2616 if (BlockInMask) { 2617 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2618 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2619 Value *ShuffledMask = Builder.CreateShuffleVector( 2620 BlockInMaskPart, 2621 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2622 "interleaved.mask"); 2623 GroupMask = MaskForGaps 2624 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2625 MaskForGaps) 2626 : ShuffledMask; 2627 } 2628 NewLoad = 2629 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2630 GroupMask, PoisonVec, "wide.masked.vec"); 2631 } 2632 else 2633 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2634 Group->getAlign(), "wide.vec"); 2635 Group->addMetadata(NewLoad); 2636 NewLoads.push_back(NewLoad); 2637 } 2638 2639 // For each member in the group, shuffle out the appropriate data from the 2640 // wide loads. 2641 unsigned J = 0; 2642 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2643 Instruction *Member = Group->getMember(I); 2644 2645 // Skip the gaps in the group. 2646 if (!Member) 2647 continue; 2648 2649 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2650 auto StrideMask = 2651 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2652 for (unsigned Part = 0; Part < UF; Part++) { 2653 Value *StridedVec = Builder.CreateShuffleVector( 2654 NewLoads[Part], StrideMask, "strided.vec"); 2655 2656 // If this member has different type, cast the result type. 2657 if (Member->getType() != ScalarTy) { 2658 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2659 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2660 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2661 } 2662 2663 if (Group->isReverse()) 2664 StridedVec = reverseVector(StridedVec); 2665 2666 State.set(VPDefs[J], Member, StridedVec, Part); 2667 } 2668 ++J; 2669 } 2670 return; 2671 } 2672 2673 // The sub vector type for current instruction. 2674 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2675 auto *SubVT = VectorType::get(ScalarTy, VF); 2676 2677 // Vectorize the interleaved store group. 2678 for (unsigned Part = 0; Part < UF; Part++) { 2679 // Collect the stored vector from each member. 2680 SmallVector<Value *, 4> StoredVecs; 2681 for (unsigned i = 0; i < InterleaveFactor; i++) { 2682 // Interleaved store group doesn't allow a gap, so each index has a member 2683 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2684 2685 Value *StoredVec = State.get(StoredValues[i], Part); 2686 2687 if (Group->isReverse()) 2688 StoredVec = reverseVector(StoredVec); 2689 2690 // If this member has different type, cast it to a unified type. 2691 2692 if (StoredVec->getType() != SubVT) 2693 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2694 2695 StoredVecs.push_back(StoredVec); 2696 } 2697 2698 // Concatenate all vectors into a wide vector. 2699 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2700 2701 // Interleave the elements in the wide vector. 
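// Illustration (assumed VF = 4, factor = 2): the concatenated wide vector is
// [A0,A1,A2,A3,B0,B1,B2,B3] and createInterleaveMask(4, 2) = <0,4,1,5,2,6,3,7>,
// so the shuffle below yields [A0,B0,A1,B1,A2,B2,A3,B3] before the wide store.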
2702 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2703 Value *IVec = Builder.CreateShuffleVector( 2704 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2705 "interleaved.vec"); 2706 2707 Instruction *NewStoreInstr; 2708 if (BlockInMask) { 2709 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2710 Value *ShuffledMask = Builder.CreateShuffleVector( 2711 BlockInMaskPart, 2712 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2713 "interleaved.mask"); 2714 NewStoreInstr = Builder.CreateMaskedStore( 2715 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2716 } 2717 else 2718 NewStoreInstr = 2719 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2720 2721 Group->addMetadata(NewStoreInstr); 2722 } 2723 } 2724 2725 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2726 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2727 VPValue *StoredValue, VPValue *BlockInMask) { 2728 // Attempt to issue a wide load. 2729 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2730 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2731 2732 assert((LI || SI) && "Invalid Load/Store instruction"); 2733 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2734 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2735 2736 LoopVectorizationCostModel::InstWidening Decision = 2737 Cost->getWideningDecision(Instr, VF); 2738 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2739 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2740 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2741 "CM decision is not to widen the memory instruction"); 2742 2743 Type *ScalarDataTy = getMemInstValueType(Instr); 2744 2745 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2746 const Align Alignment = getLoadStoreAlignment(Instr); 2747 2748 // Determine if the pointer operand of the access is either consecutive or 2749 // reverse consecutive. 2750 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2751 bool ConsecutiveStride = 2752 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2753 bool CreateGatherScatter = 2754 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2755 2756 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2757 // gather/scatter. Otherwise Decision should have been to Scalarize. 2758 assert((ConsecutiveStride || CreateGatherScatter) && 2759 "The instruction should be scalarized"); 2760 (void)ConsecutiveStride; 2761 2762 VectorParts BlockInMaskParts(UF); 2763 bool isMaskRequired = BlockInMask; 2764 if (isMaskRequired) 2765 for (unsigned Part = 0; Part < UF; ++Part) 2766 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2767 2768 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2769 // Calculate the pointer for the specific unroll-part. 2770 GetElementPtrInst *PartPtr = nullptr; 2771 2772 bool InBounds = false; 2773 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2774 InBounds = gep->isInBounds(); 2775 2776 if (Reverse) { 2777 assert(!VF.isScalable() && 2778 "Reversing vectors is not yet supported for scalable vectors."); 2779 2780 // If the address is consecutive but reversed, then the 2781 // wide store needs to start at the last vector element. 
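// Worked example (assumed VF = 4; offsets are in elements of ScalarDataTy): the
// two GEPs below first step back by Part * 4 elements and then by a further 3
// (i.e. 1 - VF), so part 0 covers Ptr[-3..0] and part 1 covers Ptr[-7..-4]; the
// loaded/stored values (and the block mask, if any) are reversed so that lane 0
// still corresponds to the first scalar iteration of that part.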
2782 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2783 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); 2784 PartPtr->setIsInBounds(InBounds); 2785 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2786 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); 2787 PartPtr->setIsInBounds(InBounds); 2788 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2789 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2790 } else { 2791 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2792 PartPtr = cast<GetElementPtrInst>( 2793 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2794 PartPtr->setIsInBounds(InBounds); 2795 } 2796 2797 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2798 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2799 }; 2800 2801 // Handle Stores: 2802 if (SI) { 2803 setDebugLocFromInst(Builder, SI); 2804 2805 for (unsigned Part = 0; Part < UF; ++Part) { 2806 Instruction *NewSI = nullptr; 2807 Value *StoredVal = State.get(StoredValue, Part); 2808 if (CreateGatherScatter) { 2809 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2810 Value *VectorGep = State.get(Addr, Part); 2811 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2812 MaskPart); 2813 } else { 2814 if (Reverse) { 2815 // If we store to reverse consecutive memory locations, then we need 2816 // to reverse the order of elements in the stored value. 2817 StoredVal = reverseVector(StoredVal); 2818 // We don't want to update the value in the map as it might be used in 2819 // another expression. So don't call resetVectorValue(StoredVal). 2820 } 2821 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2822 if (isMaskRequired) 2823 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2824 BlockInMaskParts[Part]); 2825 else 2826 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2827 } 2828 addMetadata(NewSI, SI); 2829 } 2830 return; 2831 } 2832 2833 // Handle loads. 2834 assert(LI && "Must have a load instruction"); 2835 setDebugLocFromInst(Builder, LI); 2836 for (unsigned Part = 0; Part < UF; ++Part) { 2837 Value *NewLI; 2838 if (CreateGatherScatter) { 2839 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2840 Value *VectorGep = State.get(Addr, Part); 2841 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2842 nullptr, "wide.masked.gather"); 2843 addMetadata(NewLI, LI); 2844 } else { 2845 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2846 if (isMaskRequired) 2847 NewLI = Builder.CreateMaskedLoad( 2848 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2849 "wide.masked.load"); 2850 else 2851 NewLI = 2852 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2853 2854 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2855 addMetadata(NewLI, LI); 2856 if (Reverse) 2857 NewLI = reverseVector(NewLI); 2858 } 2859 2860 State.set(Def, Instr, NewLI, Part); 2861 } 2862 } 2863 2864 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2865 const VPIteration &Instance, 2866 bool IfPredicateInstr, 2867 VPTransformState &State) { 2868 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2869 2870 setDebugLocFromInst(Builder, Instr); 2871 2872 // Does this instruction return a value ? 
2873 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2874
2875 Instruction *Cloned = Instr->clone();
2876 if (!IsVoidRetTy)
2877 Cloned->setName(Instr->getName() + ".cloned");
2878
2879 // Replace the operands of the cloned instructions with their scalar
2880 // equivalents in the new loop.
2881 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2882 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
2883 auto InputInstance = Instance;
2884 if (!Operand || !OrigLoop->contains(Operand) ||
2885 (Cost->isUniformAfterVectorization(Operand, State.VF)))
2886 InputInstance.Lane = 0;
2887 auto *NewOp = State.get(User.getOperand(op), InputInstance);
2888 Cloned->setOperand(op, NewOp);
2889 }
2890 addNewMetadata(Cloned, Instr);
2891
2892 // Place the cloned scalar in the new loop.
2893 Builder.Insert(Cloned);
2894
2895 // TODO: Set result for VPValue of VPReplicateRecipe. This requires
2896 // representing scalar values in VPTransformState. Add the cloned scalar to
2897 // the scalar map entry.
2898 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2899
2900 // If we just cloned a new assumption, add it to the assumption cache.
2901 if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2902 if (II->getIntrinsicID() == Intrinsic::assume)
2903 AC->registerAssumption(II);
2904
2905 // End if-block.
2906 if (IfPredicateInstr)
2907 PredicatedInstructions.push_back(Cloned);
2908 }
2909
2910 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2911 Value *End, Value *Step,
2912 Instruction *DL) {
2913 BasicBlock *Header = L->getHeader();
2914 BasicBlock *Latch = L->getLoopLatch();
2915 // As we're just creating this loop, it's possible no latch exists
2916 // yet. If so, use the header as this will be a single block loop.
2917 if (!Latch)
2918 Latch = Header;
2919
2920 IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2921 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2922 setDebugLocFromInst(Builder, OldInst);
2923 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2924
2925 Builder.SetInsertPoint(Latch->getTerminator());
2926 setDebugLocFromInst(Builder, OldInst);
2927
2928 // Create i+1 and fill the PHINode.
2929 Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2930 Induction->addIncoming(Start, L->getLoopPreheader());
2931 Induction->addIncoming(Next, Latch);
2932 // Create the compare.
2933 Value *ICmp = Builder.CreateICmpEQ(Next, End);
2934 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
2935
2936 // Now we have two terminators. Remove the old one from the block.
2937 Latch->getTerminator()->eraseFromParent();
2938
2939 return Induction;
2940 }
2941
2942 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2943 if (TripCount)
2944 return TripCount;
2945
2946 assert(L && "Create Trip Count for null loop.");
2947 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2948 // Find the loop boundaries.
2949 ScalarEvolution *SE = PSE.getSE();
2950 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2951 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2952 "Invalid loop count");
2953
2954 Type *IdxTy = Legal->getWidestInductionType();
2955 assert(IdxTy && "No type for induction");
2956
2957 // The exit count might have the type of i64 while the phi is i32. This can
2958 // happen if we have an induction variable that is sign extended before the
2959 // compare.
The only way that we get a backedge taken count is that the 2960 // induction variable was signed and as such will not overflow. In such a case 2961 // truncation is legal. 2962 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2963 IdxTy->getPrimitiveSizeInBits()) 2964 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2965 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2966 2967 // Get the total trip count from the count by adding 1. 2968 const SCEV *ExitCount = SE->getAddExpr( 2969 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2970 2971 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2972 2973 // Expand the trip count and place the new instructions in the preheader. 2974 // Notice that the pre-header does not change, only the loop body. 2975 SCEVExpander Exp(*SE, DL, "induction"); 2976 2977 // Count holds the overall loop count (N). 2978 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2979 L->getLoopPreheader()->getTerminator()); 2980 2981 if (TripCount->getType()->isPointerTy()) 2982 TripCount = 2983 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2984 L->getLoopPreheader()->getTerminator()); 2985 2986 return TripCount; 2987 } 2988 2989 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2990 if (VectorTripCount) 2991 return VectorTripCount; 2992 2993 Value *TC = getOrCreateTripCount(L); 2994 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2995 2996 Type *Ty = TC->getType(); 2997 // This is where we can make the step a runtime constant. 2998 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 2999 3000 // If the tail is to be folded by masking, round the number of iterations N 3001 // up to a multiple of Step instead of rounding down. This is done by first 3002 // adding Step-1 and then rounding down. Note that it's ok if this addition 3003 // overflows: the vector induction variable will eventually wrap to zero given 3004 // that it starts at zero and its Step is a power of two; the loop will then 3005 // exit, with the last early-exit vector comparison also producing all-true. 3006 if (Cost->foldTailByMasking()) { 3007 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3008 "VF*UF must be a power of 2 when folding tail by masking"); 3009 assert(!VF.isScalable() && 3010 "Tail folding not yet supported for scalable vectors"); 3011 TC = Builder.CreateAdd( 3012 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3013 } 3014 3015 // Now we need to generate the expression for the part of the loop that the 3016 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3017 // iterations are not required for correctness, or N - Step, otherwise. Step 3018 // is equal to the vectorization factor (number of SIMD elements) times the 3019 // unroll factor (number of SIMD instructions). 3020 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3021 3022 // There are two cases where we need to ensure (at least) the last iteration 3023 // runs in the scalar remainder loop. Thus, if the step evenly divides 3024 // the trip count, we set the remainder to be equal to the step. If the step 3025 // does not evenly divide the trip count, no adjustment is necessary since 3026 // there will already be scalar iterations. Note that the minimum iterations 3027 // check ensures that N >= Step. 
The cases are: 3028 // 1) If there is a non-reversed interleaved group that may speculatively 3029 // access memory out-of-bounds. 3030 // 2) If any instruction may follow a conditionally taken exit. That is, if 3031 // the loop contains multiple exiting blocks, or a single exiting block 3032 // which is not the latch. 3033 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3034 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3035 R = Builder.CreateSelect(IsZero, Step, R); 3036 } 3037 3038 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3039 3040 return VectorTripCount; 3041 } 3042 3043 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3044 const DataLayout &DL) { 3045 // Verify that V is a vector type with same number of elements as DstVTy. 3046 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3047 unsigned VF = DstFVTy->getNumElements(); 3048 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3049 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3050 Type *SrcElemTy = SrcVecTy->getElementType(); 3051 Type *DstElemTy = DstFVTy->getElementType(); 3052 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3053 "Vector elements must have same size"); 3054 3055 // Do a direct cast if element types are castable. 3056 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3057 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3058 } 3059 // V cannot be directly casted to desired vector type. 3060 // May happen when V is a floating point vector but DstVTy is a vector of 3061 // pointers or vice-versa. Handle this using a two-step bitcast using an 3062 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3063 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3064 "Only one type should be a pointer type"); 3065 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3066 "Only one type should be a floating point type"); 3067 Type *IntTy = 3068 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3069 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3070 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3071 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3072 } 3073 3074 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3075 BasicBlock *Bypass) { 3076 Value *Count = getOrCreateTripCount(L); 3077 // Reuse existing vector loop preheader for TC checks. 3078 // Note that new preheader block is generated for vector loop. 3079 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3080 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3081 3082 // Generate code to check if the loop's trip count is less than VF * UF, or 3083 // equal to it in case a scalar epilogue is required; this implies that the 3084 // vector trip count is zero. This check also covers the case where adding one 3085 // to the backedge-taken count overflowed leading to an incorrect trip count 3086 // of zero. In this case we will also jump to the scalar loop. 3087 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3088 : ICmpInst::ICMP_ULT; 3089 3090 // If tail is to be folded, vector loop takes care of all iterations. 
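// Otherwise, a worked example (hypothetical numbers): with Count = 10, VF = 4
// and UF = 2 the step is 8, and the check below is "10 ult 8" (or "10 ule 8"
// when a scalar epilogue is required); it is false, so control falls through to
// the vector preheader instead of taking the bypass to the scalar loop.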
3091 Value *CheckMinIters = Builder.getFalse(); 3092 if (!Cost->foldTailByMasking()) { 3093 Value *Step = 3094 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3095 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3096 } 3097 // Create new preheader for vector loop. 3098 LoopVectorPreHeader = 3099 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3100 "vector.ph"); 3101 3102 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3103 DT->getNode(Bypass)->getIDom()) && 3104 "TC check is expected to dominate Bypass"); 3105 3106 // Update dominator for Bypass & LoopExit. 3107 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3108 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3109 3110 ReplaceInstWithInst( 3111 TCCheckBlock->getTerminator(), 3112 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3113 LoopBypassBlocks.push_back(TCCheckBlock); 3114 } 3115 3116 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3117 // Reuse existing vector loop preheader for SCEV checks. 3118 // Note that new preheader block is generated for vector loop. 3119 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 3120 3121 // Generate the code to check that the SCEV assumptions that we made. 3122 // We want the new basic block to start at the first instruction in a 3123 // sequence of instructions that form a check. 3124 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 3125 "scev.check"); 3126 Value *SCEVCheck = Exp.expandCodeForPredicate( 3127 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 3128 3129 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 3130 if (C->isZero()) 3131 return; 3132 3133 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3134 (OptForSizeBasedOnProfile && 3135 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3136 "Cannot SCEV check stride or overflow when optimizing for size"); 3137 3138 SCEVCheckBlock->setName("vector.scevcheck"); 3139 // Create new preheader for vector loop. 3140 LoopVectorPreHeader = 3141 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 3142 nullptr, "vector.ph"); 3143 3144 // Update dominator only if this is first RT check. 3145 if (LoopBypassBlocks.empty()) { 3146 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3147 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3148 } 3149 3150 ReplaceInstWithInst( 3151 SCEVCheckBlock->getTerminator(), 3152 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 3153 LoopBypassBlocks.push_back(SCEVCheckBlock); 3154 AddedSafetyChecks = true; 3155 } 3156 3157 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 3158 // VPlan-native path does not do any analysis for runtime checks currently. 3159 if (EnableVPlanNativePath) 3160 return; 3161 3162 // Reuse existing vector loop preheader for runtime memory checks. 3163 // Note that new preheader block is generated for vector loop. 3164 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 3165 3166 // Generate the code that checks in runtime if arrays overlap. We put the 3167 // checks into a separate block to make the more common case of few elements 3168 // faster. 
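// Sketch of the emitted checks (illustrative shape only; the actual IR comes
// from addRuntimeChecks below): for two pointer groups A and B the test is
// roughly "A.Start < B.End && B.Start < A.End", i.e. their accessed ranges
// overlap; if any pair may overlap we take the bypass and run the original
// scalar loop instead.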
3169 auto *LAI = Legal->getLAI(); 3170 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 3171 if (!RtPtrChecking.Need) 3172 return; 3173 3174 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3175 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3176 "Cannot emit memory checks when optimizing for size, unless forced " 3177 "to vectorize."); 3178 ORE->emit([&]() { 3179 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3180 L->getStartLoc(), L->getHeader()) 3181 << "Code-size may be reduced by not forcing " 3182 "vectorization, or by source-code modifications " 3183 "eliminating the need for runtime checks " 3184 "(e.g., adding 'restrict')."; 3185 }); 3186 } 3187 3188 MemCheckBlock->setName("vector.memcheck"); 3189 // Create new preheader for vector loop. 3190 LoopVectorPreHeader = 3191 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 3192 "vector.ph"); 3193 3194 auto *CondBranch = cast<BranchInst>( 3195 Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); 3196 ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); 3197 LoopBypassBlocks.push_back(MemCheckBlock); 3198 AddedSafetyChecks = true; 3199 3200 // Update dominator only if this is first RT check. 3201 if (LoopBypassBlocks.empty()) { 3202 DT->changeImmediateDominator(Bypass, MemCheckBlock); 3203 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 3204 } 3205 3206 Instruction *FirstCheckInst; 3207 Instruction *MemRuntimeCheck; 3208 std::tie(FirstCheckInst, MemRuntimeCheck) = 3209 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 3210 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 3211 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 3212 "claimed checks are required"); 3213 CondBranch->setCondition(MemRuntimeCheck); 3214 3215 // We currently don't use LoopVersioning for the actual loop cloning but we 3216 // still use it to add the noalias metadata. 3217 LVer = std::make_unique<LoopVersioning>( 3218 *Legal->getLAI(), 3219 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3220 DT, PSE.getSE()); 3221 LVer->prepareNoAliasMetadata(); 3222 } 3223 3224 Value *InnerLoopVectorizer::emitTransformedIndex( 3225 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3226 const InductionDescriptor &ID) const { 3227 3228 SCEVExpander Exp(*SE, DL, "induction"); 3229 auto Step = ID.getStep(); 3230 auto StartValue = ID.getStartValue(); 3231 assert(Index->getType() == Step->getType() && 3232 "Index type does not match StepValue type"); 3233 3234 // Note: the IR at this point is broken. We cannot use SE to create any new 3235 // SCEV and then expand it, hoping that SCEV's simplification will give us 3236 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3237 // lead to various SCEV crashes. So all we can do is to use builder and rely 3238 // on InstCombine for future simplifications. Here we handle some trivial 3239 // cases only. 
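// For example (illustrative): an integer induction with start S and step C maps
// Index to S + C * Index; the helpers below fold the common "+ 0" and "* 1"
// forms directly rather than emitting the redundant instructions and waiting
// for InstCombine to clean them up.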
3240 auto CreateAdd = [&B](Value *X, Value *Y) { 3241 assert(X->getType() == Y->getType() && "Types don't match!"); 3242 if (auto *CX = dyn_cast<ConstantInt>(X)) 3243 if (CX->isZero()) 3244 return Y; 3245 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3246 if (CY->isZero()) 3247 return X; 3248 return B.CreateAdd(X, Y); 3249 }; 3250 3251 auto CreateMul = [&B](Value *X, Value *Y) { 3252 assert(X->getType() == Y->getType() && "Types don't match!"); 3253 if (auto *CX = dyn_cast<ConstantInt>(X)) 3254 if (CX->isOne()) 3255 return Y; 3256 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3257 if (CY->isOne()) 3258 return X; 3259 return B.CreateMul(X, Y); 3260 }; 3261 3262 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3263 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3264 // the DomTree is not kept up-to-date for additional blocks generated in the 3265 // vector loop. By using the header as insertion point, we guarantee that the 3266 // expanded instructions dominate all their uses. 3267 auto GetInsertPoint = [this, &B]() { 3268 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3269 if (InsertBB != LoopVectorBody && 3270 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3271 return LoopVectorBody->getTerminator(); 3272 return &*B.GetInsertPoint(); 3273 }; 3274 switch (ID.getKind()) { 3275 case InductionDescriptor::IK_IntInduction: { 3276 assert(Index->getType() == StartValue->getType() && 3277 "Index type does not match StartValue type"); 3278 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3279 return B.CreateSub(StartValue, Index); 3280 auto *Offset = CreateMul( 3281 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3282 return CreateAdd(StartValue, Offset); 3283 } 3284 case InductionDescriptor::IK_PtrInduction: { 3285 assert(isa<SCEVConstant>(Step) && 3286 "Expected constant step for pointer induction"); 3287 return B.CreateGEP( 3288 StartValue->getType()->getPointerElementType(), StartValue, 3289 CreateMul(Index, 3290 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3291 } 3292 case InductionDescriptor::IK_FpInduction: { 3293 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3294 auto InductionBinOp = ID.getInductionBinOp(); 3295 assert(InductionBinOp && 3296 (InductionBinOp->getOpcode() == Instruction::FAdd || 3297 InductionBinOp->getOpcode() == Instruction::FSub) && 3298 "Original bin op should be defined for FP induction"); 3299 3300 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3301 3302 // Floating point operations had to be 'fast' to enable the induction. 3303 FastMathFlags Flags; 3304 Flags.setFast(); 3305 3306 Value *MulExp = B.CreateFMul(StepValue, Index); 3307 if (isa<Instruction>(MulExp)) 3308 // We have to check, the MulExp may be a constant. 
3309 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 3310 3311 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3312 "induction"); 3313 if (isa<Instruction>(BOp)) 3314 cast<Instruction>(BOp)->setFastMathFlags(Flags); 3315 3316 return BOp; 3317 } 3318 case InductionDescriptor::IK_NoInduction: 3319 return nullptr; 3320 } 3321 llvm_unreachable("invalid enum"); 3322 } 3323 3324 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3325 LoopScalarBody = OrigLoop->getHeader(); 3326 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3327 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3328 assert(LoopExitBlock && "Must have an exit block"); 3329 assert(LoopVectorPreHeader && "Invalid loop structure"); 3330 3331 LoopMiddleBlock = 3332 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3333 LI, nullptr, Twine(Prefix) + "middle.block"); 3334 LoopScalarPreHeader = 3335 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3336 nullptr, Twine(Prefix) + "scalar.ph"); 3337 3338 // Set up branch from middle block to the exit and scalar preheader blocks. 3339 // completeLoopSkeleton will update the condition to use an iteration check, 3340 // if required to decide whether to execute the remainder. 3341 BranchInst *BrInst = 3342 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3343 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3344 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3345 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3346 3347 // We intentionally don't let SplitBlock to update LoopInfo since 3348 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3349 // LoopVectorBody is explicitly added to the correct place few lines later. 3350 LoopVectorBody = 3351 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3352 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3353 3354 // Update dominator for loop exit. 3355 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3356 3357 // Create and register the new vector loop. 3358 Loop *Lp = LI->AllocateLoop(); 3359 Loop *ParentLoop = OrigLoop->getParentLoop(); 3360 3361 // Insert the new loop into the loop nest and register the new basic blocks 3362 // before calling any utilities such as SCEV that require valid LoopInfo. 3363 if (ParentLoop) { 3364 ParentLoop->addChildLoop(Lp); 3365 } else { 3366 LI->addTopLevelLoop(Lp); 3367 } 3368 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3369 return Lp; 3370 } 3371 3372 void InnerLoopVectorizer::createInductionResumeValues( 3373 Loop *L, Value *VectorTripCount, 3374 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3375 assert(VectorTripCount && L && "Expected valid arguments"); 3376 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3377 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3378 "Inconsistent information about additional bypass."); 3379 // We are going to resume the execution of the scalar loop. 3380 // Go over all of the induction variables that we found and fix the 3381 // PHIs that are left in the scalar version of the loop. 3382 // The starting values of PHI nodes depend on the counter of the last 3383 // iteration in the vectorized loop. 3384 // If we come from a bypass edge then we need to start from the original 3385 // start value. 
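// Illustrative shape of each resume phi created below (block and value names
// are examples only):
//   %bc.resume.val = phi i64 [ %ind.end, %middle.block ],
//                            [ %start,   %vector.scevcheck ],
//                            [ %start,   %vector.memcheck ]
// i.e. the transformed end value when arriving from the vector loop, and the
// original start value when arriving from any bypass block.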
3386 for (auto &InductionEntry : Legal->getInductionVars()) { 3387 PHINode *OrigPhi = InductionEntry.first; 3388 InductionDescriptor II = InductionEntry.second; 3389 3390 // Create phi nodes to merge from the backedge-taken check block. 3391 PHINode *BCResumeVal = 3392 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3393 LoopScalarPreHeader->getTerminator()); 3394 // Copy original phi DL over to the new one. 3395 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3396 Value *&EndValue = IVEndValues[OrigPhi]; 3397 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3398 if (OrigPhi == OldInduction) { 3399 // We know what the end value is. 3400 EndValue = VectorTripCount; 3401 } else { 3402 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3403 Type *StepType = II.getStep()->getType(); 3404 Instruction::CastOps CastOp = 3405 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3406 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3407 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3408 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3409 EndValue->setName("ind.end"); 3410 3411 // Compute the end value for the additional bypass (if applicable). 3412 if (AdditionalBypass.first) { 3413 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3414 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3415 StepType, true); 3416 CRD = 3417 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3418 EndValueFromAdditionalBypass = 3419 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3420 EndValueFromAdditionalBypass->setName("ind.end"); 3421 } 3422 } 3423 // The new PHI merges the original incoming value, in case of a bypass, 3424 // or the value at the end of the vectorized loop. 3425 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3426 3427 // Fix the scalar body counter (PHI node). 3428 // The old induction's phi node in the scalar body needs the truncated 3429 // value. 3430 for (BasicBlock *BB : LoopBypassBlocks) 3431 BCResumeVal->addIncoming(II.getStartValue(), BB); 3432 3433 if (AdditionalBypass.first) 3434 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3435 EndValueFromAdditionalBypass); 3436 3437 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3438 } 3439 } 3440 3441 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3442 MDNode *OrigLoopID) { 3443 assert(L && "Expected valid loop."); 3444 3445 // The trip counts should be cached by now. 3446 Value *Count = getOrCreateTripCount(L); 3447 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3448 3449 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3450 3451 // Add a check in the middle block to see if we have completed 3452 // all of the iterations in the first vector loop. 3453 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3454 // If tail is to be folded, we know we don't need to run the remainder. 3455 if (!Cost->foldTailByMasking()) { 3456 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3457 Count, VectorTripCount, "cmp.n", 3458 LoopMiddleBlock->getTerminator()); 3459 3460 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3461 // of the corresponding compare because they may have ended up with 3462 // different line numbers and we want to avoid awkward line stepping while 3463 // debugging. Eg. if the compare has got a line number inside the loop. 
3464 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3465 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3466 } 3467 3468 // Get ready to start creating new instructions into the vectorized body. 3469 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3470 "Inconsistent vector loop preheader"); 3471 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3472 3473 Optional<MDNode *> VectorizedLoopID = 3474 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3475 LLVMLoopVectorizeFollowupVectorized}); 3476 if (VectorizedLoopID.hasValue()) { 3477 L->setLoopID(VectorizedLoopID.getValue()); 3478 3479 // Do not setAlreadyVectorized if loop attributes have been defined 3480 // explicitly. 3481 return LoopVectorPreHeader; 3482 } 3483 3484 // Keep all loop hints from the original loop on the vector loop (we'll 3485 // replace the vectorizer-specific hints below). 3486 if (MDNode *LID = OrigLoop->getLoopID()) 3487 L->setLoopID(LID); 3488 3489 LoopVectorizeHints Hints(L, true, *ORE); 3490 Hints.setAlreadyVectorized(); 3491 3492 #ifdef EXPENSIVE_CHECKS 3493 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3494 LI->verify(*DT); 3495 #endif 3496 3497 return LoopVectorPreHeader; 3498 } 3499 3500 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3501 /* 3502 In this function we generate a new loop. The new loop will contain 3503 the vectorized instructions while the old loop will continue to run the 3504 scalar remainder. 3505 3506 [ ] <-- loop iteration number check. 3507 / | 3508 / v 3509 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3510 | / | 3511 | / v 3512 || [ ] <-- vector pre header. 3513 |/ | 3514 | v 3515 | [ ] \ 3516 | [ ]_| <-- vector loop. 3517 | | 3518 | v 3519 | -[ ] <--- middle-block. 3520 | / | 3521 | / v 3522 -|- >[ ] <--- new preheader. 3523 | | 3524 | v 3525 | [ ] \ 3526 | [ ]_| <-- old scalar loop to handle remainder. 3527 \ | 3528 \ v 3529 >[ ] <-- exit block. 3530 ... 3531 */ 3532 3533 // Get the metadata of the original loop before it gets modified. 3534 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3535 3536 // Create an empty vector loop, and prepare basic blocks for the runtime 3537 // checks. 3538 Loop *Lp = createVectorLoopSkeleton(""); 3539 3540 // Now, compare the new count to zero. If it is zero skip the vector loop and 3541 // jump to the scalar loop. This check also covers the case where the 3542 // backedge-taken count is uint##_max: adding one to it will overflow leading 3543 // to an incorrect trip count of zero. In this (rare) case we will also jump 3544 // to the scalar loop. 3545 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3546 3547 // Generate the code to check any assumptions that we've made for SCEV 3548 // expressions. 3549 emitSCEVChecks(Lp, LoopScalarPreHeader); 3550 3551 // Generate the code that checks in runtime if arrays overlap. We put the 3552 // checks into a separate block to make the more common case of few elements 3553 // faster. 3554 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3555 3556 // Some loops have a single integer induction variable, while other loops 3557 // don't. One example is c++ iterators that often have multiple pointer 3558 // induction variables. In the code below we also support a case where we 3559 // don't have a single induction variable. 3560 // 3561 // We try to obtain an induction variable from the original loop as hard 3562 // as possible. 
However if we don't find one that: 3563 // - is an integer 3564 // - counts from zero, stepping by one 3565 // - is the size of the widest induction variable type 3566 // then we create a new one. 3567 OldInduction = Legal->getPrimaryInduction(); 3568 Type *IdxTy = Legal->getWidestInductionType(); 3569 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3570 // The loop step is equal to the vectorization factor (num of SIMD elements) 3571 // times the unroll factor (num of SIMD instructions). 3572 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3573 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3574 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3575 Induction = 3576 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3577 getDebugLocFromInstOrOperands(OldInduction)); 3578 3579 // Emit phis for the new starting index of the scalar loop. 3580 createInductionResumeValues(Lp, CountRoundDown); 3581 3582 return completeLoopSkeleton(Lp, OrigLoopID); 3583 } 3584 3585 // Fix up external users of the induction variable. At this point, we are 3586 // in LCSSA form, with all external PHIs that use the IV having one input value, 3587 // coming from the remainder loop. We need those PHIs to also have a correct 3588 // value for the IV when arriving directly from the middle block. 3589 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3590 const InductionDescriptor &II, 3591 Value *CountRoundDown, Value *EndValue, 3592 BasicBlock *MiddleBlock) { 3593 // There are two kinds of external IV usages - those that use the value 3594 // computed in the last iteration (the PHI) and those that use the penultimate 3595 // value (the value that feeds into the phi from the loop latch). 3596 // We allow both, but they, obviously, have different values. 3597 3598 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3599 3600 DenseMap<Value *, Value *> MissingVals; 3601 3602 // An external user of the last iteration's value should see the value that 3603 // the remainder loop uses to initialize its own IV. 3604 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3605 for (User *U : PostInc->users()) { 3606 Instruction *UI = cast<Instruction>(U); 3607 if (!OrigLoop->contains(UI)) { 3608 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3609 MissingVals[UI] = EndValue; 3610 } 3611 } 3612 3613 // An external user of the penultimate value need to see EndValue - Step. 3614 // The simplest way to get this is to recompute it from the constituent SCEVs, 3615 // that is Start + (Step * (CRD - 1)). 3616 for (User *U : OrigPhi->users()) { 3617 auto *UI = cast<Instruction>(U); 3618 if (!OrigLoop->contains(UI)) { 3619 const DataLayout &DL = 3620 OrigLoop->getHeader()->getModule()->getDataLayout(); 3621 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3622 3623 IRBuilder<> B(MiddleBlock->getTerminator()); 3624 Value *CountMinusOne = B.CreateSub( 3625 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3626 Value *CMO = 3627 !II.getStep()->getType()->isIntegerTy() 3628 ? 
B.CreateCast(Instruction::SIToFP, CountMinusOne, 3629 II.getStep()->getType()) 3630 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3631 CMO->setName("cast.cmo"); 3632 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3633 Escape->setName("ind.escape"); 3634 MissingVals[UI] = Escape; 3635 } 3636 } 3637 3638 for (auto &I : MissingVals) { 3639 PHINode *PHI = cast<PHINode>(I.first); 3640 // One corner case we have to handle is two IVs "chasing" each-other, 3641 // that is %IV2 = phi [...], [ %IV1, %latch ] 3642 // In this case, if IV1 has an external use, we need to avoid adding both 3643 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3644 // don't already have an incoming value for the middle block. 3645 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3646 PHI->addIncoming(I.second, MiddleBlock); 3647 } 3648 } 3649 3650 namespace { 3651 3652 struct CSEDenseMapInfo { 3653 static bool canHandle(const Instruction *I) { 3654 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3655 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3656 } 3657 3658 static inline Instruction *getEmptyKey() { 3659 return DenseMapInfo<Instruction *>::getEmptyKey(); 3660 } 3661 3662 static inline Instruction *getTombstoneKey() { 3663 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3664 } 3665 3666 static unsigned getHashValue(const Instruction *I) { 3667 assert(canHandle(I) && "Unknown instruction!"); 3668 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3669 I->value_op_end())); 3670 } 3671 3672 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3673 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3674 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3675 return LHS == RHS; 3676 return LHS->isIdenticalTo(RHS); 3677 } 3678 }; 3679 3680 } // end anonymous namespace 3681 3682 ///Perform cse of induction variable instructions. 3683 static void cse(BasicBlock *BB) { 3684 // Perform simple cse. 3685 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3686 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3687 Instruction *In = &*I++; 3688 3689 if (!CSEDenseMapInfo::canHandle(In)) 3690 continue; 3691 3692 // Check if we can replace this instruction with any of the 3693 // visited instructions. 3694 if (Instruction *V = CSEMap.lookup(In)) { 3695 In->replaceAllUsesWith(V); 3696 In->eraseFromParent(); 3697 continue; 3698 } 3699 3700 CSEMap[In] = In; 3701 } 3702 } 3703 3704 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3705 ElementCount VF, 3706 bool &NeedToScalarize) { 3707 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3708 Function *F = CI->getCalledFunction(); 3709 Type *ScalarRetTy = CI->getType(); 3710 SmallVector<Type *, 4> Tys, ScalarTys; 3711 for (auto &ArgOp : CI->arg_operands()) 3712 ScalarTys.push_back(ArgOp->getType()); 3713 3714 // Estimate cost of scalarized vector call. The source operands are assumed 3715 // to be vectors, so we need to extract individual elements from there, 3716 // execute VF scalar calls, and then gather the result into the vector return 3717 // value. 3718 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3719 TTI::TCK_RecipThroughput); 3720 if (VF.isScalar()) 3721 return ScalarCallCost; 3722 3723 // Compute corresponding vector type for return value and arguments. 
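// E.g. (illustrative): a call "float foo(float, i32)" widened with VF = 4 gets
// RetTy = <4 x float> and Tys = { <4 x float>, <4 x i32> }; with a scalar call
// cost of 10 and a scalarization overhead of 12, the scalarized estimate below
// is 4 * 10 + 12 = 52, which a cheaper vector variant (if the VFDatabase
// provides one) can then beat.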
3724 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3725 for (Type *ScalarTy : ScalarTys) 3726 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3727 3728 // Compute costs of unpacking argument values for the scalar calls and 3729 // packing the return values to a vector. 3730 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3731 3732 unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3733 3734 // If we can't emit a vector call for this function, then the currently found 3735 // cost is the cost we need to return. 3736 NeedToScalarize = true; 3737 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3738 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3739 3740 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3741 return Cost; 3742 3743 // If the corresponding vector cost is cheaper, return its cost. 3744 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3745 TTI::TCK_RecipThroughput); 3746 if (VectorCallCost < Cost) { 3747 NeedToScalarize = false; 3748 return VectorCallCost; 3749 } 3750 return Cost; 3751 } 3752 3753 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3754 ElementCount VF) { 3755 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3756 assert(ID && "Expected intrinsic call!"); 3757 3758 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3759 return TTI.getIntrinsicInstrCost(CostAttrs, 3760 TargetTransformInfo::TCK_RecipThroughput); 3761 } 3762 3763 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3764 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3765 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3766 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3767 } 3768 3769 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3770 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3771 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3772 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3773 } 3774 3775 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3776 // For every instruction `I` in MinBWs, truncate the operands, create a 3777 // truncated version of `I` and reextend its result. InstCombine runs 3778 // later and will remove any ext/trunc pairs. 3779 SmallPtrSet<Value *, 4> Erased; 3780 for (const auto &KV : Cost->getMinimalBitwidths()) { 3781 // If the value wasn't vectorized, we must maintain the original scalar 3782 // type. The absence of the value from VectorLoopValueMap indicates that it 3783 // wasn't vectorized. 3784 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3785 continue; 3786 for (unsigned Part = 0; Part < UF; ++Part) { 3787 Value *I = getOrCreateVectorValue(KV.first, Part); 3788 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3789 continue; 3790 Type *OriginalTy = I->getType(); 3791 Type *ScalarTruncatedTy = 3792 IntegerType::get(OriginalTy->getContext(), KV.second); 3793 auto *TruncatedTy = FixedVectorType::get( 3794 ScalarTruncatedTy, 3795 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3796 if (TruncatedTy == OriginalTy) 3797 continue; 3798 3799 IRBuilder<> B(cast<Instruction>(I)); 3800 auto ShrinkOperand = [&](Value *V) -> Value * { 3801 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3802 if (ZI->getSrcTy() == TruncatedTy) 3803 return ZI->getOperand(0); 3804 return B.CreateZExtOrTrunc(V, TruncatedTy); 3805 }; 3806 3807 // The actual instruction modification depends on the instruction type, 3808 // unfortunately. 
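// For example (illustrative): if MinBWs records that an i32 add only needs 8
// bits, its operands are truncated (or taken from existing zexts) to <VF x i8>,
// the add is re-created at the narrow type, and the result is zero-extended
// back to <VF x i32> for its existing users; InstCombine later removes any
// redundant ext/trunc pairs.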
3809 Value *NewI = nullptr; 3810 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3811 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3812 ShrinkOperand(BO->getOperand(1))); 3813 3814 // Any wrapping introduced by shrinking this operation shouldn't be 3815 // considered undefined behavior. So, we can't unconditionally copy 3816 // arithmetic wrapping flags to NewI. 3817 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3818 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3819 NewI = 3820 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3821 ShrinkOperand(CI->getOperand(1))); 3822 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3823 NewI = B.CreateSelect(SI->getCondition(), 3824 ShrinkOperand(SI->getTrueValue()), 3825 ShrinkOperand(SI->getFalseValue())); 3826 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3827 switch (CI->getOpcode()) { 3828 default: 3829 llvm_unreachable("Unhandled cast!"); 3830 case Instruction::Trunc: 3831 NewI = ShrinkOperand(CI->getOperand(0)); 3832 break; 3833 case Instruction::SExt: 3834 NewI = B.CreateSExtOrTrunc( 3835 CI->getOperand(0), 3836 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3837 break; 3838 case Instruction::ZExt: 3839 NewI = B.CreateZExtOrTrunc( 3840 CI->getOperand(0), 3841 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3842 break; 3843 } 3844 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3845 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3846 ->getNumElements(); 3847 auto *O0 = B.CreateZExtOrTrunc( 3848 SI->getOperand(0), 3849 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3850 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3851 ->getNumElements(); 3852 auto *O1 = B.CreateZExtOrTrunc( 3853 SI->getOperand(1), 3854 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3855 3856 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3857 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3858 // Don't do anything with the operands, just extend the result. 3859 continue; 3860 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3861 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3862 ->getNumElements(); 3863 auto *O0 = B.CreateZExtOrTrunc( 3864 IE->getOperand(0), 3865 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3866 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3867 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3868 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3869 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3870 ->getNumElements(); 3871 auto *O0 = B.CreateZExtOrTrunc( 3872 EE->getOperand(0), 3873 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3874 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3875 } else { 3876 // If we don't know what to do, be conservative and don't do anything. 3877 continue; 3878 } 3879 3880 // Lastly, extend the result. 3881 NewI->takeName(cast<Instruction>(I)); 3882 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3883 I->replaceAllUsesWith(Res); 3884 cast<Instruction>(I)->eraseFromParent(); 3885 Erased.insert(I); 3886 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3887 } 3888 } 3889 3890 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3891 for (const auto &KV : Cost->getMinimalBitwidths()) { 3892 // If the value wasn't vectorized, we must maintain the original scalar 3893 // type. 
The absence of the value from VectorLoopValueMap indicates that it 3894 // wasn't vectorized. 3895 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3896 continue; 3897 for (unsigned Part = 0; Part < UF; ++Part) { 3898 Value *I = getOrCreateVectorValue(KV.first, Part); 3899 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3900 if (Inst && Inst->use_empty()) { 3901 Value *NewI = Inst->getOperand(0); 3902 Inst->eraseFromParent(); 3903 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3904 } 3905 } 3906 } 3907 } 3908 3909 void InnerLoopVectorizer::fixVectorizedLoop() { 3910 // Insert truncates and extends for any truncated instructions as hints to 3911 // InstCombine. 3912 if (VF.isVector()) 3913 truncateToMinimalBitwidths(); 3914 3915 // Fix widened non-induction PHIs by setting up the PHI operands. 3916 if (OrigPHIsToFix.size()) { 3917 assert(EnableVPlanNativePath && 3918 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3919 fixNonInductionPHIs(); 3920 } 3921 3922 // At this point every instruction in the original loop is widened to a 3923 // vector form. Now we need to fix the recurrences in the loop. These PHI 3924 // nodes are currently empty because we did not want to introduce cycles. 3925 // This is the second stage of vectorizing recurrences. 3926 fixCrossIterationPHIs(); 3927 3928 // Forget the original basic block. 3929 PSE.getSE()->forgetLoop(OrigLoop); 3930 3931 // Fix-up external users of the induction variables. 3932 for (auto &Entry : Legal->getInductionVars()) 3933 fixupIVUsers(Entry.first, Entry.second, 3934 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3935 IVEndValues[Entry.first], LoopMiddleBlock); 3936 3937 fixLCSSAPHIs(); 3938 for (Instruction *PI : PredicatedInstructions) 3939 sinkScalarOperands(&*PI); 3940 3941 // Remove redundant induction instructions. 3942 cse(LoopVectorBody); 3943 3944 // Set/update profile weights for the vector and remainder loops as original 3945 // loop iterations are now distributed among them. Note that original loop 3946 // represented by LoopScalarBody becomes remainder loop after vectorization. 3947 // 3948 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3949 // end up getting slightly roughened result but that should be OK since 3950 // profile is not inherently precise anyway. Note also possible bypass of 3951 // vector code caused by legality checks is ignored, assigning all the weight 3952 // to the vector loop, optimistically. 3953 // 3954 // For scalable vectorization we can't know at compile time how many iterations 3955 // of the loop are handled in one vector iteration, so instead assume a pessimistic 3956 // vscale of '1'. 3957 setProfileInfoAfterUnrolling( 3958 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 3959 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 3960 } 3961 3962 void InnerLoopVectorizer::fixCrossIterationPHIs() { 3963 // In order to support recurrences we need to be able to vectorize Phi nodes. 3964 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3965 // stage #2: We now need to fix the recurrences by adding incoming edges to 3966 // the currently empty PHI nodes. At this point every instruction in the 3967 // original loop is widened to a vector form so we can use them to construct 3968 // the incoming edges. 3969 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 3970 // Handle first-order recurrences and reductions that need to be fixed. 
3971 if (Legal->isFirstOrderRecurrence(&Phi)) 3972 fixFirstOrderRecurrence(&Phi); 3973 else if (Legal->isReductionVariable(&Phi)) 3974 fixReduction(&Phi); 3975 } 3976 } 3977 3978 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3979 // This is the second phase of vectorizing first-order recurrences. An 3980 // overview of the transformation is described below. Suppose we have the 3981 // following loop. 3982 // 3983 // for (int i = 0; i < n; ++i) 3984 // b[i] = a[i] - a[i - 1]; 3985 // 3986 // There is a first-order recurrence on "a". For this loop, the shorthand 3987 // scalar IR looks like: 3988 // 3989 // scalar.ph: 3990 // s_init = a[-1] 3991 // br scalar.body 3992 // 3993 // scalar.body: 3994 // i = phi [0, scalar.ph], [i+1, scalar.body] 3995 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3996 // s2 = a[i] 3997 // b[i] = s2 - s1 3998 // br cond, scalar.body, ... 3999 // 4000 // In this example, s1 is a recurrence because it's value depends on the 4001 // previous iteration. In the first phase of vectorization, we created a 4002 // temporary value for s1. We now complete the vectorization and produce the 4003 // shorthand vector IR shown below (for VF = 4, UF = 1). 4004 // 4005 // vector.ph: 4006 // v_init = vector(..., ..., ..., a[-1]) 4007 // br vector.body 4008 // 4009 // vector.body 4010 // i = phi [0, vector.ph], [i+4, vector.body] 4011 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4012 // v2 = a[i, i+1, i+2, i+3]; 4013 // v3 = vector(v1(3), v2(0, 1, 2)) 4014 // b[i, i+1, i+2, i+3] = v2 - v3 4015 // br cond, vector.body, middle.block 4016 // 4017 // middle.block: 4018 // x = v2(3) 4019 // br scalar.ph 4020 // 4021 // scalar.ph: 4022 // s_init = phi [x, middle.block], [a[-1], otherwise] 4023 // br scalar.body 4024 // 4025 // After execution completes the vector loop, we extract the next value of 4026 // the recurrence (x) to use as the initial value in the scalar loop. 4027 4028 // Get the original loop preheader and single loop latch. 4029 auto *Preheader = OrigLoop->getLoopPreheader(); 4030 auto *Latch = OrigLoop->getLoopLatch(); 4031 4032 // Get the initial and previous values of the scalar recurrence. 4033 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4034 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4035 4036 // Create a vector from the initial value. 4037 auto *VectorInit = ScalarInit; 4038 if (VF.isVector()) { 4039 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4040 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4041 VectorInit = Builder.CreateInsertElement( 4042 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 4043 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 4044 } 4045 4046 // We constructed a temporary phi node in the first phase of vectorization. 4047 // This phi node will eventually be deleted. 4048 Builder.SetInsertPoint( 4049 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 4050 4051 // Create a phi node for the new recurrence. The current value will either be 4052 // the initial value inserted into a vector or loop-varying vector value. 4053 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4054 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4055 4056 // Get the vectorized previous value of the last part UF - 1. It appears last 4057 // among all unrolled iterations, due to the order of their construction. 
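  // For illustration (assuming the shorthand example above, with VF = 4 and
  // UF = 2), the widened 'previous' value is split into two parts:
  //
  //   part 0: <a[i+0], a[i+1], a[i+2], a[i+3]>
  //   part 1: <a[i+4], a[i+5], a[i+6], a[i+7]>
  //
  // Part UF - 1 is emitted last, so inserting after it guarantees that all
  // parts of the previous value are available at the insertion point chosen
  // below.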
4058 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 4059 4060 // Find and set the insertion point after the previous value if it is an 4061 // instruction. 4062 BasicBlock::iterator InsertPt; 4063 // Note that the previous value may have been constant-folded so it is not 4064 // guaranteed to be an instruction in the vector loop. 4065 // FIXME: Loop invariant values do not form recurrences. We should deal with 4066 // them earlier. 4067 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4068 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4069 else { 4070 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4071 if (isa<PHINode>(PreviousLastPart)) 4072 // If the previous value is a phi node, we should insert after all the phi 4073 // nodes in the block containing the PHI to avoid breaking basic block 4074 // verification. Note that the basic block may be different to 4075 // LoopVectorBody, in case we predicate the loop. 4076 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4077 else 4078 InsertPt = ++PreviousInst->getIterator(); 4079 } 4080 Builder.SetInsertPoint(&*InsertPt); 4081 4082 // We will construct a vector for the recurrence by combining the values for 4083 // the current and previous iterations. This is the required shuffle mask. 4084 assert(!VF.isScalable()); 4085 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 4086 ShuffleMask[0] = VF.getKnownMinValue() - 1; 4087 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 4088 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 4089 4090 // The vector from which to take the initial value for the current iteration 4091 // (actual or unrolled). Initially, this is the vector phi node. 4092 Value *Incoming = VecPhi; 4093 4094 // Shuffle the current and previous vector and update the vector parts. 4095 for (unsigned Part = 0; Part < UF; ++Part) { 4096 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 4097 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 4098 auto *Shuffle = 4099 VF.isVector() 4100 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 4101 : Incoming; 4102 PhiPart->replaceAllUsesWith(Shuffle); 4103 cast<Instruction>(PhiPart)->eraseFromParent(); 4104 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 4105 Incoming = PreviousPart; 4106 } 4107 4108 // Fix the latch value of the new recurrence in the vector loop. 4109 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4110 4111 // Extract the last vector element in the middle block. This will be the 4112 // initial value for the recurrence when jumping to the scalar loop. 4113 auto *ExtractForScalar = Incoming; 4114 if (VF.isVector()) { 4115 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4116 ExtractForScalar = Builder.CreateExtractElement( 4117 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 4118 "vector.recur.extract"); 4119 } 4120 // Extract the second last element in the middle block if the 4121 // Phi is used outside the loop. We need to extract the phi itself 4122 // and not the last element (the phi update in the current iteration). This 4123 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4124 // when the scalar loop is not run at all. 
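  // For illustration (assuming the shorthand example above, VF = 4): if the
  // final vector iteration processed a[n-4 .. n-1], then 'Incoming' is
  // <a[n-4], a[n-3], a[n-2], a[n-1]>. Lane VF-1 (a[n-1]) seeds the scalar
  // recurrence phi, while lane VF-2 (a[n-2]) is the value the phi itself held
  // in the last iteration, which is what users of the phi outside the loop
  // must see when the scalar loop is skipped.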
4125 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4126 if (VF.isVector()) 4127 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4128 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), 4129 "vector.recur.extract.for.phi"); 4130 // When loop is unrolled without vectorizing, initialize 4131 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 4132 // `Incoming`. This is analogous to the vectorized case above: extracting the 4133 // second last element when VF > 1. 4134 else if (UF > 1) 4135 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 4136 4137 // Fix the initial value of the original recurrence in the scalar loop. 4138 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4139 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4140 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4141 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4142 Start->addIncoming(Incoming, BB); 4143 } 4144 4145 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4146 Phi->setName("scalar.recur"); 4147 4148 // Finally, fix users of the recurrence outside the loop. The users will need 4149 // either the last value of the scalar recurrence or the last value of the 4150 // vector recurrence we extracted in the middle block. Since the loop is in 4151 // LCSSA form, we just need to find all the phi nodes for the original scalar 4152 // recurrence in the exit block, and then add an edge for the middle block. 4153 // Note that LCSSA does not imply single entry when the original scalar loop 4154 // had multiple exiting edges (as we always run the last iteration in the 4155 // scalar epilogue); in that case, the exiting path through middle will be 4156 // dynamically dead and the value picked for the phi doesn't matter. 4157 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4158 if (any_of(LCSSAPhi.incoming_values(), 4159 [Phi](Value *V) { return V == Phi; })) 4160 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4161 } 4162 4163 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 4164 // Get it's reduction variable descriptor. 4165 assert(Legal->isReductionVariable(Phi) && 4166 "Unable to find the reduction variable"); 4167 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4168 4169 RecurKind RK = RdxDesc.getRecurrenceKind(); 4170 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4171 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4172 setDebugLocFromInst(Builder, ReductionStartValue); 4173 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); 4174 4175 // This is the vector-clone of the value that leaves the loop. 4176 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); 4177 4178 // Wrap flags are in general invalid after vectorization, clear them. 4179 clearReductionWrapFlags(RdxDesc); 4180 4181 // Fix the vector-loop phi. 4182 4183 // Reductions do not have to start at zero. They can start with 4184 // any loop invariant values. 
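  // For example (hypothetical input):
  //
  //   int sum = start;              // 'start' is loop invariant, not zero
  //   for (int i = 0; i < n; ++i)
  //     sum += a[i];
  //
  // Here 'start' is the recurrence start value recorded in the descriptor.
  // The vector phi for unroll part 0 is seeded with <start, 0, ..., 0> and
  // the remaining parts with the neutral element alone (see
  // widenPHIInstruction); the bc.merge.rdx phi created below then feeds the
  // scalar remainder loop either the computed reduction or 'start' itself,
  // depending on whether the vector loop was bypassed.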
4185 BasicBlock *Latch = OrigLoop->getLoopLatch(); 4186 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 4187 4188 for (unsigned Part = 0; Part < UF; ++Part) { 4189 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 4190 Value *Val = getOrCreateVectorValue(LoopVal, Part); 4191 cast<PHINode>(VecRdxPhi) 4192 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4193 } 4194 4195 // Before each round, move the insertion point right between 4196 // the PHIs and the values we are going to write. 4197 // This allows us to write both PHINodes and the extractelement 4198 // instructions. 4199 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4200 4201 setDebugLocFromInst(Builder, LoopExitInst); 4202 4203 // If tail is folded by masking, the vector value to leave the loop should be 4204 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4205 // instead of the former. For an inloop reduction the reduction will already 4206 // be predicated, and does not need to be handled here. 4207 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4208 for (unsigned Part = 0; Part < UF; ++Part) { 4209 Value *VecLoopExitInst = 4210 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4211 Value *Sel = nullptr; 4212 for (User *U : VecLoopExitInst->users()) { 4213 if (isa<SelectInst>(U)) { 4214 assert(!Sel && "Reduction exit feeding two selects"); 4215 Sel = U; 4216 } else 4217 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4218 } 4219 assert(Sel && "Reduction exit feeds no select"); 4220 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 4221 4222 // If the target can create a predicated operator for the reduction at no 4223 // extra cost in the loop (for example a predicated vadd), it can be 4224 // cheaper for the select to remain in the loop than be sunk out of it, 4225 // and so use the select value for the phi instead of the old 4226 // LoopExitValue. 4227 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4228 if (PreferPredicatedReductionSelect || 4229 TTI->preferPredicatedReductionSelect( 4230 RdxDesc.getOpcode(), Phi->getType(), 4231 TargetTransformInfo::ReductionFlags())) { 4232 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); 4233 VecRdxPhi->setIncomingValueForBlock( 4234 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4235 } 4236 } 4237 } 4238 4239 // If the vector reduction can be performed in a smaller type, we truncate 4240 // then extend the loop exit value to enable InstCombine to evaluate the 4241 // entire expression in the smaller type. 4242 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4243 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4244 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4245 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4246 Builder.SetInsertPoint( 4247 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4248 VectorParts RdxParts(UF); 4249 for (unsigned Part = 0; Part < UF; ++Part) { 4250 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4251 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4252 Value *Extnd = RdxDesc.isSigned() ? 
Builder.CreateSExt(Trunc, VecTy) 4253 : Builder.CreateZExt(Trunc, VecTy); 4254 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4255 UI != RdxParts[Part]->user_end();) 4256 if (*UI != Trunc) { 4257 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4258 RdxParts[Part] = Extnd; 4259 } else { 4260 ++UI; 4261 } 4262 } 4263 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4264 for (unsigned Part = 0; Part < UF; ++Part) { 4265 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4266 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 4267 } 4268 } 4269 4270 // Reduce all of the unrolled parts into a single vector. 4271 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 4272 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4273 4274 // The middle block terminator has already been assigned a DebugLoc here (the 4275 // OrigLoop's single latch terminator). We want the whole middle block to 4276 // appear to execute on this line because: (a) it is all compiler generated, 4277 // (b) these instructions are always executed after evaluating the latch 4278 // conditional branch, and (c) other passes may add new predecessors which 4279 // terminate on this line. This is the easiest way to ensure we don't 4280 // accidentally cause an extra step back into the loop while debugging. 4281 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4282 for (unsigned Part = 1; Part < UF; ++Part) { 4283 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4284 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 4285 // Floating point operations had to be 'fast' to enable the reduction. 4286 ReducedPartRdx = addFastMathFlag( 4287 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 4288 ReducedPartRdx, "bin.rdx"), 4289 RdxDesc.getFastMathFlags()); 4290 else 4291 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4292 } 4293 4294 // Create the reduction after the loop. Note that inloop reductions create the 4295 // target reduction in the loop using a Reduction recipe. 4296 if (VF.isVector() && !IsInLoopReductionPhi) { 4297 ReducedPartRdx = 4298 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); 4299 // If the reduction can be performed in a smaller type, we need to extend 4300 // the reduction to the wider type before we branch to the original loop. 4301 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4302 ReducedPartRdx = 4303 RdxDesc.isSigned() 4304 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4305 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4306 } 4307 4308 // Create a phi node that merges control-flow from the backedge-taken check 4309 // block and the middle block. 4310 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4311 LoopScalarPreHeader->getTerminator()); 4312 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4313 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4314 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4315 4316 // Now, we need to fix the users of the reduction variable 4317 // inside and outside of the scalar remainder loop. 4318 4319 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4320 // in the exit blocks. See comment on analogous loop in 4321 // fixFirstOrderRecurrence for a more complete explaination of the logic. 
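  // For illustration (shorthand IR, block and value names hypothetical), an
  // LCSSA phi in the exit block such as
  //
  //   %sum.lcssa = phi i32 [ %sum.next, %for.body ]
  //
  // gains a second incoming value from the middle block:
  //
  //   %sum.lcssa = phi i32 [ %sum.next, %for.body ], [ %rdx, %middle.block ]
  //
  // where %rdx is the reduced value computed above (ReducedPartRdx).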
4322 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4323 if (any_of(LCSSAPhi.incoming_values(), 4324 [LoopExitInst](Value *V) { return V == LoopExitInst; })) 4325 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4326 4327 // Fix the scalar loop reduction variable with the incoming reduction sum 4328 // from the vector body and from the backedge value. 4329 int IncomingEdgeBlockIdx = 4330 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4331 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4332 // Pick the other block. 4333 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4334 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4335 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4336 } 4337 4338 void InnerLoopVectorizer::clearReductionWrapFlags( 4339 RecurrenceDescriptor &RdxDesc) { 4340 RecurKind RK = RdxDesc.getRecurrenceKind(); 4341 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4342 return; 4343 4344 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4345 assert(LoopExitInstr && "null loop exit instruction"); 4346 SmallVector<Instruction *, 8> Worklist; 4347 SmallPtrSet<Instruction *, 8> Visited; 4348 Worklist.push_back(LoopExitInstr); 4349 Visited.insert(LoopExitInstr); 4350 4351 while (!Worklist.empty()) { 4352 Instruction *Cur = Worklist.pop_back_val(); 4353 if (isa<OverflowingBinaryOperator>(Cur)) 4354 for (unsigned Part = 0; Part < UF; ++Part) { 4355 Value *V = getOrCreateVectorValue(Cur, Part); 4356 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4357 } 4358 4359 for (User *U : Cur->users()) { 4360 Instruction *UI = cast<Instruction>(U); 4361 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4362 Visited.insert(UI).second) 4363 Worklist.push_back(UI); 4364 } 4365 } 4366 } 4367 4368 void InnerLoopVectorizer::fixLCSSAPHIs() { 4369 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4370 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4371 // Some phis were already hand updated by the reduction and recurrence 4372 // code above, leave them alone. 4373 continue; 4374 4375 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4376 // Non-instruction incoming values will have only one value. 4377 unsigned LastLane = 0; 4378 if (isa<Instruction>(IncomingValue)) 4379 LastLane = Cost->isUniformAfterVectorization( 4380 cast<Instruction>(IncomingValue), VF) 4381 ? 0 4382 : VF.getKnownMinValue() - 1; 4383 assert((!VF.isScalable() || LastLane == 0) && 4384 "scalable vectors dont support non-uniform scalars yet"); 4385 // Can be a loop invariant incoming value or the last scalar value to be 4386 // extracted from the vectorized loop. 4387 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4388 Value *lastIncomingValue = 4389 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 4390 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4391 } 4392 } 4393 4394 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4395 // The basic block and loop containing the predicated instruction. 4396 auto *PredBB = PredInst->getParent(); 4397 auto *VectorLoop = LI->getLoopFor(PredBB); 4398 4399 // Initialize a worklist with the operands of the predicated instruction. 4400 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4401 4402 // Holds instructions that we need to analyze again. An instruction may be 4403 // reanalyzed if we don't yet know if we can sink it or not. 
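  // For example (hypothetical scalarized code), if %t feeds two instructions
  // that both feed the predicated instruction,
  //
  //   %t = add i32 %x, 1
  //   %u = mul i32 %t, %y
  //   %v = ashr i32 %t, 2
  //
  // then %t cannot be sunk until both %u and %v have been moved into the
  // predicated block; until then it is parked in this vector and revisited
  // once more of its users have been sunk.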
4404 SmallVector<Instruction *, 8> InstsToReanalyze; 4405 4406 // Returns true if a given use occurs in the predicated block. Phi nodes use 4407 // their operands in their corresponding predecessor blocks. 4408 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4409 auto *I = cast<Instruction>(U.getUser()); 4410 BasicBlock *BB = I->getParent(); 4411 if (auto *Phi = dyn_cast<PHINode>(I)) 4412 BB = Phi->getIncomingBlock( 4413 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4414 return BB == PredBB; 4415 }; 4416 4417 // Iteratively sink the scalarized operands of the predicated instruction 4418 // into the block we created for it. When an instruction is sunk, it's 4419 // operands are then added to the worklist. The algorithm ends after one pass 4420 // through the worklist doesn't sink a single instruction. 4421 bool Changed; 4422 do { 4423 // Add the instructions that need to be reanalyzed to the worklist, and 4424 // reset the changed indicator. 4425 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4426 InstsToReanalyze.clear(); 4427 Changed = false; 4428 4429 while (!Worklist.empty()) { 4430 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4431 4432 // We can't sink an instruction if it is a phi node, is already in the 4433 // predicated block, is not in the loop, or may have side effects. 4434 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4435 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4436 continue; 4437 4438 // It's legal to sink the instruction if all its uses occur in the 4439 // predicated block. Otherwise, there's nothing to do yet, and we may 4440 // need to reanalyze the instruction. 4441 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4442 InstsToReanalyze.push_back(I); 4443 continue; 4444 } 4445 4446 // Move the instruction to the beginning of the predicated block, and add 4447 // it's operands to the worklist. 4448 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4449 Worklist.insert(I->op_begin(), I->op_end()); 4450 4451 // The sinking may have enabled other instructions to be sunk, so we will 4452 // need to iterate. 4453 Changed = true; 4454 } 4455 } while (Changed); 4456 } 4457 4458 void InnerLoopVectorizer::fixNonInductionPHIs() { 4459 for (PHINode *OrigPhi : OrigPHIsToFix) { 4460 PHINode *NewPhi = 4461 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4462 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4463 4464 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4465 predecessors(OrigPhi->getParent())); 4466 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4467 predecessors(NewPhi->getParent())); 4468 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4469 "Scalar and Vector BB should have the same number of predecessors"); 4470 4471 // The insertion point in Builder may be invalidated by the time we get 4472 // here. Force the Builder insertion point to something valid so that we do 4473 // not run into issues during insertion point restore in 4474 // getOrCreateVectorValue calls below. 4475 Builder.SetInsertPoint(NewPhi); 4476 4477 // The predecessor order is preserved and we can rely on mapping between 4478 // scalar and vector block predecessors. 4479 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4480 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4481 4482 // When looking up the new scalar/vector values to fix up, use incoming 4483 // values from original phi. 
4484 Value *ScIncV = 4485 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4486 4487 // Scalar incoming value may need a broadcast 4488 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4489 NewPhi->addIncoming(NewIncV, NewPredBB); 4490 } 4491 } 4492 } 4493 4494 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4495 VPUser &Operands, unsigned UF, 4496 ElementCount VF, bool IsPtrLoopInvariant, 4497 SmallBitVector &IsIndexLoopInvariant, 4498 VPTransformState &State) { 4499 // Construct a vector GEP by widening the operands of the scalar GEP as 4500 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4501 // results in a vector of pointers when at least one operand of the GEP 4502 // is vector-typed. Thus, to keep the representation compact, we only use 4503 // vector-typed operands for loop-varying values. 4504 4505 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4506 // If we are vectorizing, but the GEP has only loop-invariant operands, 4507 // the GEP we build (by only using vector-typed operands for 4508 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4509 // produce a vector of pointers, we need to either arbitrarily pick an 4510 // operand to broadcast, or broadcast a clone of the original GEP. 4511 // Here, we broadcast a clone of the original. 4512 // 4513 // TODO: If at some point we decide to scalarize instructions having 4514 // loop-invariant operands, this special case will no longer be 4515 // required. We would add the scalarization decision to 4516 // collectLoopScalars() and teach getVectorValue() to broadcast 4517 // the lane-zero scalar value. 4518 auto *Clone = Builder.Insert(GEP->clone()); 4519 for (unsigned Part = 0; Part < UF; ++Part) { 4520 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4521 State.set(VPDef, GEP, EntryPart, Part); 4522 addMetadata(EntryPart, GEP); 4523 } 4524 } else { 4525 // If the GEP has at least one loop-varying operand, we are sure to 4526 // produce a vector of pointers. But if we are only unrolling, we want 4527 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4528 // produce with the code below will be scalar (if VF == 1) or vector 4529 // (otherwise). Note that for the unroll-only case, we still maintain 4530 // values in the vector mapping with initVector, as we do for other 4531 // instructions. 4532 for (unsigned Part = 0; Part < UF; ++Part) { 4533 // The pointer operand of the new GEP. If it's loop-invariant, we 4534 // won't broadcast it. 4535 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) 4536 : State.get(Operands.getOperand(0), Part); 4537 4538 // Collect all the indices for the new GEP. If any index is 4539 // loop-invariant, we won't broadcast it. 4540 SmallVector<Value *, 4> Indices; 4541 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4542 VPValue *Operand = Operands.getOperand(I); 4543 if (IsIndexLoopInvariant[I - 1]) 4544 Indices.push_back(State.get(Operand, {0, 0})); 4545 else 4546 Indices.push_back(State.get(Operand, Part)); 4547 } 4548 4549 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4550 // but it should be a vector, otherwise. 4551 auto *NewGEP = 4552 GEP->isInBounds() 4553 ? 
Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4554 Indices) 4555 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4556 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4557 "NewGEP is not a pointer vector"); 4558 State.set(VPDef, GEP, NewGEP, Part); 4559 addMetadata(NewGEP, GEP); 4560 } 4561 } 4562 } 4563 4564 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4565 RecurrenceDescriptor *RdxDesc, 4566 Value *StartV, unsigned UF, 4567 ElementCount VF) { 4568 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4569 PHINode *P = cast<PHINode>(PN); 4570 if (EnableVPlanNativePath) { 4571 // Currently we enter here in the VPlan-native path for non-induction 4572 // PHIs where all control flow is uniform. We simply widen these PHIs. 4573 // Create a vector phi with no operands - the vector phi operands will be 4574 // set at the end of vector code generation. 4575 Type *VecTy = 4576 (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF); 4577 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4578 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4579 OrigPHIsToFix.push_back(P); 4580 4581 return; 4582 } 4583 4584 assert(PN->getParent() == OrigLoop->getHeader() && 4585 "Non-header phis should have been handled elsewhere"); 4586 4587 // In order to support recurrences we need to be able to vectorize Phi nodes. 4588 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4589 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4590 // this value when we vectorize all of the instructions that use the PHI. 4591 if (RdxDesc || Legal->isFirstOrderRecurrence(P)) { 4592 Value *Iden = nullptr; 4593 bool ScalarPHI = 4594 (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4595 Type *VecTy = 4596 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); 4597 4598 if (RdxDesc) { 4599 assert(Legal->isReductionVariable(P) && StartV && 4600 "RdxDesc should only be set for reduction variables; in that case " 4601 "a StartV is also required"); 4602 RecurKind RK = RdxDesc->getRecurrenceKind(); 4603 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { 4604 // MinMax reduction have the start value as their identify. 4605 if (ScalarPHI) { 4606 Iden = StartV; 4607 } else { 4608 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4609 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4610 StartV = Iden = Builder.CreateVectorSplat(VF, StartV, "minmax.ident"); 4611 } 4612 } else { 4613 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity( 4614 RK, VecTy->getScalarType()); 4615 Iden = IdenC; 4616 4617 if (!ScalarPHI) { 4618 Iden = ConstantVector::getSplat(VF, IdenC); 4619 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4620 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4621 Constant *Zero = Builder.getInt32(0); 4622 StartV = Builder.CreateInsertElement(Iden, StartV, Zero); 4623 } 4624 } 4625 } 4626 4627 for (unsigned Part = 0; Part < UF; ++Part) { 4628 // This is phase one of vectorizing PHIs. 4629 Value *EntryPart = PHINode::Create( 4630 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4631 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4632 if (StartV) { 4633 // Make sure to add the reduction start value only to the 4634 // first unroll part. 4635 Value *StartVal = (Part == 0) ? 
StartV : Iden; 4636 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader); 4637 } 4638 } 4639 return; 4640 } 4641 4642 assert(!Legal->isReductionVariable(P) && 4643 "reductions should be handled above"); 4644 4645 setDebugLocFromInst(Builder, P); 4646 4647 // This PHINode must be an induction variable. 4648 // Make sure that we know about it. 4649 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4650 4651 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4652 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4653 4654 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4655 // which can be found from the original scalar operations. 4656 switch (II.getKind()) { 4657 case InductionDescriptor::IK_NoInduction: 4658 llvm_unreachable("Unknown induction"); 4659 case InductionDescriptor::IK_IntInduction: 4660 case InductionDescriptor::IK_FpInduction: 4661 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4662 case InductionDescriptor::IK_PtrInduction: { 4663 // Handle the pointer induction variable case. 4664 assert(P->getType()->isPointerTy() && "Unexpected type."); 4665 4666 if (Cost->isScalarAfterVectorization(P, VF)) { 4667 // This is the normalized GEP that starts counting at zero. 4668 Value *PtrInd = 4669 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4670 // Determine the number of scalars we need to generate for each unroll 4671 // iteration. If the instruction is uniform, we only need to generate the 4672 // first lane. Otherwise, we generate all VF values. 4673 unsigned Lanes = 4674 Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue(); 4675 for (unsigned Part = 0; Part < UF; ++Part) { 4676 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4677 Constant *Idx = ConstantInt::get(PtrInd->getType(), 4678 Lane + Part * VF.getKnownMinValue()); 4679 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4680 Value *SclrGep = 4681 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4682 SclrGep->setName("next.gep"); 4683 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4684 } 4685 } 4686 return; 4687 } 4688 assert(isa<SCEVConstant>(II.getStep()) && 4689 "Induction step not a SCEV constant!"); 4690 Type *PhiType = II.getStep()->getType(); 4691 4692 // Build a pointer phi 4693 Value *ScalarStartValue = II.getStartValue(); 4694 Type *ScStValueType = ScalarStartValue->getType(); 4695 PHINode *NewPointerPhi = 4696 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4697 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4698 4699 // A pointer induction, performed by using a gep 4700 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4701 Instruction *InductionLoc = LoopLatch->getTerminator(); 4702 const SCEV *ScalarStep = II.getStep(); 4703 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4704 Value *ScalarStepValue = 4705 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4706 Value *InductionGEP = GetElementPtrInst::Create( 4707 ScStValueType->getPointerElementType(), NewPointerPhi, 4708 Builder.CreateMul( 4709 ScalarStepValue, 4710 ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), 4711 "ptr.ind", InductionLoc); 4712 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4713 4714 // Create UF many actual address geps that use the pointer 4715 // phi as base and a vectorized version of the step value 4716 // (<step*0, ..., step*N>) as offset. 
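    // For illustration (shorthand IR, names hypothetical) with VF = 4, UF = 2
    // and a unit step over i8 elements:
    //
    //   vector.body:
    //     %pointer.phi = phi i8* [ %start, %vector.ph ], [ %ptr.ind, %latch ]
    //     %gep0 = getelementptr i8, i8* %pointer.phi, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
    //     %gep1 = getelementptr i8, i8* %pointer.phi, <4 x i64> <i64 4, i64 5, i64 6, i64 7>
    //   latch:
    //     %ptr.ind = getelementptr i8, i8* %pointer.phi, i64 8  ; step * VF * UF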
4717 for (unsigned Part = 0; Part < UF; ++Part) { 4718 SmallVector<Constant *, 8> Indices; 4719 // Create a vector of consecutive numbers from zero to VF. 4720 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 4721 Indices.push_back( 4722 ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); 4723 Constant *StartOffset = ConstantVector::get(Indices); 4724 4725 Value *GEP = Builder.CreateGEP( 4726 ScStValueType->getPointerElementType(), NewPointerPhi, 4727 Builder.CreateMul( 4728 StartOffset, 4729 Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), 4730 "vector.gep")); 4731 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4732 } 4733 } 4734 } 4735 } 4736 4737 /// A helper function for checking whether an integer division-related 4738 /// instruction may divide by zero (in which case it must be predicated if 4739 /// executed conditionally in the scalar code). 4740 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4741 /// Non-zero divisors that are non compile-time constants will not be 4742 /// converted into multiplication, so we will still end up scalarizing 4743 /// the division, but can do so w/o predication. 4744 static bool mayDivideByZero(Instruction &I) { 4745 assert((I.getOpcode() == Instruction::UDiv || 4746 I.getOpcode() == Instruction::SDiv || 4747 I.getOpcode() == Instruction::URem || 4748 I.getOpcode() == Instruction::SRem) && 4749 "Unexpected instruction"); 4750 Value *Divisor = I.getOperand(1); 4751 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4752 return !CInt || CInt->isZero(); 4753 } 4754 4755 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4756 VPUser &User, 4757 VPTransformState &State) { 4758 switch (I.getOpcode()) { 4759 case Instruction::Call: 4760 case Instruction::Br: 4761 case Instruction::PHI: 4762 case Instruction::GetElementPtr: 4763 case Instruction::Select: 4764 llvm_unreachable("This instruction is handled by a different recipe."); 4765 case Instruction::UDiv: 4766 case Instruction::SDiv: 4767 case Instruction::SRem: 4768 case Instruction::URem: 4769 case Instruction::Add: 4770 case Instruction::FAdd: 4771 case Instruction::Sub: 4772 case Instruction::FSub: 4773 case Instruction::FNeg: 4774 case Instruction::Mul: 4775 case Instruction::FMul: 4776 case Instruction::FDiv: 4777 case Instruction::FRem: 4778 case Instruction::Shl: 4779 case Instruction::LShr: 4780 case Instruction::AShr: 4781 case Instruction::And: 4782 case Instruction::Or: 4783 case Instruction::Xor: { 4784 // Just widen unops and binops. 4785 setDebugLocFromInst(Builder, &I); 4786 4787 for (unsigned Part = 0; Part < UF; ++Part) { 4788 SmallVector<Value *, 2> Ops; 4789 for (VPValue *VPOp : User.operands()) 4790 Ops.push_back(State.get(VPOp, Part)); 4791 4792 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4793 4794 if (auto *VecOp = dyn_cast<Instruction>(V)) 4795 VecOp->copyIRFlags(&I); 4796 4797 // Use this vector value for all users of the original instruction. 4798 State.set(Def, &I, V, Part); 4799 addMetadata(V, &I); 4800 } 4801 4802 break; 4803 } 4804 case Instruction::ICmp: 4805 case Instruction::FCmp: { 4806 // Widen compares. Generate vector compares. 4807 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4808 auto *Cmp = cast<CmpInst>(&I); 4809 setDebugLocFromInst(Builder, Cmp); 4810 for (unsigned Part = 0; Part < UF; ++Part) { 4811 Value *A = State.get(User.getOperand(0), Part); 4812 Value *B = State.get(User.getOperand(1), Part); 4813 Value *C = nullptr; 4814 if (FCmp) { 4815 // Propagate fast math flags. 
4816 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4817 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4818 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4819 } else { 4820 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4821 } 4822 State.set(Def, &I, C, Part); 4823 addMetadata(C, &I); 4824 } 4825 4826 break; 4827 } 4828 4829 case Instruction::ZExt: 4830 case Instruction::SExt: 4831 case Instruction::FPToUI: 4832 case Instruction::FPToSI: 4833 case Instruction::FPExt: 4834 case Instruction::PtrToInt: 4835 case Instruction::IntToPtr: 4836 case Instruction::SIToFP: 4837 case Instruction::UIToFP: 4838 case Instruction::Trunc: 4839 case Instruction::FPTrunc: 4840 case Instruction::BitCast: { 4841 auto *CI = cast<CastInst>(&I); 4842 setDebugLocFromInst(Builder, CI); 4843 4844 /// Vectorize casts. 4845 Type *DestTy = 4846 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4847 4848 for (unsigned Part = 0; Part < UF; ++Part) { 4849 Value *A = State.get(User.getOperand(0), Part); 4850 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4851 State.set(Def, &I, Cast, Part); 4852 addMetadata(Cast, &I); 4853 } 4854 break; 4855 } 4856 default: 4857 // This instruction is not vectorized by simple widening. 4858 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4859 llvm_unreachable("Unhandled instruction!"); 4860 } // end of switch. 4861 } 4862 4863 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4864 VPUser &ArgOperands, 4865 VPTransformState &State) { 4866 assert(!isa<DbgInfoIntrinsic>(I) && 4867 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4868 setDebugLocFromInst(Builder, &I); 4869 4870 Module *M = I.getParent()->getParent()->getParent(); 4871 auto *CI = cast<CallInst>(&I); 4872 4873 SmallVector<Type *, 4> Tys; 4874 for (Value *ArgOperand : CI->arg_operands()) 4875 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4876 4877 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4878 4879 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4880 // version of the instruction. 4881 // Is it beneficial to perform intrinsic call compared to lib call? 4882 bool NeedToScalarize = false; 4883 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4884 bool UseVectorIntrinsic = 4885 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4886 assert((UseVectorIntrinsic || !NeedToScalarize) && 4887 "Instruction should be scalarized elsewhere."); 4888 4889 for (unsigned Part = 0; Part < UF; ++Part) { 4890 SmallVector<Value *, 4> Args; 4891 for (auto &I : enumerate(ArgOperands.operands())) { 4892 // Some intrinsics have a scalar argument - don't replace it with a 4893 // vector. 4894 Value *Arg; 4895 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4896 Arg = State.get(I.value(), Part); 4897 else 4898 Arg = State.get(I.value(), {0, 0}); 4899 Args.push_back(Arg); 4900 } 4901 4902 Function *VectorF; 4903 if (UseVectorIntrinsic) { 4904 // Use vector version of the intrinsic. 4905 Type *TysForDecl[] = {CI->getType()}; 4906 if (VF.isVector()) { 4907 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4908 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4909 } 4910 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4911 assert(VectorF && "Can't retrieve vector intrinsic."); 4912 } else { 4913 // Use vector version of the function call. 
4914 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4915 #ifndef NDEBUG 4916 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4917 "Can't create vector function."); 4918 #endif 4919 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4920 } 4921 SmallVector<OperandBundleDef, 1> OpBundles; 4922 CI->getOperandBundlesAsDefs(OpBundles); 4923 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4924 4925 if (isa<FPMathOperator>(V)) 4926 V->copyFastMathFlags(CI); 4927 4928 State.set(Def, &I, V, Part); 4929 addMetadata(V, &I); 4930 } 4931 } 4932 4933 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 4934 VPUser &Operands, 4935 bool InvariantCond, 4936 VPTransformState &State) { 4937 setDebugLocFromInst(Builder, &I); 4938 4939 // The condition can be loop invariant but still defined inside the 4940 // loop. This means that we can't just use the original 'cond' value. 4941 // We have to take the 'vectorized' value and pick the first lane. 4942 // Instcombine will make this a no-op. 4943 auto *InvarCond = 4944 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4945 4946 for (unsigned Part = 0; Part < UF; ++Part) { 4947 Value *Cond = 4948 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4949 Value *Op0 = State.get(Operands.getOperand(1), Part); 4950 Value *Op1 = State.get(Operands.getOperand(2), Part); 4951 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4952 State.set(VPDef, &I, Sel, Part); 4953 addMetadata(Sel, &I); 4954 } 4955 } 4956 4957 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4958 // We should not collect Scalars more than once per VF. Right now, this 4959 // function is called from collectUniformsAndScalars(), which already does 4960 // this check. Collecting Scalars for VF=1 does not make any sense. 4961 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4962 "This function should not be visited twice for the same VF"); 4963 4964 SmallSetVector<Instruction *, 8> Worklist; 4965 4966 // These sets are used to seed the analysis with pointers used by memory 4967 // accesses that will remain scalar. 4968 SmallSetVector<Instruction *, 8> ScalarPtrs; 4969 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4970 auto *Latch = TheLoop->getLoopLatch(); 4971 4972 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4973 // The pointer operands of loads and stores will be scalar as long as the 4974 // memory access is not a gather or scatter operation. The value operand of a 4975 // store will remain scalar if the store is scalarized. 4976 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4977 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4978 assert(WideningDecision != CM_Unknown && 4979 "Widening decision should be ready at this moment"); 4980 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4981 if (Ptr == Store->getValueOperand()) 4982 return WideningDecision == CM_Scalarize; 4983 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4984 "Ptr is neither a value or pointer operand"); 4985 return WideningDecision != CM_GatherScatter; 4986 }; 4987 4988 // A helper that returns true if the given value is a bitcast or 4989 // getelementptr instruction contained in the loop. 
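  // For example (hypothetical IR), the address computation below is loop
  // varying and qualifies, while a pointer computed before the loop does not:
  //
  //   for.body:
  //     %gep = getelementptr inbounds i32, i32* %a, i64 %iv   ; qualifies
  //   entry:
  //     %inv = getelementptr inbounds i32, i32* %a, i64 4     ; loop invariant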
4990 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4991 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4992 isa<GetElementPtrInst>(V)) && 4993 !TheLoop->isLoopInvariant(V); 4994 }; 4995 4996 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 4997 if (!isa<PHINode>(Ptr) || 4998 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 4999 return false; 5000 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 5001 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 5002 return false; 5003 return isScalarUse(MemAccess, Ptr); 5004 }; 5005 5006 // A helper that evaluates a memory access's use of a pointer. If the 5007 // pointer is actually the pointer induction of a loop, it is being 5008 // inserted into Worklist. If the use will be a scalar use, and the 5009 // pointer is only used by memory accesses, we place the pointer in 5010 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5011 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5012 if (isScalarPtrInduction(MemAccess, Ptr)) { 5013 Worklist.insert(cast<Instruction>(Ptr)); 5014 Instruction *Update = cast<Instruction>( 5015 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5016 Worklist.insert(Update); 5017 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5018 << "\n"); 5019 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 5020 << "\n"); 5021 return; 5022 } 5023 // We only care about bitcast and getelementptr instructions contained in 5024 // the loop. 5025 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5026 return; 5027 5028 // If the pointer has already been identified as scalar (e.g., if it was 5029 // also identified as uniform), there's nothing to do. 5030 auto *I = cast<Instruction>(Ptr); 5031 if (Worklist.count(I)) 5032 return; 5033 5034 // If the use of the pointer will be a scalar use, and all users of the 5035 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5036 // place the pointer in PossibleNonScalarPtrs. 5037 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5038 return isa<LoadInst>(U) || isa<StoreInst>(U); 5039 })) 5040 ScalarPtrs.insert(I); 5041 else 5042 PossibleNonScalarPtrs.insert(I); 5043 }; 5044 5045 // We seed the scalars analysis with three classes of instructions: (1) 5046 // instructions marked uniform-after-vectorization and (2) bitcast, 5047 // getelementptr and (pointer) phi instructions used by memory accesses 5048 // requiring a scalar use. 5049 // 5050 // (1) Add to the worklist all instructions that have been identified as 5051 // uniform-after-vectorization. 5052 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5053 5054 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5055 // memory accesses requiring a scalar use. The pointer operands of loads and 5056 // stores will be scalar as long as the memory accesses is not a gather or 5057 // scatter operation. The value operand of a store will remain scalar if the 5058 // store is scalarized. 
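  // For example (hypothetical input), given
  //
  //   %gep = getelementptr inbounds i32, i32* %a, i64 %idx
  //   store i32 %x, i32* %gep
  //
  // where the store has been chosen for scalarization, %gep has a scalar use
  // and is only used by memory accesses, so it goes into ScalarPtrs. If %gep
  // also had a non-memory user, it would be placed in PossibleNonScalarPtrs
  // instead.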
5059 for (auto *BB : TheLoop->blocks()) 5060 for (auto &I : *BB) { 5061 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5062 evaluatePtrUse(Load, Load->getPointerOperand()); 5063 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5064 evaluatePtrUse(Store, Store->getPointerOperand()); 5065 evaluatePtrUse(Store, Store->getValueOperand()); 5066 } 5067 } 5068 for (auto *I : ScalarPtrs) 5069 if (!PossibleNonScalarPtrs.count(I)) { 5070 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5071 Worklist.insert(I); 5072 } 5073 5074 // Insert the forced scalars. 5075 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5076 // induction variable when the PHI user is scalarized. 5077 auto ForcedScalar = ForcedScalars.find(VF); 5078 if (ForcedScalar != ForcedScalars.end()) 5079 for (auto *I : ForcedScalar->second) 5080 Worklist.insert(I); 5081 5082 // Expand the worklist by looking through any bitcasts and getelementptr 5083 // instructions we've already identified as scalar. This is similar to the 5084 // expansion step in collectLoopUniforms(); however, here we're only 5085 // expanding to include additional bitcasts and getelementptr instructions. 5086 unsigned Idx = 0; 5087 while (Idx != Worklist.size()) { 5088 Instruction *Dst = Worklist[Idx++]; 5089 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5090 continue; 5091 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5092 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5093 auto *J = cast<Instruction>(U); 5094 return !TheLoop->contains(J) || Worklist.count(J) || 5095 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5096 isScalarUse(J, Src)); 5097 })) { 5098 Worklist.insert(Src); 5099 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5100 } 5101 } 5102 5103 // An induction variable will remain scalar if all users of the induction 5104 // variable and induction variable update remain scalar. 5105 for (auto &Induction : Legal->getInductionVars()) { 5106 auto *Ind = Induction.first; 5107 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5108 5109 // If tail-folding is applied, the primary induction variable will be used 5110 // to feed a vector compare. 5111 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5112 continue; 5113 5114 // Determine if all users of the induction variable are scalar after 5115 // vectorization. 5116 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5117 auto *I = cast<Instruction>(U); 5118 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5119 }); 5120 if (!ScalarInd) 5121 continue; 5122 5123 // Determine if all users of the induction variable update instruction are 5124 // scalar after vectorization. 5125 auto ScalarIndUpdate = 5126 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5127 auto *I = cast<Instruction>(U); 5128 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5129 }); 5130 if (!ScalarIndUpdate) 5131 continue; 5132 5133 // The induction variable and its update instruction will remain scalar. 
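  // For example (hypothetical loop), in
  //
  //   for (int i = 0; i < n; ++i)
  //     a[i] = b[i];
  //
  // 'i' and its update are used only by the (already scalar) address
  // computations and the exit compare, so both stay scalar. If 'i' were also
  // consumed as data by a widened instruction, it would need a vector version
  // and would not be added here.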
5134 Worklist.insert(Ind); 5135 Worklist.insert(IndUpdate); 5136 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5137 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5138 << "\n"); 5139 } 5140 5141 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5142 } 5143 5144 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 5145 ElementCount VF) { 5146 if (!blockNeedsPredication(I->getParent())) 5147 return false; 5148 switch(I->getOpcode()) { 5149 default: 5150 break; 5151 case Instruction::Load: 5152 case Instruction::Store: { 5153 if (!Legal->isMaskRequired(I)) 5154 return false; 5155 auto *Ptr = getLoadStorePointerOperand(I); 5156 auto *Ty = getMemInstValueType(I); 5157 // We have already decided how to vectorize this instruction, get that 5158 // result. 5159 if (VF.isVector()) { 5160 InstWidening WideningDecision = getWideningDecision(I, VF); 5161 assert(WideningDecision != CM_Unknown && 5162 "Widening decision should be ready at this moment"); 5163 return WideningDecision == CM_Scalarize; 5164 } 5165 const Align Alignment = getLoadStoreAlignment(I); 5166 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5167 isLegalMaskedGather(Ty, Alignment)) 5168 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5169 isLegalMaskedScatter(Ty, Alignment)); 5170 } 5171 case Instruction::UDiv: 5172 case Instruction::SDiv: 5173 case Instruction::SRem: 5174 case Instruction::URem: 5175 return mayDivideByZero(*I); 5176 } 5177 return false; 5178 } 5179 5180 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5181 Instruction *I, ElementCount VF) { 5182 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5183 assert(getWideningDecision(I, VF) == CM_Unknown && 5184 "Decision should not be set yet."); 5185 auto *Group = getInterleavedAccessGroup(I); 5186 assert(Group && "Must have a group."); 5187 5188 // If the instruction's allocated size doesn't equal it's type size, it 5189 // requires padding and will be scalarized. 5190 auto &DL = I->getModule()->getDataLayout(); 5191 auto *ScalarTy = getMemInstValueType(I); 5192 if (hasIrregularType(ScalarTy, DL, VF)) 5193 return false; 5194 5195 // Check if masking is required. 5196 // A Group may need masking for one of two reasons: it resides in a block that 5197 // needs predication, or it was decided to use masking to deal with gaps. 5198 bool PredicatedAccessRequiresMasking = 5199 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5200 bool AccessWithGapsRequiresMasking = 5201 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5202 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5203 return true; 5204 5205 // If masked interleaving is required, we expect that the user/target had 5206 // enabled it, because otherwise it either wouldn't have been created or 5207 // it should have been invalidated by the CostModel. 5208 assert(useMaskedInterleavedAccesses(TTI) && 5209 "Masked interleave-groups for predicated accesses are not enabled."); 5210 5211 auto *Ty = getMemInstValueType(I); 5212 const Align Alignment = getLoadStoreAlignment(I); 5213 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5214 : TTI.isLegalMaskedStore(Ty, Alignment); 5215 } 5216 5217 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5218 Instruction *I, ElementCount VF) { 5219 // Get and ensure we have a valid memory instruction. 
5220 LoadInst *LI = dyn_cast<LoadInst>(I); 5221 StoreInst *SI = dyn_cast<StoreInst>(I); 5222 assert((LI || SI) && "Invalid memory instruction"); 5223 5224 auto *Ptr = getLoadStorePointerOperand(I); 5225 5226 // In order to be widened, the pointer should be consecutive, first of all. 5227 if (!Legal->isConsecutivePtr(Ptr)) 5228 return false; 5229 5230 // If the instruction is a store located in a predicated block, it will be 5231 // scalarized. 5232 if (isScalarWithPredication(I)) 5233 return false; 5234 5235 // If the instruction's allocated size doesn't equal it's type size, it 5236 // requires padding and will be scalarized. 5237 auto &DL = I->getModule()->getDataLayout(); 5238 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5239 if (hasIrregularType(ScalarTy, DL, VF)) 5240 return false; 5241 5242 return true; 5243 } 5244 5245 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5246 // We should not collect Uniforms more than once per VF. Right now, 5247 // this function is called from collectUniformsAndScalars(), which 5248 // already does this check. Collecting Uniforms for VF=1 does not make any 5249 // sense. 5250 5251 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5252 "This function should not be visited twice for the same VF"); 5253 5254 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5255 // not analyze again. Uniforms.count(VF) will return 1. 5256 Uniforms[VF].clear(); 5257 5258 // We now know that the loop is vectorizable! 5259 // Collect instructions inside the loop that will remain uniform after 5260 // vectorization. 5261 5262 // Global values, params and instructions outside of current loop are out of 5263 // scope. 5264 auto isOutOfScope = [&](Value *V) -> bool { 5265 Instruction *I = dyn_cast<Instruction>(V); 5266 return (!I || !TheLoop->contains(I)); 5267 }; 5268 5269 SetVector<Instruction *> Worklist; 5270 BasicBlock *Latch = TheLoop->getLoopLatch(); 5271 5272 // Instructions that are scalar with predication must not be considered 5273 // uniform after vectorization, because that would create an erroneous 5274 // replicating region where only a single instance out of VF should be formed. 5275 // TODO: optimize such seldom cases if found important, see PR40816. 5276 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5277 if (isOutOfScope(I)) { 5278 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5279 << *I << "\n"); 5280 return; 5281 } 5282 if (isScalarWithPredication(I, VF)) { 5283 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5284 << *I << "\n"); 5285 return; 5286 } 5287 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5288 Worklist.insert(I); 5289 }; 5290 5291 // Start with the conditional branch. If the branch condition is an 5292 // instruction contained in the loop that is only used by the branch, it is 5293 // uniform. 5294 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5295 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5296 addToWorklistIfAllowed(Cmp); 5297 5298 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5299 InstWidening WideningDecision = getWideningDecision(I, VF); 5300 assert(WideningDecision != CM_Unknown && 5301 "Widening decision should be ready at this moment"); 5302 5303 // A uniform memory op is itself uniform. We exclude uniform stores 5304 // here as they demand the last lane, not the first one. 
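    // For example (hypothetical loop body), a load from a loop-invariant
    // address, 'tmp = *p', reads the same location in every lane, so lane 0
    // is representative and the load is uniform. A store '*p = x' executed
    // on every iteration must instead produce the value of the last lane,
    // so uniform stores are excluded here.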
5305 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5306 assert(WideningDecision == CM_Scalarize); 5307 return true; 5308 } 5309 5310 return (WideningDecision == CM_Widen || 5311 WideningDecision == CM_Widen_Reverse || 5312 WideningDecision == CM_Interleave); 5313 }; 5314 5315 5316 // Returns true if Ptr is the pointer operand of a memory access instruction 5317 // I, and I is known to not require scalarization. 5318 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5319 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5320 }; 5321 5322 // Holds a list of values which are known to have at least one uniform use. 5323 // Note that there may be other uses which aren't uniform. A "uniform use" 5324 // here is something which only demands lane 0 of the unrolled iterations; 5325 // it does not imply that all lanes produce the same value (e.g. this is not 5326 // the usual meaning of uniform) 5327 SmallPtrSet<Value *, 8> HasUniformUse; 5328 5329 // Scan the loop for instructions which are either a) known to have only 5330 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5331 for (auto *BB : TheLoop->blocks()) 5332 for (auto &I : *BB) { 5333 // If there's no pointer operand, there's nothing to do. 5334 auto *Ptr = getLoadStorePointerOperand(&I); 5335 if (!Ptr) 5336 continue; 5337 5338 // A uniform memory op is itself uniform. We exclude uniform stores 5339 // here as they demand the last lane, not the first one. 5340 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5341 addToWorklistIfAllowed(&I); 5342 5343 if (isUniformDecision(&I, VF)) { 5344 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5345 HasUniformUse.insert(Ptr); 5346 } 5347 } 5348 5349 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5350 // demanding) users. Since loops are assumed to be in LCSSA form, this 5351 // disallows uses outside the loop as well. 5352 for (auto *V : HasUniformUse) { 5353 if (isOutOfScope(V)) 5354 continue; 5355 auto *I = cast<Instruction>(V); 5356 auto UsersAreMemAccesses = 5357 llvm::all_of(I->users(), [&](User *U) -> bool { 5358 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5359 }); 5360 if (UsersAreMemAccesses) 5361 addToWorklistIfAllowed(I); 5362 } 5363 5364 // Expand Worklist in topological order: whenever a new instruction 5365 // is added , its users should be already inside Worklist. It ensures 5366 // a uniform instruction will only be used by uniform instructions. 5367 unsigned idx = 0; 5368 while (idx != Worklist.size()) { 5369 Instruction *I = Worklist[idx++]; 5370 5371 for (auto OV : I->operand_values()) { 5372 // isOutOfScope operands cannot be uniform instructions. 5373 if (isOutOfScope(OV)) 5374 continue; 5375 // First order recurrence Phi's should typically be considered 5376 // non-uniform. 5377 auto *OP = dyn_cast<PHINode>(OV); 5378 if (OP && Legal->isFirstOrderRecurrence(OP)) 5379 continue; 5380 // If all the users of the operand are uniform, then add the 5381 // operand into the uniform worklist. 5382 auto *OI = cast<Instruction>(OV); 5383 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5384 auto *J = cast<Instruction>(U); 5385 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5386 })) 5387 addToWorklistIfAllowed(OI); 5388 } 5389 } 5390 5391 // For an instruction to be added into Worklist above, all its users inside 5392 // the loop should also be in Worklist. 
However, this condition cannot be 5393 // true for phi nodes that form a cyclic dependence. We must process phi 5394 // nodes separately. An induction variable will remain uniform if all users 5395 // of the induction variable and induction variable update remain uniform. 5396 // The code below handles both pointer and non-pointer induction variables. 5397 for (auto &Induction : Legal->getInductionVars()) { 5398 auto *Ind = Induction.first; 5399 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5400 5401 // Determine if all users of the induction variable are uniform after 5402 // vectorization. 5403 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5404 auto *I = cast<Instruction>(U); 5405 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5406 isVectorizedMemAccessUse(I, Ind); 5407 }); 5408 if (!UniformInd) 5409 continue; 5410 5411 // Determine if all users of the induction variable update instruction are 5412 // uniform after vectorization. 5413 auto UniformIndUpdate = 5414 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5415 auto *I = cast<Instruction>(U); 5416 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5417 isVectorizedMemAccessUse(I, IndUpdate); 5418 }); 5419 if (!UniformIndUpdate) 5420 continue; 5421 5422 // The induction variable and its update instruction will remain uniform. 5423 addToWorklistIfAllowed(Ind); 5424 addToWorklistIfAllowed(IndUpdate); 5425 } 5426 5427 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5428 } 5429 5430 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5431 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5432 5433 if (Legal->getRuntimePointerChecking()->Need) { 5434 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5435 "runtime pointer checks needed. Enable vectorization of this " 5436 "loop with '#pragma clang loop vectorize(enable)' when " 5437 "compiling with -Os/-Oz", 5438 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5439 return true; 5440 } 5441 5442 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5443 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5444 "runtime SCEV checks needed. Enable vectorization of this " 5445 "loop with '#pragma clang loop vectorize(enable)' when " 5446 "compiling with -Os/-Oz", 5447 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5448 return true; 5449 } 5450 5451 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5452 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5453 reportVectorizationFailure("Runtime stride check for small trip count", 5454 "runtime stride == 1 checks needed. Enable vectorization of " 5455 "this loop without such check by compiling with -Os/-Oz", 5456 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5457 return true; 5458 } 5459 5460 return false; 5461 } 5462 5463 Optional<ElementCount> 5464 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5465 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5466 // TODO: It may by useful to do since it's still likely to be dynamically 5467 // uniform if the target can skip. 5468 reportVectorizationFailure( 5469 "Not inserting runtime ptr check for divergent target", 5470 "runtime pointer checks needed. 
Not enabled for divergent target",
5471 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5472 return None;
5473 }
5474
5475 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5476 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5477 if (TC == 1) {
5478 reportVectorizationFailure("Single iteration (non) loop",
5479 "loop trip count is one, irrelevant for vectorization",
5480 "SingleIterationLoop", ORE, TheLoop);
5481 return None;
5482 }
5483
5484 ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5485
5486 switch (ScalarEpilogueStatus) {
5487 case CM_ScalarEpilogueAllowed:
5488 return MaxVF;
5489 case CM_ScalarEpilogueNotAllowedUsePredicate:
5490 LLVM_FALLTHROUGH;
5491 case CM_ScalarEpilogueNotNeededUsePredicate:
5492 LLVM_DEBUG(
5493 dbgs() << "LV: vector predicate hint/switch found.\n"
5494 << "LV: Not allowing scalar epilogue, creating predicated "
5495 << "vector loop.\n");
5496 break;
5497 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5498 // fallthrough as a special case of OptForSize
5499 case CM_ScalarEpilogueNotAllowedOptSize:
5500 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5501 LLVM_DEBUG(
5502 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5503 else
5504 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5505 << "count.\n");
5506
5507 // Bail if runtime checks are required, which are not good when optimising
5508 // for size.
5509 if (runtimeChecksRequired())
5510 return None;
5511
5512 break;
5513 }
5514
5515 // The only loops we can vectorize without a scalar epilogue are loops with
5516 // a bottom-test and a single exiting block. We'd have to handle the fact
5517 // that not every instruction executes on the last iteration. This will
5518 // require a lane mask which varies through the vector loop body. (TODO)
5519 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5520 // If there was a tail-folding hint/switch, but we can't fold the tail by
5521 // masking, fall back to a vectorization with a scalar epilogue.
5522 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5523 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5524 "scalar epilogue instead.\n");
5525 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5526 return MaxVF;
5527 }
5528 return None;
5529 }
5530
5531 // Now try tail folding.
5532
5533 // Invalidate interleave groups that require an epilogue if we can't mask
5534 // the interleave-group.
5535 if (!useMaskedInterleavedAccesses(TTI)) {
5536 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5537 "No decisions should have been taken at this point");
5538 // Note: There is no need to invalidate any cost modeling decisions here, as
5539 // none were taken so far.
5540 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5541 }
5542
5543 assert(!MaxVF.isScalable() &&
5544 "Scalable vectors do not yet support tail folding");
5545 assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5546 "MaxVF must be a power of 2");
5547 unsigned MaxVFtimesIC =
5548 UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5549 // Avoid tail folding if the trip count is known to be a multiple of any VF we
5550 // chose.
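// A small worked example with hypothetical numbers: if the trip count
// (BackedgeTakenCount + 1) is 64 and MaxVFtimesIC is 8, then 64 urem 8 == 0,
// so no scalar tail remains and tail folding is unnecessary; with a trip
// count of 70 the remainder is 6, and the code below instead tries to fold
// that tail by masking.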
5551 ScalarEvolution *SE = PSE.getSE(); 5552 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5553 const SCEV *ExitCount = SE->getAddExpr( 5554 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5555 const SCEV *Rem = SE->getURemExpr( 5556 ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5557 if (Rem->isZero()) { 5558 // Accept MaxVF if we do not have a tail. 5559 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5560 return MaxVF; 5561 } 5562 5563 // If we don't know the precise trip count, or if the trip count that we 5564 // found modulo the vectorization factor is not zero, try to fold the tail 5565 // by masking. 5566 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5567 if (Legal->prepareToFoldTailByMasking()) { 5568 FoldTailByMasking = true; 5569 return MaxVF; 5570 } 5571 5572 // If there was a tail-folding hint/switch, but we can't fold the tail by 5573 // masking, fallback to a vectorization with a scalar epilogue. 5574 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5575 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5576 "scalar epilogue instead.\n"); 5577 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5578 return MaxVF; 5579 } 5580 5581 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5582 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5583 return None; 5584 } 5585 5586 if (TC == 0) { 5587 reportVectorizationFailure( 5588 "Unable to calculate the loop count due to complex control flow", 5589 "unable to calculate the loop count due to complex control flow", 5590 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5591 return None; 5592 } 5593 5594 reportVectorizationFailure( 5595 "Cannot optimize for size and vectorize at the same time.", 5596 "cannot optimize for size and vectorize at the same time. " 5597 "Enable vectorization of this loop with '#pragma clang loop " 5598 "vectorize(enable)' when compiling with -Os/-Oz", 5599 "NoTailLoopWithOptForSize", ORE, TheLoop); 5600 return None; 5601 } 5602 5603 ElementCount 5604 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5605 ElementCount UserVF) { 5606 bool IgnoreScalableUserVF = UserVF.isScalable() && 5607 !TTI.supportsScalableVectors() && 5608 !ForceTargetSupportsScalableVectors; 5609 if (IgnoreScalableUserVF) { 5610 LLVM_DEBUG( 5611 dbgs() << "LV: Ignoring VF=" << UserVF 5612 << " because target does not support scalable vectors.\n"); 5613 ORE->emit([&]() { 5614 return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", 5615 TheLoop->getStartLoc(), 5616 TheLoop->getHeader()) 5617 << "Ignoring VF=" << ore::NV("UserVF", UserVF) 5618 << " because target does not support scalable vectors."; 5619 }); 5620 } 5621 5622 // Beyond this point two scenarios are handled. If UserVF isn't specified 5623 // then a suitable VF is chosen. If UserVF is specified and there are 5624 // dependencies, check if it's legal. However, if a UserVF is specified and 5625 // there are no dependencies, then there's nothing to do. 
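// Illustrative note: a non-zero UserVF typically comes from a hint such as
// "#pragma clang loop vectorize_width(8)". If loop-access analysis places no
// bound on the vectorization width (isSafeForAnyVectorWidth), the requested
// width is returned unchanged just below; otherwise it is clamped against the
// maximum safe dependence distance further down.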
5626 if (UserVF.isNonZero() && !IgnoreScalableUserVF &&
5627 Legal->isSafeForAnyVectorWidth())
5628 return UserVF;
5629
5630 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5631 unsigned SmallestType, WidestType;
5632 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5633 unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5634
5635 // Get the maximum safe dependence distance in bits computed by LAA.
5636 // It is computed as MaxVF * sizeOf(type) * 8, where the type is taken from
5637 // the memory access that is most restrictive (involved in the smallest
5638 // dependence distance).
5639 unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
5640
5641 // If the user vectorization factor is legally unsafe, clamp it to a safe
5642 // value. Otherwise, return it as is.
5643 if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
5644 unsigned MaxSafeElements =
5645 PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
5646 ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
5647
5648 if (UserVF.isScalable()) {
5649 Optional<unsigned> MaxVScale = TTI.getMaxVScale();
5650
5651 // Scale VF by vscale before checking if it's safe.
5652 MaxSafeVF = ElementCount::getScalable(
5653 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
5654
5655 if (MaxSafeVF.isZero()) {
5656 // The dependence distance is too small to use scalable vectors; fall
5657 // back on fixed-width vectorization.
5658 LLVM_DEBUG(
5659 dbgs()
5660 << "LV: Max legal vector width too small, scalable vectorization "
5661 "unfeasible. Using fixed-width vectorization instead.\n");
5662 ORE->emit([&]() {
5663 return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
5664 TheLoop->getStartLoc(),
5665 TheLoop->getHeader())
5666 << "Max legal vector width too small, scalable vectorization "
5667 << "unfeasible. Using fixed-width vectorization instead.";
5668 });
5669 return computeFeasibleMaxVF(
5670 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
5671 }
5672 }
5673
5674 LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");
5675
5676 if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
5677 return UserVF;
5678
5679 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5680 << " is unsafe, clamping to max safe VF=" << MaxSafeVF
5681 << ".\n");
5682 ORE->emit([&]() {
5683 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5684 TheLoop->getStartLoc(),
5685 TheLoop->getHeader())
5686 << "User-specified vectorization factor "
5687 << ore::NV("UserVectorizationFactor", UserVF)
5688 << " is unsafe, clamping to maximum safe vectorization factor "
5689 << ore::NV("VectorizationFactor", MaxSafeVF);
5690 });
5691 return MaxSafeVF;
5692 }
5693
5694 WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
5695
5696 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5697 // Note that both WidestRegister and WidestType may not be powers of 2.
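// Worked example with hypothetical numbers: a 256-bit widest register and a
// widest scalar type of 32 bits give PowerOf2Floor(256 / 32) = 8 elements per
// vector; if the safe dependence distance had already clamped WidestRegister
// down to 96 bits, the result would be PowerOf2Floor(96 / 32) = 2.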
5698 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); 5699 5700 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5701 << " / " << WidestType << " bits.\n"); 5702 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5703 << WidestRegister << " bits.\n"); 5704 5705 assert(MaxVectorSize <= WidestRegister && 5706 "Did not expect to pack so many elements" 5707 " into one vector!"); 5708 if (MaxVectorSize == 0) { 5709 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5710 MaxVectorSize = 1; 5711 return ElementCount::getFixed(MaxVectorSize); 5712 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5713 isPowerOf2_32(ConstTripCount)) { 5714 // We need to clamp the VF to be the ConstTripCount. There is no point in 5715 // choosing a higher viable VF as done in the loop below. 5716 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5717 << ConstTripCount << "\n"); 5718 MaxVectorSize = ConstTripCount; 5719 return ElementCount::getFixed(MaxVectorSize); 5720 } 5721 5722 unsigned MaxVF = MaxVectorSize; 5723 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5724 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5725 // Collect all viable vectorization factors larger than the default MaxVF 5726 // (i.e. MaxVectorSize). 5727 SmallVector<ElementCount, 8> VFs; 5728 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5729 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5730 VFs.push_back(ElementCount::getFixed(VS)); 5731 5732 // For each VF calculate its register usage. 5733 auto RUs = calculateRegisterUsage(VFs); 5734 5735 // Select the largest VF which doesn't require more registers than existing 5736 // ones. 5737 for (int i = RUs.size() - 1; i >= 0; --i) { 5738 bool Selected = true; 5739 for (auto& pair : RUs[i].MaxLocalUsers) { 5740 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5741 if (pair.second > TargetNumRegisters) 5742 Selected = false; 5743 } 5744 if (Selected) { 5745 MaxVF = VFs[i].getKnownMinValue(); 5746 break; 5747 } 5748 } 5749 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5750 if (MaxVF < MinVF) { 5751 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5752 << ") with target's minimum: " << MinVF << '\n'); 5753 MaxVF = MinVF; 5754 } 5755 } 5756 } 5757 return ElementCount::getFixed(MaxVF); 5758 } 5759 5760 VectorizationFactor 5761 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 5762 // FIXME: This can be fixed for scalable vectors later, because at this stage 5763 // the LoopVectorizer will only consider vectorizing a loop with scalable 5764 // vectors when the loop has a hint to enable vectorization for a given VF. 5765 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 5766 5767 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5768 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5769 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5770 5771 unsigned Width = 1; 5772 const float ScalarCost = *ExpectedCost.getValue(); 5773 float Cost = ScalarCost; 5774 5775 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5776 if (ForceVectorization && MaxVF.isVector()) { 5777 // Ignore scalar width, because the user explicitly wants vectorization. 5778 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5779 // evaluation. 
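// Illustrative numbers for the selection loop below: if the scalar loop costs
// 8 and the VF = 4 body costs 12, the normalized cost is 12 / 4 = 3 per lane,
// which beats the scalar cost of 8, so VF = 4 becomes the current best width
// unless an even wider factor turns out to be cheaper still.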
5780 Cost = std::numeric_limits<float>::max(); 5781 } 5782 5783 for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) { 5784 // Notice that the vector loop needs to be executed less times, so 5785 // we need to divide the cost of the vector loops by the width of 5786 // the vector elements. 5787 VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); 5788 assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); 5789 float VectorCost = *C.first.getValue() / (float)i; 5790 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5791 << " costs: " << (int)VectorCost << ".\n"); 5792 if (!C.second && !ForceVectorization) { 5793 LLVM_DEBUG( 5794 dbgs() << "LV: Not considering vector loop of width " << i 5795 << " because it will not generate any vector instructions.\n"); 5796 continue; 5797 } 5798 5799 // If profitable add it to ProfitableVF list. 5800 if (VectorCost < ScalarCost) { 5801 ProfitableVFs.push_back(VectorizationFactor( 5802 {ElementCount::getFixed(i), (unsigned)VectorCost})); 5803 } 5804 5805 if (VectorCost < Cost) { 5806 Cost = VectorCost; 5807 Width = i; 5808 } 5809 } 5810 5811 if (!EnableCondStoresVectorization && NumPredStores) { 5812 reportVectorizationFailure("There are conditional stores.", 5813 "store that is conditionally executed prevents vectorization", 5814 "ConditionalStore", ORE, TheLoop); 5815 Width = 1; 5816 Cost = ScalarCost; 5817 } 5818 5819 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5820 << "LV: Vectorization seems to be not beneficial, " 5821 << "but was forced by a user.\n"); 5822 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5823 VectorizationFactor Factor = {ElementCount::getFixed(Width), 5824 (unsigned)(Width * Cost)}; 5825 return Factor; 5826 } 5827 5828 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5829 const Loop &L, ElementCount VF) const { 5830 // Cross iteration phis such as reductions need special handling and are 5831 // currently unsupported. 5832 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5833 return Legal->isFirstOrderRecurrence(&Phi) || 5834 Legal->isReductionVariable(&Phi); 5835 })) 5836 return false; 5837 5838 // Phis with uses outside of the loop require special handling and are 5839 // currently unsupported. 5840 for (auto &Entry : Legal->getInductionVars()) { 5841 // Look for uses of the value of the induction at the last iteration. 5842 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5843 for (User *U : PostInc->users()) 5844 if (!L.contains(cast<Instruction>(U))) 5845 return false; 5846 // Look for uses of penultimate value of the induction. 5847 for (User *U : Entry.first->users()) 5848 if (!L.contains(cast<Instruction>(U))) 5849 return false; 5850 } 5851 5852 // Induction variables that are widened require special handling that is 5853 // currently not supported. 5854 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5855 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5856 this->isProfitableToScalarize(Entry.first, VF)); 5857 })) 5858 return false; 5859 5860 return true; 5861 } 5862 5863 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5864 const ElementCount VF) const { 5865 // FIXME: We need a much better cost-model to take different parameters such 5866 // as register pressure, code size increase and cost of extra branches into 5867 // account. 
For now we apply a very crude heuristic and only consider loops 5868 // with vectorization factors larger than a certain value. 5869 // We also consider epilogue vectorization unprofitable for targets that don't 5870 // consider interleaving beneficial (eg. MVE). 5871 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5872 return false; 5873 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5874 return true; 5875 return false; 5876 } 5877 5878 VectorizationFactor 5879 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5880 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5881 VectorizationFactor Result = VectorizationFactor::Disabled(); 5882 if (!EnableEpilogueVectorization) { 5883 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5884 return Result; 5885 } 5886 5887 if (!isScalarEpilogueAllowed()) { 5888 LLVM_DEBUG( 5889 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5890 "allowed.\n";); 5891 return Result; 5892 } 5893 5894 // FIXME: This can be fixed for scalable vectors later, because at this stage 5895 // the LoopVectorizer will only consider vectorizing a loop with scalable 5896 // vectors when the loop has a hint to enable vectorization for a given VF. 5897 if (MainLoopVF.isScalable()) { 5898 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 5899 "yet supported.\n"); 5900 return Result; 5901 } 5902 5903 // Not really a cost consideration, but check for unsupported cases here to 5904 // simplify the logic. 5905 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5906 LLVM_DEBUG( 5907 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5908 "not a supported candidate.\n";); 5909 return Result; 5910 } 5911 5912 if (EpilogueVectorizationForceVF > 1) { 5913 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5914 if (LVP.hasPlanWithVFs( 5915 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 5916 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 5917 else { 5918 LLVM_DEBUG( 5919 dbgs() 5920 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5921 return Result; 5922 } 5923 } 5924 5925 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5926 TheLoop->getHeader()->getParent()->hasMinSize()) { 5927 LLVM_DEBUG( 5928 dbgs() 5929 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5930 return Result; 5931 } 5932 5933 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 5934 return Result; 5935 5936 for (auto &NextVF : ProfitableVFs) 5937 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 5938 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && 5939 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 5940 Result = NextVF; 5941 5942 if (Result != VectorizationFactor::Disabled()) 5943 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5944 << Result.Width.getFixedValue() << "\n";); 5945 return Result; 5946 } 5947 5948 std::pair<unsigned, unsigned> 5949 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5950 unsigned MinWidth = -1U; 5951 unsigned MaxWidth = 8; 5952 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5953 5954 // For each block. 5955 for (BasicBlock *BB : TheLoop->blocks()) { 5956 // For each instruction in the loop. 5957 for (Instruction &I : BB->instructionsWithoutDebug()) { 5958 Type *T = I.getType(); 5959 5960 // Skip ignored values. 
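// Illustrative note for this scan (hypothetical loop): a loop that loads i8
// pixels and accumulates them into an i32 reduction yields
// {MinWidth, MaxWidth} = {8, 32}; the smallest width later drives the
// bandwidth-maximization candidates and the widest width bounds the feasible
// VF.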
5961 if (ValuesToIgnore.count(&I)) 5962 continue; 5963 5964 // Only examine Loads, Stores and PHINodes. 5965 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5966 continue; 5967 5968 // Examine PHI nodes that are reduction variables. Update the type to 5969 // account for the recurrence type. 5970 if (auto *PN = dyn_cast<PHINode>(&I)) { 5971 if (!Legal->isReductionVariable(PN)) 5972 continue; 5973 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5974 T = RdxDesc.getRecurrenceType(); 5975 } 5976 5977 // Examine the stored values. 5978 if (auto *ST = dyn_cast<StoreInst>(&I)) 5979 T = ST->getValueOperand()->getType(); 5980 5981 // Ignore loaded pointer types and stored pointer types that are not 5982 // vectorizable. 5983 // 5984 // FIXME: The check here attempts to predict whether a load or store will 5985 // be vectorized. We only know this for certain after a VF has 5986 // been selected. Here, we assume that if an access can be 5987 // vectorized, it will be. We should also look at extending this 5988 // optimization to non-pointer types. 5989 // 5990 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5991 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5992 continue; 5993 5994 MinWidth = std::min(MinWidth, 5995 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5996 MaxWidth = std::max(MaxWidth, 5997 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5998 } 5999 } 6000 6001 return {MinWidth, MaxWidth}; 6002 } 6003 6004 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6005 unsigned LoopCost) { 6006 // -- The interleave heuristics -- 6007 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6008 // There are many micro-architectural considerations that we can't predict 6009 // at this level. For example, frontend pressure (on decode or fetch) due to 6010 // code size, or the number and capabilities of the execution ports. 6011 // 6012 // We use the following heuristics to select the interleave count: 6013 // 1. If the code has reductions, then we interleave to break the cross 6014 // iteration dependency. 6015 // 2. If the loop is really small, then we interleave to reduce the loop 6016 // overhead. 6017 // 3. We don't interleave if we think that we will spill registers to memory 6018 // due to the increased register pressure. 6019 6020 if (!isScalarEpilogueAllowed()) 6021 return 1; 6022 6023 // We used the distance for the interleave count. 6024 if (Legal->getMaxSafeDepDistBytes() != -1U) 6025 return 1; 6026 6027 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6028 const bool HasReductions = !Legal->getReductionVars().empty(); 6029 // Do not interleave loops with a relatively small known or estimated trip 6030 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6031 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6032 // because with the above conditions interleaving can expose ILP and break 6033 // cross iteration dependences for reductions. 6034 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6035 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6036 return 1; 6037 6038 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6039 // We divide by these constants so assume that we have at least one 6040 // instruction that uses at least one register. 
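// A worked example of the formula used below, with hypothetical numbers:
// given 32 vector registers, 2 of them tied up by loop-invariant values and a
// peak of 5 simultaneously live values in the loop, the base estimate is
// PowerOf2Floor((32 - 2) / 5) = PowerOf2Floor(6) = 4 interleaved copies,
// before the optional induction-variable adjustment below.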
6041 for (auto& pair : R.MaxLocalUsers) { 6042 pair.second = std::max(pair.second, 1U); 6043 } 6044 6045 // We calculate the interleave count using the following formula. 6046 // Subtract the number of loop invariants from the number of available 6047 // registers. These registers are used by all of the interleaved instances. 6048 // Next, divide the remaining registers by the number of registers that is 6049 // required by the loop, in order to estimate how many parallel instances 6050 // fit without causing spills. All of this is rounded down if necessary to be 6051 // a power of two. We want power of two interleave count to simplify any 6052 // addressing operations or alignment considerations. 6053 // We also want power of two interleave counts to ensure that the induction 6054 // variable of the vector loop wraps to zero, when tail is folded by masking; 6055 // this currently happens when OptForSize, in which case IC is set to 1 above. 6056 unsigned IC = UINT_MAX; 6057 6058 for (auto& pair : R.MaxLocalUsers) { 6059 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6060 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6061 << " registers of " 6062 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6063 if (VF.isScalar()) { 6064 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6065 TargetNumRegisters = ForceTargetNumScalarRegs; 6066 } else { 6067 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6068 TargetNumRegisters = ForceTargetNumVectorRegs; 6069 } 6070 unsigned MaxLocalUsers = pair.second; 6071 unsigned LoopInvariantRegs = 0; 6072 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6073 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6074 6075 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6076 // Don't count the induction variable as interleaved. 6077 if (EnableIndVarRegisterHeur) { 6078 TmpIC = 6079 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6080 std::max(1U, (MaxLocalUsers - 1))); 6081 } 6082 6083 IC = std::min(IC, TmpIC); 6084 } 6085 6086 // Clamp the interleave ranges to reasonable counts. 6087 unsigned MaxInterleaveCount = 6088 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6089 6090 // Check if the user has overridden the max. 6091 if (VF.isScalar()) { 6092 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6093 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6094 } else { 6095 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6096 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6097 } 6098 6099 // If trip count is known or estimated compile time constant, limit the 6100 // interleave count to be less than the trip count divided by VF, provided it 6101 // is at least 1. 6102 // 6103 // For scalable vectors we can't know if interleaving is beneficial. It may 6104 // not be beneficial for small loops if none of the lanes in the second vector 6105 // iterations is enabled. However, for larger loops, there is likely to be a 6106 // similar benefit as for fixed-width vectors. For now, we choose to leave 6107 // the InterleaveCount as if vscale is '1', although if some information about 6108 // the vector is known (e.g. min vector size), we can make a better decision. 6109 if (BestKnownTC) { 6110 MaxInterleaveCount = 6111 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6112 // Make sure MaxInterleaveCount is greater than 0. 
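// Hypothetical example for the clamp above: with an estimated trip count of
// 20 and VF = 8, 20 / 8 = 2, so at most two interleaved copies of the vector
// body can run before the estimated iterations are exhausted; the std::max
// below keeps this bound from collapsing to 0 when the trip count is smaller
// than VF.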
6113 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6114 } 6115 6116 assert(MaxInterleaveCount > 0 && 6117 "Maximum interleave count must be greater than 0"); 6118 6119 // Clamp the calculated IC to be between the 1 and the max interleave count 6120 // that the target and trip count allows. 6121 if (IC > MaxInterleaveCount) 6122 IC = MaxInterleaveCount; 6123 else 6124 // Make sure IC is greater than 0. 6125 IC = std::max(1u, IC); 6126 6127 assert(IC > 0 && "Interleave count must be greater than 0."); 6128 6129 // If we did not calculate the cost for VF (because the user selected the VF) 6130 // then we calculate the cost of VF here. 6131 if (LoopCost == 0) { 6132 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6133 LoopCost = *expectedCost(VF).first.getValue(); 6134 } 6135 6136 assert(LoopCost && "Non-zero loop cost expected"); 6137 6138 // Interleave if we vectorized this loop and there is a reduction that could 6139 // benefit from interleaving. 6140 if (VF.isVector() && HasReductions) { 6141 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6142 return IC; 6143 } 6144 6145 // Note that if we've already vectorized the loop we will have done the 6146 // runtime check and so interleaving won't require further checks. 6147 bool InterleavingRequiresRuntimePointerCheck = 6148 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6149 6150 // We want to interleave small loops in order to reduce the loop overhead and 6151 // potentially expose ILP opportunities. 6152 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6153 << "LV: IC is " << IC << '\n' 6154 << "LV: VF is " << VF << '\n'); 6155 const bool AggressivelyInterleaveReductions = 6156 TTI.enableAggressiveInterleaving(HasReductions); 6157 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6158 // We assume that the cost overhead is 1 and we use the cost model 6159 // to estimate the cost of the loop and interleave until the cost of the 6160 // loop overhead is about 5% of the cost of the loop. 6161 unsigned SmallIC = 6162 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6163 6164 // Interleave until store/load ports (estimated by max interleave count) are 6165 // saturated. 6166 unsigned NumStores = Legal->getNumStores(); 6167 unsigned NumLoads = Legal->getNumLoads(); 6168 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6169 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6170 6171 // If we have a scalar reduction (vector reductions are already dealt with 6172 // by this point), we can increase the critical path length if the loop 6173 // we're interleaving is inside another loop. Limit, by default to 2, so the 6174 // critical path only gets increased by one reduction operation. 6175 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6176 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6177 SmallIC = std::min(SmallIC, F); 6178 StoresIC = std::min(StoresIC, F); 6179 LoadsIC = std::min(LoadsIC, F); 6180 } 6181 6182 if (EnableLoadStoreRuntimeInterleave && 6183 std::max(StoresIC, LoadsIC) > SmallIC) { 6184 LLVM_DEBUG( 6185 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6186 return std::max(StoresIC, LoadsIC); 6187 } 6188 6189 // If there are scalar reductions and TTI has enabled aggressive 6190 // interleaving for reductions, we will interleave to expose ILP. 
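// Hypothetical numbers for the small-loop heuristics above: with IC = 8, a
// loop cost of 3 and SmallLoopCost = 20, SmallIC becomes
// min(8, PowerOf2Floor(20 / 3)) = 4; with 2 stores and 1 load, StoresIC =
// 8 / 2 = 4 and LoadsIC = 8 / 1 = 8, so the load/store-port heuristic (when
// enabled) would return max(4, 8) = 8 instead.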
6191 if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6192 AggressivelyInterleaveReductions) {
6193 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6194 // Interleave no less than SmallIC but not as aggressively as the normal IC
6195 // to satisfy the rare situation when resources are too limited.
6196 return std::max(IC / 2, SmallIC);
6197 } else {
6198 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6199 return SmallIC;
6200 }
6201 }
6202
6203 // Interleave if this is a large loop (small loops are already dealt with by
6204 // this point) that could benefit from interleaving.
6205 if (AggressivelyInterleaveReductions) {
6206 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6207 return IC;
6208 }
6209
6210 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6211 return 1;
6212 }
6213
6214 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6215 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6216 // This function calculates the register usage by measuring the highest number
6217 // of values that are alive at a single location. Obviously, this is a very
6218 // rough estimation. We scan the loop in topological order and
6219 // assign a number to each instruction. We use RPO to ensure that defs are
6220 // met before their users. We assume that each instruction that has in-loop
6221 // users starts an interval. We record every time that an in-loop value is
6222 // used, so we have a list of the first and last occurrences of each
6223 // instruction. Next, we transpose this data structure into a multi map that
6224 // holds the list of intervals that *end* at a specific location. This multi
6225 // map allows us to perform a linear search. We scan the instructions linearly
6226 // and record each time that a new interval starts, by placing it in a set.
6227 // If we find this value in the multi-map then we remove it from the set.
6228 // The max register usage is the maximum size of the set.
6229 // We also search for instructions that are defined outside the loop, but are
6230 // used inside the loop. We need this number separately from the max-interval
6231 // usage number because, when we unroll, loop-invariant values do not take
6232 // more registers.
6233 LoopBlocksDFS DFS(TheLoop);
6234 DFS.perform(LI);
6235
6236 RegisterUsage RU;
6237
6238 // Each 'key' in the map opens a new interval. The values
6239 // of the map are the index of the 'last seen' usage of the
6240 // instruction that is the key.
6241 using IntervalMap = DenseMap<Instruction *, unsigned>;
6242
6243 // Maps instruction to its index.
6244 SmallVector<Instruction *, 64> IdxToInstr;
6245 // Marks the end of each interval.
6246 IntervalMap EndPoint;
6247 // Saves the list of instructions that are used in the loop.
6248 SmallPtrSet<Instruction *, 8> Ends;
6249 // Saves the list of values that are used in the loop but are
6250 // defined outside the loop, such as arguments and constants.
6251 SmallPtrSet<Value *, 8> LoopInvariants;
6252
6253 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6254 for (Instruction &I : BB->instructionsWithoutDebug()) {
6255 IdxToInstr.push_back(&I);
6256
6257 // Save the end location of each USE.
6258 for (Value *U : I.operands()) {
6259 auto *Instr = dyn_cast<Instruction>(U);
6260
6261 // Ignore non-instruction values such as arguments, constants, etc.
6262 if (!Instr)
6263 continue;
6264
6265 // If this instruction is outside the loop then record it and continue.
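// A tiny illustration of the interval scheme described above (hypothetical
// instructions): for %a = load; %b = load; %c = add %a, %b; store %c, the
// intervals of %a and %b both end at the add, so two values are open at that
// point and the recorded maximum usage is 2; %c then opens a new interval
// that ends at the store.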
6266 if (!TheLoop->contains(Instr)) { 6267 LoopInvariants.insert(Instr); 6268 continue; 6269 } 6270 6271 // Overwrite previous end points. 6272 EndPoint[Instr] = IdxToInstr.size(); 6273 Ends.insert(Instr); 6274 } 6275 } 6276 } 6277 6278 // Saves the list of intervals that end with the index in 'key'. 6279 using InstrList = SmallVector<Instruction *, 2>; 6280 DenseMap<unsigned, InstrList> TransposeEnds; 6281 6282 // Transpose the EndPoints to a list of values that end at each index. 6283 for (auto &Interval : EndPoint) 6284 TransposeEnds[Interval.second].push_back(Interval.first); 6285 6286 SmallPtrSet<Instruction *, 8> OpenIntervals; 6287 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6288 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6289 6290 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6291 6292 // A lambda that gets the register usage for the given type and VF. 6293 const auto &TTICapture = TTI; 6294 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6295 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6296 return 0U; 6297 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6298 }; 6299 6300 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6301 Instruction *I = IdxToInstr[i]; 6302 6303 // Remove all of the instructions that end at this location. 6304 InstrList &List = TransposeEnds[i]; 6305 for (Instruction *ToRemove : List) 6306 OpenIntervals.erase(ToRemove); 6307 6308 // Ignore instructions that are never used within the loop. 6309 if (!Ends.count(I)) 6310 continue; 6311 6312 // Skip ignored values. 6313 if (ValuesToIgnore.count(I)) 6314 continue; 6315 6316 // For each VF find the maximum usage of registers. 6317 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6318 // Count the number of live intervals. 6319 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6320 6321 if (VFs[j].isScalar()) { 6322 for (auto Inst : OpenIntervals) { 6323 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6324 if (RegUsage.find(ClassID) == RegUsage.end()) 6325 RegUsage[ClassID] = 1; 6326 else 6327 RegUsage[ClassID] += 1; 6328 } 6329 } else { 6330 collectUniformsAndScalars(VFs[j]); 6331 for (auto Inst : OpenIntervals) { 6332 // Skip ignored values for VF > 1. 6333 if (VecValuesToIgnore.count(Inst)) 6334 continue; 6335 if (isScalarAfterVectorization(Inst, VFs[j])) { 6336 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6337 if (RegUsage.find(ClassID) == RegUsage.end()) 6338 RegUsage[ClassID] = 1; 6339 else 6340 RegUsage[ClassID] += 1; 6341 } else { 6342 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6343 if (RegUsage.find(ClassID) == RegUsage.end()) 6344 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6345 else 6346 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6347 } 6348 } 6349 } 6350 6351 for (auto& pair : RegUsage) { 6352 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6353 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6354 else 6355 MaxUsages[j][pair.first] = pair.second; 6356 } 6357 } 6358 6359 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6360 << OpenIntervals.size() << '\n'); 6361 6362 // Add the current instruction to the list of open intervals. 
6363 OpenIntervals.insert(I); 6364 } 6365 6366 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6367 SmallMapVector<unsigned, unsigned, 4> Invariant; 6368 6369 for (auto Inst : LoopInvariants) { 6370 unsigned Usage = 6371 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6372 unsigned ClassID = 6373 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6374 if (Invariant.find(ClassID) == Invariant.end()) 6375 Invariant[ClassID] = Usage; 6376 else 6377 Invariant[ClassID] += Usage; 6378 } 6379 6380 LLVM_DEBUG({ 6381 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6382 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6383 << " item\n"; 6384 for (const auto &pair : MaxUsages[i]) { 6385 dbgs() << "LV(REG): RegisterClass: " 6386 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6387 << " registers\n"; 6388 } 6389 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6390 << " item\n"; 6391 for (const auto &pair : Invariant) { 6392 dbgs() << "LV(REG): RegisterClass: " 6393 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6394 << " registers\n"; 6395 } 6396 }); 6397 6398 RU.LoopInvariantRegs = Invariant; 6399 RU.MaxLocalUsers = MaxUsages[i]; 6400 RUs[i] = RU; 6401 } 6402 6403 return RUs; 6404 } 6405 6406 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6407 // TODO: Cost model for emulated masked load/store is completely 6408 // broken. This hack guides the cost model to use an artificially 6409 // high enough value to practically disable vectorization with such 6410 // operations, except where previously deployed legality hack allowed 6411 // using very low cost values. This is to avoid regressions coming simply 6412 // from moving "masked load/store" check from legality to cost model. 6413 // Masked Load/Gather emulation was previously never allowed. 6414 // Limited number of Masked Store/Scatter emulation was allowed. 6415 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 6416 return isa<LoadInst>(I) || 6417 (isa<StoreInst>(I) && 6418 NumPredStores > NumberOfStoresToPredicate); 6419 } 6420 6421 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6422 // If we aren't vectorizing the loop, or if we've already collected the 6423 // instructions to scalarize, there's nothing to do. Collection may already 6424 // have occurred if we have a user-selected VF and are now computing the 6425 // expected cost for interleaving. 6426 if (VF.isScalar() || VF.isZero() || 6427 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6428 return; 6429 6430 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6431 // not profitable to scalarize any instructions, the presence of VF in the 6432 // map will indicate that we've analyzed it already. 6433 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6434 6435 // Find all the instructions that are scalar with predication in the loop and 6436 // determine if it would be better to not if-convert the blocks they are in. 6437 // If so, we also record the instructions to scalarize. 6438 for (BasicBlock *BB : TheLoop->blocks()) { 6439 if (!blockNeedsPredication(BB)) 6440 continue; 6441 for (Instruction &I : *BB) 6442 if (isScalarWithPredication(&I)) { 6443 ScalarCostsTy ScalarCosts; 6444 // Do not apply discount logic if hacked cost is needed 6445 // for emulated masked memrefs. 
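// Hypothetical numbers for the discount test below: computePredInstDiscount
// accumulates (vector cost - scalar cost) over the single-use chain feeding
// the predicated instruction, so if the vectorized chain would cost 10 and
// the scalarized, probability-scaled chain costs 6, the discount is 4 >= 0
// and the chain is recorded in ScalarCostsVF.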
6446 if (!useEmulatedMaskMemRefHack(&I) && 6447 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6448 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6449 // Remember that BB will remain after vectorization. 6450 PredicatedBBsAfterVectorization.insert(BB); 6451 } 6452 } 6453 } 6454 6455 int LoopVectorizationCostModel::computePredInstDiscount( 6456 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6457 assert(!isUniformAfterVectorization(PredInst, VF) && 6458 "Instruction marked uniform-after-vectorization will be predicated"); 6459 6460 // Initialize the discount to zero, meaning that the scalar version and the 6461 // vector version cost the same. 6462 InstructionCost Discount = 0; 6463 6464 // Holds instructions to analyze. The instructions we visit are mapped in 6465 // ScalarCosts. Those instructions are the ones that would be scalarized if 6466 // we find that the scalar version costs less. 6467 SmallVector<Instruction *, 8> Worklist; 6468 6469 // Returns true if the given instruction can be scalarized. 6470 auto canBeScalarized = [&](Instruction *I) -> bool { 6471 // We only attempt to scalarize instructions forming a single-use chain 6472 // from the original predicated block that would otherwise be vectorized. 6473 // Although not strictly necessary, we give up on instructions we know will 6474 // already be scalar to avoid traversing chains that are unlikely to be 6475 // beneficial. 6476 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6477 isScalarAfterVectorization(I, VF)) 6478 return false; 6479 6480 // If the instruction is scalar with predication, it will be analyzed 6481 // separately. We ignore it within the context of PredInst. 6482 if (isScalarWithPredication(I)) 6483 return false; 6484 6485 // If any of the instruction's operands are uniform after vectorization, 6486 // the instruction cannot be scalarized. This prevents, for example, a 6487 // masked load from being scalarized. 6488 // 6489 // We assume we will only emit a value for lane zero of an instruction 6490 // marked uniform after vectorization, rather than VF identical values. 6491 // Thus, if we scalarize an instruction that uses a uniform, we would 6492 // create uses of values corresponding to the lanes we aren't emitting code 6493 // for. This behavior can be changed by allowing getScalarValue to clone 6494 // the lane zero values for uniforms rather than asserting. 6495 for (Use &U : I->operands()) 6496 if (auto *J = dyn_cast<Instruction>(U.get())) 6497 if (isUniformAfterVectorization(J, VF)) 6498 return false; 6499 6500 // Otherwise, we can scalarize the instruction. 6501 return true; 6502 }; 6503 6504 // Compute the expected cost discount from scalarizing the entire expression 6505 // feeding the predicated instruction. We currently only consider expressions 6506 // that are single-use instruction chains. 6507 Worklist.push_back(PredInst); 6508 while (!Worklist.empty()) { 6509 Instruction *I = Worklist.pop_back_val(); 6510 6511 // If we've already analyzed the instruction, there's nothing to do. 6512 if (ScalarCosts.find(I) != ScalarCosts.end()) 6513 continue; 6514 6515 // Compute the cost of the vector instruction. Note that this cost already 6516 // includes the scalarization overhead of the predicated instruction. 6517 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6518 6519 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6520 // the instruction as if it wasn't if-converted and instead remained in the 6521 // predicated block. We will scale this cost by block probability after 6522 // computing the scalarization overhead. 6523 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6524 InstructionCost ScalarCost = 6525 VF.getKnownMinValue() * 6526 getInstructionCost(I, ElementCount::getFixed(1)).first; 6527 6528 // Compute the scalarization overhead of needed insertelement instructions 6529 // and phi nodes. 6530 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6531 ScalarCost += TTI.getScalarizationOverhead( 6532 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6533 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6534 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6535 ScalarCost += 6536 VF.getKnownMinValue() * 6537 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6538 } 6539 6540 // Compute the scalarization overhead of needed extractelement 6541 // instructions. For each of the instruction's operands, if the operand can 6542 // be scalarized, add it to the worklist; otherwise, account for the 6543 // overhead. 6544 for (Use &U : I->operands()) 6545 if (auto *J = dyn_cast<Instruction>(U.get())) { 6546 assert(VectorType::isValidElementType(J->getType()) && 6547 "Instruction has non-scalar type"); 6548 if (canBeScalarized(J)) 6549 Worklist.push_back(J); 6550 else if (needsExtract(J, VF)) { 6551 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6552 ScalarCost += TTI.getScalarizationOverhead( 6553 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6554 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6555 } 6556 } 6557 6558 // Scale the total scalar cost by block probability. 6559 ScalarCost /= getReciprocalPredBlockProb(); 6560 6561 // Compute the discount. A non-negative discount means the vector version 6562 // of the instruction costs more, and scalarizing would be beneficial. 6563 Discount += VectorCost - ScalarCost; 6564 ScalarCosts[I] = ScalarCost; 6565 } 6566 6567 return *Discount.getValue(); 6568 } 6569 6570 LoopVectorizationCostModel::VectorizationCostTy 6571 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6572 VectorizationCostTy Cost; 6573 6574 // For each block. 6575 for (BasicBlock *BB : TheLoop->blocks()) { 6576 VectorizationCostTy BlockCost; 6577 6578 // For each instruction in the old loop. 6579 for (Instruction &I : BB->instructionsWithoutDebug()) { 6580 // Skip ignored values. 6581 if (ValuesToIgnore.count(&I) || 6582 (VF.isVector() && VecValuesToIgnore.count(&I))) 6583 continue; 6584 6585 VectorizationCostTy C = getInstructionCost(&I, VF); 6586 6587 // Check if we should override the cost. 6588 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6589 C.first = InstructionCost(ForceTargetInstructionCost); 6590 6591 BlockCost.first += C.first; 6592 BlockCost.second |= C.second; 6593 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6594 << " for VF " << VF << " For instruction: " << I 6595 << '\n'); 6596 } 6597 6598 // If we are vectorizing a predicated block, it will have been 6599 // if-converted. This means that the block's instructions (aside from 6600 // stores and instructions that may divide by zero) will now be 6601 // unconditionally executed. For the scalar case, we may not always execute 6602 // the predicated block, if it is an if-else block. Thus, scale the block's 6603 // cost by the probability of executing it. 
blockNeedsPredication from 6604 // Legal is used so as to not include all blocks in tail folded loops. 6605 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6606 BlockCost.first /= getReciprocalPredBlockProb(); 6607 6608 Cost.first += BlockCost.first; 6609 Cost.second |= BlockCost.second; 6610 } 6611 6612 return Cost; 6613 } 6614 6615 /// Gets Address Access SCEV after verifying that the access pattern 6616 /// is loop invariant except the induction variable dependence. 6617 /// 6618 /// This SCEV can be sent to the Target in order to estimate the address 6619 /// calculation cost. 6620 static const SCEV *getAddressAccessSCEV( 6621 Value *Ptr, 6622 LoopVectorizationLegality *Legal, 6623 PredicatedScalarEvolution &PSE, 6624 const Loop *TheLoop) { 6625 6626 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6627 if (!Gep) 6628 return nullptr; 6629 6630 // We are looking for a gep with all loop invariant indices except for one 6631 // which should be an induction variable. 6632 auto SE = PSE.getSE(); 6633 unsigned NumOperands = Gep->getNumOperands(); 6634 for (unsigned i = 1; i < NumOperands; ++i) { 6635 Value *Opd = Gep->getOperand(i); 6636 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6637 !Legal->isInductionVariable(Opd)) 6638 return nullptr; 6639 } 6640 6641 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6642 return PSE.getSCEV(Ptr); 6643 } 6644 6645 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6646 return Legal->hasStride(I->getOperand(0)) || 6647 Legal->hasStride(I->getOperand(1)); 6648 } 6649 6650 unsigned 6651 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6652 ElementCount VF) { 6653 assert(VF.isVector() && 6654 "Scalarization cost of instruction implies vectorization."); 6655 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6656 Type *ValTy = getMemInstValueType(I); 6657 auto SE = PSE.getSE(); 6658 6659 unsigned AS = getLoadStoreAddressSpace(I); 6660 Value *Ptr = getLoadStorePointerOperand(I); 6661 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6662 6663 // Figure out whether the access is strided and get the stride value 6664 // if it's known in compile time 6665 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6666 6667 // Get the cost of the scalar memory instruction and address computation. 6668 unsigned Cost = 6669 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6670 6671 // Don't pass *I here, since it is scalar but will actually be part of a 6672 // vectorized loop where the user of it is a vectorized instruction. 6673 const Align Alignment = getLoadStoreAlignment(I); 6674 Cost += VF.getKnownMinValue() * 6675 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6676 AS, TTI::TCK_RecipThroughput); 6677 6678 // Get the overhead of the extractelement and insertelement instructions 6679 // we might create due to scalarization. 6680 Cost += getScalarizationOverhead(I, VF); 6681 6682 // If we have a predicated store, it may not be executed for each vector 6683 // lane. Scale the cost by the probability of executing the predicated 6684 // block. 6685 if (isPredicatedInst(I)) { 6686 Cost /= getReciprocalPredBlockProb(); 6687 6688 if (useEmulatedMaskMemRefHack(I)) 6689 // Artificially setting to a high enough value to practically disable 6690 // vectorization with such operations. 
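// A hypothetical walk-through of the scalarization formula above: with
// VF = 4, an address-computation cost of 1 and a scalar load cost of 4, the
// base cost is 4 * (1 + 4) = 20 before insert/extract overhead is added and
// before the division by the reciprocal block probability for predicated
// accesses; the emulated-mask hack then overrides all of this with the
// artificially large constant below.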
6691 Cost = 3000000; 6692 } 6693 6694 return Cost; 6695 } 6696 6697 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6698 ElementCount VF) { 6699 Type *ValTy = getMemInstValueType(I); 6700 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6701 Value *Ptr = getLoadStorePointerOperand(I); 6702 unsigned AS = getLoadStoreAddressSpace(I); 6703 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6704 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6705 6706 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6707 "Stride should be 1 or -1 for consecutive memory access"); 6708 const Align Alignment = getLoadStoreAlignment(I); 6709 unsigned Cost = 0; 6710 if (Legal->isMaskRequired(I)) 6711 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6712 CostKind); 6713 else 6714 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6715 CostKind, I); 6716 6717 bool Reverse = ConsecutiveStride < 0; 6718 if (Reverse) 6719 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6720 return Cost; 6721 } 6722 6723 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6724 ElementCount VF) { 6725 assert(Legal->isUniformMemOp(*I)); 6726 6727 Type *ValTy = getMemInstValueType(I); 6728 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6729 const Align Alignment = getLoadStoreAlignment(I); 6730 unsigned AS = getLoadStoreAddressSpace(I); 6731 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6732 if (isa<LoadInst>(I)) { 6733 return TTI.getAddressComputationCost(ValTy) + 6734 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6735 CostKind) + 6736 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6737 } 6738 StoreInst *SI = cast<StoreInst>(I); 6739 6740 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6741 return TTI.getAddressComputationCost(ValTy) + 6742 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6743 CostKind) + 6744 (isLoopInvariantStoreValue 6745 ? 0 6746 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6747 VF.getKnownMinValue() - 1)); 6748 } 6749 6750 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6751 ElementCount VF) { 6752 Type *ValTy = getMemInstValueType(I); 6753 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6754 const Align Alignment = getLoadStoreAlignment(I); 6755 const Value *Ptr = getLoadStorePointerOperand(I); 6756 6757 return TTI.getAddressComputationCost(VectorTy) + 6758 TTI.getGatherScatterOpCost( 6759 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6760 TargetTransformInfo::TCK_RecipThroughput, I); 6761 } 6762 6763 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6764 ElementCount VF) { 6765 Type *ValTy = getMemInstValueType(I); 6766 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6767 unsigned AS = getLoadStoreAddressSpace(I); 6768 6769 auto Group = getInterleavedAccessGroup(I); 6770 assert(Group && "Fail to get an interleaved access group."); 6771 6772 unsigned InterleaveFactor = Group->getFactor(); 6773 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6774 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6775 6776 // Holds the indices of existing members in an interleaved load group. 6777 // An interleaved store group doesn't need this as it doesn't allow gaps. 
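  // For example, an interleaved load group of factor 3 that only accesses
  // A[3*i] and A[3*i + 2] has a gap at index 1, so Indices would be {0, 2}
  // and no value is extracted for the missing member from the wide load.
  // (Illustrative; the actual members come from Group->getMember() below.)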
6778 SmallVector<unsigned, 4> Indices; 6779 if (isa<LoadInst>(I)) { 6780 for (unsigned i = 0; i < InterleaveFactor; i++) 6781 if (Group->getMember(i)) 6782 Indices.push_back(i); 6783 } 6784 6785 // Calculate the cost of the whole interleaved group. 6786 bool UseMaskForGaps = 6787 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6788 unsigned Cost = TTI.getInterleavedMemoryOpCost( 6789 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6790 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6791 6792 if (Group->isReverse()) { 6793 // TODO: Add support for reversed masked interleaved access. 6794 assert(!Legal->isMaskRequired(I) && 6795 "Reverse masked interleaved access not supported."); 6796 Cost += Group->getNumMembers() * 6797 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6798 } 6799 return Cost; 6800 } 6801 6802 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6803 ElementCount VF) { 6804 // Calculate scalar cost only. Vectorization cost should be ready at this 6805 // moment. 6806 if (VF.isScalar()) { 6807 Type *ValTy = getMemInstValueType(I); 6808 const Align Alignment = getLoadStoreAlignment(I); 6809 unsigned AS = getLoadStoreAddressSpace(I); 6810 6811 return TTI.getAddressComputationCost(ValTy) + 6812 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6813 TTI::TCK_RecipThroughput, I); 6814 } 6815 return getWideningCost(I, VF); 6816 } 6817 6818 LoopVectorizationCostModel::VectorizationCostTy 6819 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6820 ElementCount VF) { 6821 // If we know that this instruction will remain uniform, check the cost of 6822 // the scalar version. 6823 if (isUniformAfterVectorization(I, VF)) 6824 VF = ElementCount::getFixed(1); 6825 6826 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6827 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6828 6829 // Forced scalars do not have any scalarization overhead. 6830 auto ForcedScalar = ForcedScalars.find(VF); 6831 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6832 auto InstSet = ForcedScalar->second; 6833 if (InstSet.count(I)) 6834 return VectorizationCostTy( 6835 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6836 VF.getKnownMinValue()), 6837 false); 6838 } 6839 6840 Type *VectorTy; 6841 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6842 6843 bool TypeNotScalarized = 6844 VF.isVector() && VectorTy->isVectorTy() && 6845 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 6846 return VectorizationCostTy(C, TypeNotScalarized); 6847 } 6848 6849 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6850 ElementCount VF) { 6851 6852 assert(!VF.isScalable() && 6853 "cannot compute scalarization overhead for scalable vectorization"); 6854 if (VF.isScalar()) 6855 return 0; 6856 6857 unsigned Cost = 0; 6858 Type *RetTy = ToVectorTy(I->getType(), VF); 6859 if (!RetTy->isVoidTy() && 6860 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6861 Cost += TTI.getScalarizationOverhead( 6862 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 6863 true, false); 6864 6865 // Some targets keep addresses scalar. 6866 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6867 return Cost; 6868 6869 // Some targets support efficient element stores. 6870 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6871 return Cost; 6872 6873 // Collect operands to consider. 
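  // For a scalarized instruction whose operand is produced as a vector, each
  // lane generally needs an extractelement, e.g. at VF = 4 (sketch only):
  //
  //   %op.0 = extractelement <4 x i32> %vec.op, i32 0
  //   ...
  //   %op.3 = extractelement <4 x i32> %vec.op, i32 3
  //
  // That per-lane extract cost is what getOperandsScalarizationOverhead
  // models for the operands collected below.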
6874 CallInst *CI = dyn_cast<CallInst>(I); 6875 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6876 6877 // Skip operands that do not require extraction/scalarization and do not incur 6878 // any overhead. 6879 return Cost + TTI.getOperandsScalarizationOverhead( 6880 filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); 6881 } 6882 6883 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6884 if (VF.isScalar()) 6885 return; 6886 NumPredStores = 0; 6887 for (BasicBlock *BB : TheLoop->blocks()) { 6888 // For each instruction in the old loop. 6889 for (Instruction &I : *BB) { 6890 Value *Ptr = getLoadStorePointerOperand(&I); 6891 if (!Ptr) 6892 continue; 6893 6894 // TODO: We should generate better code and update the cost model for 6895 // predicated uniform stores. Today they are treated as any other 6896 // predicated store (see added test cases in 6897 // invariant-store-vectorization.ll). 6898 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6899 NumPredStores++; 6900 6901 if (Legal->isUniformMemOp(I)) { 6902 // TODO: Avoid replicating loads and stores instead of 6903 // relying on instcombine to remove them. 6904 // Load: Scalar load + broadcast 6905 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6906 unsigned Cost = getUniformMemOpCost(&I, VF); 6907 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6908 continue; 6909 } 6910 6911 // We assume that widening is the best solution when possible. 6912 if (memoryInstructionCanBeWidened(&I, VF)) { 6913 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6914 int ConsecutiveStride = 6915 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6916 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6917 "Expected consecutive stride."); 6918 InstWidening Decision = 6919 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6920 setWideningDecision(&I, VF, Decision, Cost); 6921 continue; 6922 } 6923 6924 // Choose between Interleaving, Gather/Scatter or Scalarization. 6925 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6926 unsigned NumAccesses = 1; 6927 if (isAccessInterleaved(&I)) { 6928 auto Group = getInterleavedAccessGroup(&I); 6929 assert(Group && "Fail to get an interleaved access group."); 6930 6931 // Make one decision for the whole group. 6932 if (getWideningDecision(&I, VF) != CM_Unknown) 6933 continue; 6934 6935 NumAccesses = Group->getNumMembers(); 6936 if (interleavedAccessCanBeWidened(&I, VF)) 6937 InterleaveCost = getInterleaveGroupCost(&I, VF); 6938 } 6939 6940 unsigned GatherScatterCost = 6941 isLegalGatherOrScatter(&I) 6942 ? getGatherScatterCost(&I, VF) * NumAccesses 6943 : std::numeric_limits<unsigned>::max(); 6944 6945 unsigned ScalarizationCost = 6946 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6947 6948 // Choose better solution for the current VF, 6949 // write down this decision and use it during vectorization. 6950 unsigned Cost; 6951 InstWidening Decision; 6952 if (InterleaveCost <= GatherScatterCost && 6953 InterleaveCost < ScalarizationCost) { 6954 Decision = CM_Interleave; 6955 Cost = InterleaveCost; 6956 } else if (GatherScatterCost < ScalarizationCost) { 6957 Decision = CM_GatherScatter; 6958 Cost = GatherScatterCost; 6959 } else { 6960 Decision = CM_Scalarize; 6961 Cost = ScalarizationCost; 6962 } 6963 // If the instructions belongs to an interleave group, the whole group 6964 // receives the same decision. 
The whole group receives the cost, but 6965 // the cost will actually be assigned to one instruction. 6966 if (auto Group = getInterleavedAccessGroup(&I)) 6967 setWideningDecision(Group, VF, Decision, Cost); 6968 else 6969 setWideningDecision(&I, VF, Decision, Cost); 6970 } 6971 } 6972 6973 // Make sure that any load of address and any other address computation 6974 // remains scalar unless there is gather/scatter support. This avoids 6975 // inevitable extracts into address registers, and also has the benefit of 6976 // activating LSR more, since that pass can't optimize vectorized 6977 // addresses. 6978 if (TTI.prefersVectorizedAddressing()) 6979 return; 6980 6981 // Start with all scalar pointer uses. 6982 SmallPtrSet<Instruction *, 8> AddrDefs; 6983 for (BasicBlock *BB : TheLoop->blocks()) 6984 for (Instruction &I : *BB) { 6985 Instruction *PtrDef = 6986 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6987 if (PtrDef && TheLoop->contains(PtrDef) && 6988 getWideningDecision(&I, VF) != CM_GatherScatter) 6989 AddrDefs.insert(PtrDef); 6990 } 6991 6992 // Add all instructions used to generate the addresses. 6993 SmallVector<Instruction *, 4> Worklist; 6994 for (auto *I : AddrDefs) 6995 Worklist.push_back(I); 6996 while (!Worklist.empty()) { 6997 Instruction *I = Worklist.pop_back_val(); 6998 for (auto &Op : I->operands()) 6999 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7000 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7001 AddrDefs.insert(InstOp).second) 7002 Worklist.push_back(InstOp); 7003 } 7004 7005 for (auto *I : AddrDefs) { 7006 if (isa<LoadInst>(I)) { 7007 // Setting the desired widening decision should ideally be handled in 7008 // by cost functions, but since this involves the task of finding out 7009 // if the loaded register is involved in an address computation, it is 7010 // instead changed here when we know this is the case. 7011 InstWidening Decision = getWideningDecision(I, VF); 7012 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7013 // Scalarize a widened load of address. 7014 setWideningDecision( 7015 I, VF, CM_Scalarize, 7016 (VF.getKnownMinValue() * 7017 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7018 else if (auto Group = getInterleavedAccessGroup(I)) { 7019 // Scalarize an interleave group of address loads. 7020 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7021 if (Instruction *Member = Group->getMember(I)) 7022 setWideningDecision( 7023 Member, VF, CM_Scalarize, 7024 (VF.getKnownMinValue() * 7025 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7026 } 7027 } 7028 } else 7029 // Make sure I gets scalarized and a cost estimate without 7030 // scalarization overhead. 7031 ForcedScalars[VF].insert(I); 7032 } 7033 } 7034 7035 InstructionCost 7036 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7037 Type *&VectorTy) { 7038 Type *RetTy = I->getType(); 7039 if (canTruncateToMinimalBitwidth(I, VF)) 7040 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7041 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 7042 auto SE = PSE.getSE(); 7043 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7044 7045 // TODO: We need to estimate the cost of intrinsic calls. 7046 switch (I->getOpcode()) { 7047 case Instruction::GetElementPtr: 7048 // We mark this instruction as zero-cost because the cost of GEPs in 7049 // vectorized code depends on whether the corresponding memory instruction 7050 // is scalarized or not. 
Therefore, we handle GEPs with the memory 7051 // instruction cost. 7052 return 0; 7053 case Instruction::Br: { 7054 // In cases of scalarized and predicated instructions, there will be VF 7055 // predicated blocks in the vectorized loop. Each branch around these 7056 // blocks requires also an extract of its vector compare i1 element. 7057 bool ScalarPredicatedBB = false; 7058 BranchInst *BI = cast<BranchInst>(I); 7059 if (VF.isVector() && BI->isConditional() && 7060 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7061 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7062 ScalarPredicatedBB = true; 7063 7064 if (ScalarPredicatedBB) { 7065 // Return cost for branches around scalarized and predicated blocks. 7066 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7067 auto *Vec_i1Ty = 7068 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7069 return (TTI.getScalarizationOverhead( 7070 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7071 false, true) + 7072 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7073 VF.getKnownMinValue())); 7074 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7075 // The back-edge branch will remain, as will all scalar branches. 7076 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7077 else 7078 // This branch will be eliminated by if-conversion. 7079 return 0; 7080 // Note: We currently assume zero cost for an unconditional branch inside 7081 // a predicated block since it will become a fall-through, although we 7082 // may decide in the future to call TTI for all branches. 7083 } 7084 case Instruction::PHI: { 7085 auto *Phi = cast<PHINode>(I); 7086 7087 // First-order recurrences are replaced by vector shuffles inside the loop. 7088 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7089 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7090 return TTI.getShuffleCost( 7091 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7092 VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7093 7094 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7095 // converted into select instructions. We require N - 1 selects per phi 7096 // node, where N is the number of incoming values. 7097 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7098 return (Phi->getNumIncomingValues() - 1) * 7099 TTI.getCmpSelInstrCost( 7100 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7101 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7102 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7103 7104 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7105 } 7106 case Instruction::UDiv: 7107 case Instruction::SDiv: 7108 case Instruction::URem: 7109 case Instruction::SRem: 7110 // If we have a predicated instruction, it may not be executed for each 7111 // vector lane. Get the scalarization cost and scale this amount by the 7112 // probability of executing the predicated block. If the instruction is not 7113 // predicated, we fall through to the next case. 7114 if (VF.isVector() && isScalarWithPredication(I)) { 7115 unsigned Cost = 0; 7116 7117 // These instructions have a non-void type, so account for the phi nodes 7118 // that we will create. This cost is likely to be zero. The phi node 7119 // cost, if any, should be scaled by the block probability because it 7120 // models a copy at the end of each predicated block. 
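      // Roughly, for a predicated sdiv at VF = 4 this computes something like
      //   (4 * phi-cost + 4 * sdiv-cost + insert/extract overhead) / 2,
      // assuming the usual reciprocal block probability of 2, i.e. each
      // predicated block is expected to execute for about half the lanes.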
7121 Cost += VF.getKnownMinValue() * 7122 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7123 7124 // The cost of the non-predicated instruction. 7125 Cost += VF.getKnownMinValue() * 7126 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7127 7128 // The cost of insertelement and extractelement instructions needed for 7129 // scalarization. 7130 Cost += getScalarizationOverhead(I, VF); 7131 7132 // Scale the cost by the probability of executing the predicated blocks. 7133 // This assumes the predicated block for each vector lane is equally 7134 // likely. 7135 return Cost / getReciprocalPredBlockProb(); 7136 } 7137 LLVM_FALLTHROUGH; 7138 case Instruction::Add: 7139 case Instruction::FAdd: 7140 case Instruction::Sub: 7141 case Instruction::FSub: 7142 case Instruction::Mul: 7143 case Instruction::FMul: 7144 case Instruction::FDiv: 7145 case Instruction::FRem: 7146 case Instruction::Shl: 7147 case Instruction::LShr: 7148 case Instruction::AShr: 7149 case Instruction::And: 7150 case Instruction::Or: 7151 case Instruction::Xor: { 7152 // Since we will replace the stride by 1 the multiplication should go away. 7153 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7154 return 0; 7155 // Certain instructions can be cheaper to vectorize if they have a constant 7156 // second vector operand. One example of this are shifts on x86. 7157 Value *Op2 = I->getOperand(1); 7158 TargetTransformInfo::OperandValueProperties Op2VP; 7159 TargetTransformInfo::OperandValueKind Op2VK = 7160 TTI.getOperandInfo(Op2, Op2VP); 7161 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7162 Op2VK = TargetTransformInfo::OK_UniformValue; 7163 7164 SmallVector<const Value *, 4> Operands(I->operand_values()); 7165 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7166 return N * TTI.getArithmeticInstrCost( 7167 I->getOpcode(), VectorTy, CostKind, 7168 TargetTransformInfo::OK_AnyValue, 7169 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7170 } 7171 case Instruction::FNeg: { 7172 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7173 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7174 return N * TTI.getArithmeticInstrCost( 7175 I->getOpcode(), VectorTy, CostKind, 7176 TargetTransformInfo::OK_AnyValue, 7177 TargetTransformInfo::OK_AnyValue, 7178 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7179 I->getOperand(0), I); 7180 } 7181 case Instruction::Select: { 7182 SelectInst *SI = cast<SelectInst>(I); 7183 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7184 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7185 Type *CondTy = SI->getCondition()->getType(); 7186 if (!ScalarCond) { 7187 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7188 CondTy = VectorType::get(CondTy, VF); 7189 } 7190 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7191 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7192 } 7193 case Instruction::ICmp: 7194 case Instruction::FCmp: { 7195 Type *ValTy = I->getOperand(0)->getType(); 7196 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7197 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7198 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7199 VectorTy = ToVectorTy(ValTy, VF); 7200 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7201 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7202 } 7203 case Instruction::Store: 7204 case Instruction::Load: { 7205 ElementCount Width = VF; 7206 if (Width.isVector()) { 7207 InstWidening Decision = getWideningDecision(I, Width); 7208 assert(Decision != CM_Unknown && 7209 "CM decision should be taken at this point"); 7210 if (Decision == CM_Scalarize) 7211 Width = ElementCount::getFixed(1); 7212 } 7213 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7214 return getMemoryInstructionCost(I, VF); 7215 } 7216 case Instruction::ZExt: 7217 case Instruction::SExt: 7218 case Instruction::FPToUI: 7219 case Instruction::FPToSI: 7220 case Instruction::FPExt: 7221 case Instruction::PtrToInt: 7222 case Instruction::IntToPtr: 7223 case Instruction::SIToFP: 7224 case Instruction::UIToFP: 7225 case Instruction::Trunc: 7226 case Instruction::FPTrunc: 7227 case Instruction::BitCast: { 7228 // Computes the CastContextHint from a Load/Store instruction. 7229 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7230 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7231 "Expected a load or a store!"); 7232 7233 if (VF.isScalar() || !TheLoop->contains(I)) 7234 return TTI::CastContextHint::Normal; 7235 7236 switch (getWideningDecision(I, VF)) { 7237 case LoopVectorizationCostModel::CM_GatherScatter: 7238 return TTI::CastContextHint::GatherScatter; 7239 case LoopVectorizationCostModel::CM_Interleave: 7240 return TTI::CastContextHint::Interleave; 7241 case LoopVectorizationCostModel::CM_Scalarize: 7242 case LoopVectorizationCostModel::CM_Widen: 7243 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7244 : TTI::CastContextHint::Normal; 7245 case LoopVectorizationCostModel::CM_Widen_Reverse: 7246 return TTI::CastContextHint::Reversed; 7247 case LoopVectorizationCostModel::CM_Unknown: 7248 llvm_unreachable("Instr did not go through cost modelling?"); 7249 } 7250 7251 llvm_unreachable("Unhandled case!"); 7252 }; 7253 7254 unsigned Opcode = I->getOpcode(); 7255 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7256 // For Trunc, the context is the only user, which must be a StoreInst. 
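    // E.g. a truncating store and an extending load (illustrative IR only):
    //
    //   %t = trunc i32 %x to i16          ; context is the store below
    //   store i16 %t, i16* %p
    //
    //   %l = load i16, i16* %q            ; context for the zext below
    //   %e = zext i16 %l to i32
    //
    // The widening decision of that memory instruction (normal, masked,
    // reversed, gather/scatter, interleave) determines the CastContextHint.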
7257 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7258 if (I->hasOneUse()) 7259 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7260 CCH = ComputeCCH(Store); 7261 } 7262 // For Z/Sext, the context is the operand, which must be a LoadInst. 7263 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7264 Opcode == Instruction::FPExt) { 7265 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7266 CCH = ComputeCCH(Load); 7267 } 7268 7269 // We optimize the truncation of induction variables having constant 7270 // integer steps. The cost of these truncations is the same as the scalar 7271 // operation. 7272 if (isOptimizableIVTruncate(I, VF)) { 7273 auto *Trunc = cast<TruncInst>(I); 7274 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7275 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7276 } 7277 7278 Type *SrcScalarTy = I->getOperand(0)->getType(); 7279 Type *SrcVecTy = 7280 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7281 if (canTruncateToMinimalBitwidth(I, VF)) { 7282 // This cast is going to be shrunk. This may remove the cast or it might 7283 // turn it into slightly different cast. For example, if MinBW == 16, 7284 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7285 // 7286 // Calculate the modified src and dest types. 7287 Type *MinVecTy = VectorTy; 7288 if (Opcode == Instruction::Trunc) { 7289 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7290 VectorTy = 7291 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7292 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7293 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7294 VectorTy = 7295 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7296 } 7297 } 7298 7299 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7300 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7301 return N * 7302 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7303 } 7304 case Instruction::Call: { 7305 bool NeedToScalarize; 7306 CallInst *CI = cast<CallInst>(I); 7307 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7308 if (getVectorIntrinsicIDForCall(CI, TLI)) 7309 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 7310 return CallCost; 7311 } 7312 case Instruction::ExtractValue: 7313 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7314 default: 7315 // The cost of executing VF copies of the scalar instruction. This opcode 7316 // is unknown. Assume that it is the same as 'mul'. 7317 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7318 Instruction::Mul, VectorTy, CostKind) + 7319 getScalarizationOverhead(I, VF); 7320 } // end of switch. 
7321 } 7322 7323 char LoopVectorize::ID = 0; 7324 7325 static const char lv_name[] = "Loop Vectorization"; 7326 7327 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7328 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7329 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7330 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7331 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7332 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7333 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7334 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7335 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7336 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7337 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7338 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7339 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7340 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7341 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7342 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7343 7344 namespace llvm { 7345 7346 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7347 7348 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7349 bool VectorizeOnlyWhenForced) { 7350 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7351 } 7352 7353 } // end namespace llvm 7354 7355 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7356 // Check if the pointer operand of a load or store instruction is 7357 // consecutive. 7358 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7359 return Legal->isConsecutivePtr(Ptr); 7360 return false; 7361 } 7362 7363 void LoopVectorizationCostModel::collectValuesToIgnore() { 7364 // Ignore ephemeral values. 7365 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7366 7367 // Ignore type-promoting instructions we identified during reduction 7368 // detection. 7369 for (auto &Reduction : Legal->getReductionVars()) { 7370 RecurrenceDescriptor &RedDes = Reduction.second; 7371 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7372 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7373 } 7374 // Ignore type-casting instructions we identified during induction 7375 // detection. 7376 for (auto &Induction : Legal->getInductionVars()) { 7377 InductionDescriptor &IndDes = Induction.second; 7378 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7379 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7380 } 7381 } 7382 7383 void LoopVectorizationCostModel::collectInLoopReductions() { 7384 for (auto &Reduction : Legal->getReductionVars()) { 7385 PHINode *Phi = Reduction.first; 7386 RecurrenceDescriptor &RdxDesc = Reduction.second; 7387 7388 // We don't collect reductions that are type promoted (yet). 7389 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7390 continue; 7391 7392 // If the target would prefer this reduction to happen "in-loop", then we 7393 // want to record it as such. 7394 unsigned Opcode = RdxDesc.getOpcode(); 7395 if (!PreferInLoopReductions && 7396 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7397 TargetTransformInfo::ReductionFlags())) 7398 continue; 7399 7400 // Check that we can correctly put the reductions into the loop, by 7401 // finding the chain of operations that leads from the phi to the loop 7402 // exit value. 
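    // For example, for an integer add reduction
    //
    //   %sum = phi i32 [ 0, %ph ], [ %sum.next, %loop ]
    //   %sum.next = add i32 %sum, %val
    //
    // the chain is simply {%sum.next}; if no such chain can be found, the
    // reduction stays out-of-loop. (Illustrative IR, names invented.)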
    SmallVector<Instruction *, 4> ReductionOperations =
        RdxDesc.getReductionOpChain(Phi, TheLoop);
    bool InLoop = !ReductionOperations.empty();
    if (InLoop)
      InLoopReductionChains[Phi] = ReductionOperations;
    LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
                      << " reduction for phi: " << *Phi << "\n");
  }
}

// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}

VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  assert(!UserVF.isScalable() && "scalable vectors not yet supported");
  ElementCount VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = ElementCount::getFixed(
          determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(4);
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
  if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
    return None;

  // Invalidate interleave groups if all blocks of loop will be predicated.
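  // For example, when the tail is folded by masking, a factor-2 interleaved
  // load would have to become a single *masked* wide load feeding two
  // shuffles; if the target reports no masked-interleaved support, the group
  // cannot be kept and is conservatively dropped below. (Illustrative
  // scenario only.)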
7476 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7477 !useMaskedInterleavedAccesses(*TTI)) { 7478 LLVM_DEBUG( 7479 dbgs() 7480 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7481 "which requires masked-interleaved support.\n"); 7482 if (CM.InterleaveInfo.invalidateGroups()) 7483 // Invalidating interleave groups also requires invalidating all decisions 7484 // based on them, which includes widening decisions and uniform and scalar 7485 // values. 7486 CM.invalidateCostModelingDecisions(); 7487 } 7488 7489 ElementCount MaxVF = MaybeMaxVF.getValue(); 7490 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7491 7492 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); 7493 if (!UserVF.isZero() && 7494 (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { 7495 // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable 7496 // VFs here, this should be reverted to only use legal UserVFs once the 7497 // loop below supports scalable VFs. 7498 ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; 7499 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") 7500 << " VF " << VF << ".\n"); 7501 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7502 "VF needs to be a power of two"); 7503 // Collect the instructions (and their associated costs) that will be more 7504 // profitable to scalarize. 7505 CM.selectUserVectorizationFactor(VF); 7506 CM.collectInLoopReductions(); 7507 buildVPlansWithVPRecipes(VF, VF); 7508 LLVM_DEBUG(printPlans(dbgs())); 7509 return {{VF, 0}}; 7510 } 7511 7512 assert(!MaxVF.isScalable() && 7513 "Scalable vectors not yet supported beyond this point"); 7514 7515 for (ElementCount VF = ElementCount::getFixed(1); 7516 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7517 // Collect Uniform and Scalar instructions after vectorization with VF. 7518 CM.collectUniformsAndScalars(VF); 7519 7520 // Collect the instructions (and their associated costs) that will be more 7521 // profitable to scalarize. 7522 if (VF.isVector()) 7523 CM.collectInstsToScalarize(VF); 7524 } 7525 7526 CM.collectInLoopReductions(); 7527 7528 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7529 LLVM_DEBUG(printPlans(dbgs())); 7530 if (MaxVF.isScalar()) 7531 return VectorizationFactor::Disabled(); 7532 7533 // Select the optimal vectorization factor. 7534 return CM.selectVectorizationFactor(MaxVF); 7535 } 7536 7537 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7538 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7539 << '\n'); 7540 BestVF = VF; 7541 BestUF = UF; 7542 7543 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7544 return !Plan->hasVF(VF); 7545 }); 7546 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7547 } 7548 7549 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7550 DominatorTree *DT) { 7551 // Perform the actual loop transformation. 7552 7553 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 
7554 VPCallbackILV CallbackILV(ILV); 7555 7556 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7557 7558 VPTransformState State{*BestVF, BestUF, LI, 7559 DT, ILV.Builder, ILV.VectorLoopValueMap, 7560 &ILV, CallbackILV}; 7561 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7562 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7563 State.CanonicalIV = ILV.Induction; 7564 7565 ILV.printDebugTracesAtStart(); 7566 7567 //===------------------------------------------------===// 7568 // 7569 // Notice: any optimization or new instruction that go 7570 // into the code below should also be implemented in 7571 // the cost-model. 7572 // 7573 //===------------------------------------------------===// 7574 7575 // 2. Copy and widen instructions from the old loop into the new loop. 7576 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7577 VPlans.front()->execute(&State); 7578 7579 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7580 // predication, updating analyses. 7581 ILV.fixVectorizedLoop(); 7582 7583 ILV.printDebugTracesAtEnd(); 7584 } 7585 7586 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7587 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7588 7589 // We create new control-flow for the vectorized loop, so the original exit 7590 // conditions will be dead after vectorization if it's only used by the 7591 // terminator 7592 SmallVector<BasicBlock*> ExitingBlocks; 7593 OrigLoop->getExitingBlocks(ExitingBlocks); 7594 for (auto *BB : ExitingBlocks) { 7595 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 7596 if (!Cmp || !Cmp->hasOneUse()) 7597 continue; 7598 7599 // TODO: we should introduce a getUniqueExitingBlocks on Loop 7600 if (!DeadInstructions.insert(Cmp).second) 7601 continue; 7602 7603 // The operands of the icmp is often a dead trunc, used by IndUpdate. 7604 // TODO: can recurse through operands in general 7605 for (Value *Op : Cmp->operands()) { 7606 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7607 DeadInstructions.insert(cast<Instruction>(Op)); 7608 } 7609 } 7610 7611 // We create new "steps" for induction variable updates to which the original 7612 // induction variables map. An original update instruction will be dead if 7613 // all its users except the induction variable are dead. 7614 auto *Latch = OrigLoop->getLoopLatch(); 7615 for (auto &Induction : Legal->getInductionVars()) { 7616 PHINode *Ind = Induction.first; 7617 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7618 7619 // If the tail is to be folded by masking, the primary induction variable, 7620 // if exists, isn't dead: it will be used for masking. Don't kill it. 7621 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7622 continue; 7623 7624 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7625 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7626 })) 7627 DeadInstructions.insert(IndUpdate); 7628 7629 // We record as "Dead" also the type-casting instructions we had identified 7630 // during induction analysis. We don't need any handling for them in the 7631 // vectorized loop because we have proven that, under a proper runtime 7632 // test guarding the vectorized loop, the value of the phi, and the casted 7633 // value of the phi, are the same. The last instruction in this casting chain 7634 // will get its scalar/vector/widened def from the scalar/vector/widened def 7635 // of the respective phi node. 
Any other casts in the induction def-use chain 7636 // have no other uses outside the phi update chain, and will be ignored. 7637 InductionDescriptor &IndDes = Induction.second; 7638 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7639 DeadInstructions.insert(Casts.begin(), Casts.end()); 7640 } 7641 } 7642 7643 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7644 7645 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7646 7647 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7648 Instruction::BinaryOps BinOp) { 7649 // When unrolling and the VF is 1, we only need to add a simple scalar. 7650 Type *Ty = Val->getType(); 7651 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7652 7653 if (Ty->isFloatingPointTy()) { 7654 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7655 7656 // Floating point operations had to be 'fast' to enable the unrolling. 7657 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 7658 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 7659 } 7660 Constant *C = ConstantInt::get(Ty, StartIdx); 7661 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7662 } 7663 7664 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7665 SmallVector<Metadata *, 4> MDs; 7666 // Reserve first location for self reference to the LoopID metadata node. 7667 MDs.push_back(nullptr); 7668 bool IsUnrollMetadata = false; 7669 MDNode *LoopID = L->getLoopID(); 7670 if (LoopID) { 7671 // First find existing loop unrolling disable metadata. 7672 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7673 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7674 if (MD) { 7675 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7676 IsUnrollMetadata = 7677 S && S->getString().startswith("llvm.loop.unroll.disable"); 7678 } 7679 MDs.push_back(LoopID->getOperand(i)); 7680 } 7681 } 7682 7683 if (!IsUnrollMetadata) { 7684 // Add runtime unroll disable metadata. 7685 LLVMContext &Context = L->getHeader()->getContext(); 7686 SmallVector<Metadata *, 1> DisableOperands; 7687 DisableOperands.push_back( 7688 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7689 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7690 MDs.push_back(DisableNode); 7691 MDNode *NewLoopID = MDNode::get(Context, MDs); 7692 // Set operand 0 to refer to the loop id itself. 7693 NewLoopID->replaceOperandWith(0, NewLoopID); 7694 L->setLoopID(NewLoopID); 7695 } 7696 } 7697 7698 //===--------------------------------------------------------------------===// 7699 // EpilogueVectorizerMainLoop 7700 //===--------------------------------------------------------------------===// 7701 7702 /// This function is partially responsible for generating the control flow 7703 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7704 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7705 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7706 Loop *Lp = createVectorLoopSkeleton(""); 7707 7708 // Generate the code to check the minimum iteration count of the vector 7709 // epilogue (see below). 7710 EPI.EpilogueIterationCountCheck = 7711 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 7712 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7713 7714 // Generate the code to check any assumptions that we've made for SCEV 7715 // expressions. 
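  // Such assumptions are the runtime predicates collected in PSE, e.g. a
  // check that a narrow induction variable does not wrap, or that a symbolic
  // stride is in fact 1; if any of them fails at runtime we branch to the
  // scalar loop. (Examples only; the concrete checks depend on the loop.)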
  BasicBlock *SavedPreHeader = LoopVectorPreHeader;
  emitSCEVChecks(Lp, LoopScalarPreHeader);

  // If a safety check was generated, save it.
  if (SavedPreHeader != LoopVectorPreHeader)
    EPI.SCEVSafetyCheck = SavedPreHeader;

  // Generate the code that checks at runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  SavedPreHeader = LoopVectorPreHeader;
  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

  // If a safety check was generated, save/overwrite it.
  if (SavedPreHeader != LoopVectorPreHeader)
    EPI.MemSafetyCheck = SavedPreHeader;

  // Generate the iteration count check for the main loop, *after* the check
  // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer path length for
  // the main loop is compensated for by the gain from vectorizing the larger
  // trip count. Note: the branch will get updated later on when we vectorize
  // the epilogue.
  EPI.MainLoopIterationCountCheck =
      emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);

  // Generate the induction variable.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  EPI.VectorTripCount = CountRoundDown;
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Skip induction resume value creation here because they will be created in
  // the second pass. If we created them here, they wouldn't be used anyway,
  // because the vplan in the second pass still contains the inductions from
  // the original loop.

  return completeLoopSkeleton(Lp, OrigLoopID);
}

void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
  LLVM_DEBUG({
    dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
           << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
           << ", Main Loop UF:" << EPI.MainLoopUF
           << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
           << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
  });
}

void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
  DEBUG_WITH_TYPE(VerboseDebug, {
    dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
  });
}

BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
    Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
  assert(L && "Expected valid Loop.");
  assert(Bypass && "Expected valid bypass basic block.");
  unsigned VFactor =
      ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
  unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
  Value *Count = getOrCreateTripCount(L);
  // Reuse existing vector loop preheader for TC checks.
  // Note that a new preheader block is generated for the vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VFactor *
  // UFactor of the corresponding (main or epilogue) vector loop.
  auto P =
      Cost->requiresScalarEpilogue() ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7794 7795 Value *CheckMinIters = Builder.CreateICmp( 7796 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 7797 "min.iters.check"); 7798 7799 if (!ForEpilogue) 7800 TCCheckBlock->setName("vector.main.loop.iter.check"); 7801 7802 // Create new preheader for vector loop. 7803 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7804 DT, LI, nullptr, "vector.ph"); 7805 7806 if (ForEpilogue) { 7807 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7808 DT->getNode(Bypass)->getIDom()) && 7809 "TC check is expected to dominate Bypass"); 7810 7811 // Update dominator for Bypass & LoopExit. 7812 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7813 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7814 7815 LoopBypassBlocks.push_back(TCCheckBlock); 7816 7817 // Save the trip count so we don't have to regenerate it in the 7818 // vec.epilog.iter.check. This is safe to do because the trip count 7819 // generated here dominates the vector epilog iter check. 7820 EPI.TripCount = Count; 7821 } 7822 7823 ReplaceInstWithInst( 7824 TCCheckBlock->getTerminator(), 7825 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7826 7827 return TCCheckBlock; 7828 } 7829 7830 //===--------------------------------------------------------------------===// 7831 // EpilogueVectorizerEpilogueLoop 7832 //===--------------------------------------------------------------------===// 7833 7834 /// This function is partially responsible for generating the control flow 7835 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7836 BasicBlock * 7837 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7838 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7839 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 7840 7841 // Now, compare the remaining count and if there aren't enough iterations to 7842 // execute the vectorized epilogue skip to the scalar part. 7843 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7844 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7845 LoopVectorPreHeader = 7846 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7847 LI, nullptr, "vec.epilog.ph"); 7848 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 7849 VecEpilogueIterationCountCheck); 7850 7851 // Adjust the control flow taking the state info from the main loop 7852 // vectorization into account. 
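  // Concretely (simplified): "iter.check" now branches straight to the scalar
  // preheader when the trip count is too small even for the epilogue VF;
  // "vector.main.loop.iter.check" branches to the new "vec.epilog.ph" when
  // only the main vector loop must be skipped; and "vec.epilog.iter.check"
  // chooses between the epilogue vector loop and the scalar remainder. The
  // edge and dominator updates below implement exactly this rewiring.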
7853 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7854 "expected this to be saved from the previous pass."); 7855 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7856 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7857 7858 DT->changeImmediateDominator(LoopVectorPreHeader, 7859 EPI.MainLoopIterationCountCheck); 7860 7861 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7862 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7863 7864 if (EPI.SCEVSafetyCheck) 7865 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7866 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7867 if (EPI.MemSafetyCheck) 7868 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7869 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7870 7871 DT->changeImmediateDominator( 7872 VecEpilogueIterationCountCheck, 7873 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7874 7875 DT->changeImmediateDominator(LoopScalarPreHeader, 7876 EPI.EpilogueIterationCountCheck); 7877 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 7878 7879 // Keep track of bypass blocks, as they feed start values to the induction 7880 // phis in the scalar loop preheader. 7881 if (EPI.SCEVSafetyCheck) 7882 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7883 if (EPI.MemSafetyCheck) 7884 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7885 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7886 7887 // Generate a resume induction for the vector epilogue and put it in the 7888 // vector epilogue preheader 7889 Type *IdxTy = Legal->getWidestInductionType(); 7890 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 7891 LoopVectorPreHeader->getFirstNonPHI()); 7892 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 7893 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 7894 EPI.MainLoopIterationCountCheck); 7895 7896 // Generate the induction variable. 7897 OldInduction = Legal->getPrimaryInduction(); 7898 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7899 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7900 Value *StartIdx = EPResumeVal; 7901 Induction = 7902 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7903 getDebugLocFromInstOrOperands(OldInduction)); 7904 7905 // Generate induction resume values. These variables save the new starting 7906 // indexes for the scalar loop. They are used to test if there are any tail 7907 // iterations left once the vector loop has completed. 7908 // Note that when the vectorized epilogue is skipped due to iteration count 7909 // check, then the resume value for the induction variable comes from 7910 // the trip count of the main vector loop, hence passing the AdditionalBypass 7911 // argument. 
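  // For example, with a trip count of 1003, main loop VF x UF = 16 and
  // epilogue VF x UF = 8: the main vector loop covers iterations 0..991, the
  // epilogue vector loop covers 992..999, and the scalar loop does the final
  // 3. If the main loop is skipped, the epilogue resumes from 0; if the
  // epilogue is skipped, the scalar loop resumes from the main loop's vector
  // trip count (992) via AdditionalBypass. (Numbers are illustrative only.)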
7912 createInductionResumeValues(Lp, CountRoundDown, 7913 {VecEpilogueIterationCountCheck, 7914 EPI.VectorTripCount} /* AdditionalBypass */); 7915 7916 AddRuntimeUnrollDisableMetaData(Lp); 7917 return completeLoopSkeleton(Lp, OrigLoopID); 7918 } 7919 7920 BasicBlock * 7921 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 7922 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 7923 7924 assert(EPI.TripCount && 7925 "Expected trip count to have been safed in the first pass."); 7926 assert( 7927 (!isa<Instruction>(EPI.TripCount) || 7928 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 7929 "saved trip count does not dominate insertion point."); 7930 Value *TC = EPI.TripCount; 7931 IRBuilder<> Builder(Insert->getTerminator()); 7932 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 7933 7934 // Generate code to check if the loop's trip count is less than VF * UF of the 7935 // vector epilogue loop. 7936 auto P = 7937 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7938 7939 Value *CheckMinIters = Builder.CreateICmp( 7940 P, Count, 7941 ConstantInt::get(Count->getType(), 7942 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 7943 "min.epilog.iters.check"); 7944 7945 ReplaceInstWithInst( 7946 Insert->getTerminator(), 7947 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7948 7949 LoopBypassBlocks.push_back(Insert); 7950 return Insert; 7951 } 7952 7953 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7954 LLVM_DEBUG({ 7955 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 7956 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7957 << ", Main Loop UF:" << EPI.MainLoopUF 7958 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7959 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7960 }); 7961 } 7962 7963 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 7964 DEBUG_WITH_TYPE(VerboseDebug, { 7965 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 7966 }); 7967 } 7968 7969 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7970 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7971 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 7972 bool PredicateAtRangeStart = Predicate(Range.Start); 7973 7974 for (ElementCount TmpVF = Range.Start * 2; 7975 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 7976 if (Predicate(TmpVF) != PredicateAtRangeStart) { 7977 Range.End = TmpVF; 7978 break; 7979 } 7980 7981 return PredicateAtRangeStart; 7982 } 7983 7984 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7985 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7986 /// of VF's starting at a given VF and extending it as much as possible. Each 7987 /// vectorization decision can potentially shorten this sub-range during 7988 /// buildVPlan(). 7989 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 7990 ElementCount MaxVF) { 7991 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 7992 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 7993 VFRange SubRange = {VF, MaxVFPlusOne}; 7994 VPlans.push_back(buildVPlan(SubRange)); 7995 VF = SubRange.End; 7996 } 7997 } 7998 7999 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8000 VPlanPtr &Plan) { 8001 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8002 8003 // Look for cached value. 
8004 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8005 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8006 if (ECEntryIt != EdgeMaskCache.end()) 8007 return ECEntryIt->second; 8008 8009 VPValue *SrcMask = createBlockInMask(Src, Plan); 8010 8011 // The terminator has to be a branch inst! 8012 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8013 assert(BI && "Unexpected terminator found"); 8014 8015 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8016 return EdgeMaskCache[Edge] = SrcMask; 8017 8018 // If source is an exiting block, we know the exit edge is dynamically dead 8019 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8020 // adding uses of an otherwise potentially dead instruction. 8021 if (OrigLoop->isLoopExiting(Src)) 8022 return EdgeMaskCache[Edge] = SrcMask; 8023 8024 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8025 assert(EdgeMask && "No Edge Mask found for condition"); 8026 8027 if (BI->getSuccessor(0) != Dst) 8028 EdgeMask = Builder.createNot(EdgeMask); 8029 8030 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 8031 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 8032 8033 return EdgeMaskCache[Edge] = EdgeMask; 8034 } 8035 8036 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8037 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8038 8039 // Look for cached value. 8040 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8041 if (BCEntryIt != BlockMaskCache.end()) 8042 return BCEntryIt->second; 8043 8044 // All-one mask is modelled as no-mask following the convention for masked 8045 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8046 VPValue *BlockMask = nullptr; 8047 8048 if (OrigLoop->getHeader() == BB) { 8049 if (!CM.blockNeedsPredication(BB)) 8050 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8051 8052 // Create the block in mask as the first non-phi instruction in the block. 8053 VPBuilder::InsertPointGuard Guard(Builder); 8054 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8055 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8056 8057 // Introduce the early-exit compare IV <= BTC to form header block mask. 8058 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8059 // Start by constructing the desired canonical IV. 8060 VPValue *IV = nullptr; 8061 if (Legal->getPrimaryInduction()) 8062 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8063 else { 8064 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8065 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8066 IV = IVRecipe->getVPValue(); 8067 } 8068 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8069 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8070 8071 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8072 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8073 // as a second argument, we only pass the IV here and extract the 8074 // tripcount from the transform state where codegen of the VP instructions 8075 // happen. 8076 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8077 } else { 8078 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8079 } 8080 return BlockMaskCache[BB] = BlockMask; 8081 } 8082 8083 // This is the block mask. We OR all incoming edges. 
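  // For example, if BB can be reached from Pred1 under condition %c1 and from
  // Pred2 under %c2, the block mask is conceptually
  //   (mask(Pred1) & %c1) | (mask(Pred2) & %c2)
  // where each conjunct is the corresponding edge mask computed above, and a
  // null mask stands for all-ones. (Illustrative; the actual VPValues are
  // built by createEdgeMask and the VPInstruction builder.)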
8084 for (auto *Predecessor : predecessors(BB)) { 8085 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8086 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8087 return BlockMaskCache[BB] = EdgeMask; 8088 8089 if (!BlockMask) { // BlockMask has its initialized nullptr value. 8090 BlockMask = EdgeMask; 8091 continue; 8092 } 8093 8094 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8095 } 8096 8097 return BlockMaskCache[BB] = BlockMask; 8098 } 8099 8100 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 8101 VPlanPtr &Plan) { 8102 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8103 "Must be called with either a load or store"); 8104 8105 auto willWiden = [&](ElementCount VF) -> bool { 8106 if (VF.isScalar()) 8107 return false; 8108 LoopVectorizationCostModel::InstWidening Decision = 8109 CM.getWideningDecision(I, VF); 8110 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8111 "CM decision should be taken at this point."); 8112 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8113 return true; 8114 if (CM.isScalarAfterVectorization(I, VF) || 8115 CM.isProfitableToScalarize(I, VF)) 8116 return false; 8117 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8118 }; 8119 8120 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8121 return nullptr; 8122 8123 VPValue *Mask = nullptr; 8124 if (Legal->isMaskRequired(I)) 8125 Mask = createBlockInMask(I->getParent(), Plan); 8126 8127 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 8128 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8129 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 8130 8131 StoreInst *Store = cast<StoreInst>(I); 8132 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 8133 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 8134 } 8135 8136 VPWidenIntOrFpInductionRecipe * 8137 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { 8138 // Check if this is an integer or fp induction. If so, build the recipe that 8139 // produces its scalar and vector values. 8140 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8141 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8142 II.getKind() == InductionDescriptor::IK_FpInduction) { 8143 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8144 return new VPWidenIntOrFpInductionRecipe(Phi, Start); 8145 } 8146 8147 return nullptr; 8148 } 8149 8150 VPWidenIntOrFpInductionRecipe * 8151 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, 8152 VPlan &Plan) const { 8153 // Optimize the special case where the source is a constant integer 8154 // induction variable. Notice that we can only optimize the 'trunc' case 8155 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8156 // (c) other casts depend on pointer size. 8157 8158 // Determine whether \p K is a truncation based on an induction variable that 8159 // can be optimized. 
8160 auto isOptimizableIVTruncate = 8161 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8162 return [=](ElementCount VF) -> bool { 8163 return CM.isOptimizableIVTruncate(K, VF); 8164 }; 8165 }; 8166 8167 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8168 isOptimizableIVTruncate(I), Range)) { 8169 8170 InductionDescriptor II = 8171 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8172 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8173 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8174 Start, I); 8175 } 8176 return nullptr; 8177 } 8178 8179 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 8180 // We know that all PHIs in non-header blocks are converted into selects, so 8181 // we don't have to worry about the insertion order and we can just use the 8182 // builder. At this point we generate the predication tree. There may be 8183 // duplications since this is a simple recursive scan, but future 8184 // optimizations will clean it up. 8185 8186 SmallVector<VPValue *, 2> Operands; 8187 unsigned NumIncoming = Phi->getNumIncomingValues(); 8188 for (unsigned In = 0; In < NumIncoming; In++) { 8189 VPValue *EdgeMask = 8190 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8191 assert((EdgeMask || NumIncoming == 1) && 8192 "Multiple predecessors with one having a full mask"); 8193 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 8194 if (EdgeMask) 8195 Operands.push_back(EdgeMask); 8196 } 8197 return new VPBlendRecipe(Phi, Operands); 8198 } 8199 8200 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 8201 VPlan &Plan) const { 8202 8203 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8204 [this, CI](ElementCount VF) { 8205 return CM.isScalarWithPredication(CI, VF); 8206 }, 8207 Range); 8208 8209 if (IsPredicated) 8210 return nullptr; 8211 8212 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8213 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8214 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8215 ID == Intrinsic::pseudoprobe)) 8216 return nullptr; 8217 8218 auto willWiden = [&](ElementCount VF) -> bool { 8219 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8220 // The following case may be scalarized depending on the VF. 8221 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8222 // version of the instruction. 8223 // Is it beneficial to perform intrinsic call compared to lib call? 8224 bool NeedToScalarize = false; 8225 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8226 bool UseVectorIntrinsic = 8227 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 8228 return UseVectorIntrinsic || !NeedToScalarize; 8229 }; 8230 8231 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8232 return nullptr; 8233 8234 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 8235 } 8236 8237 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8238 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8239 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8240 // Instruction should be widened, unless it is scalar after vectorization, 8241 // scalarization is profitable or it is predicated. 
8242 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8243 return CM.isScalarAfterVectorization(I, VF) || 8244 CM.isProfitableToScalarize(I, VF) || 8245 CM.isScalarWithPredication(I, VF); 8246 }; 8247 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8248 Range); 8249 } 8250 8251 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 8252 auto IsVectorizableOpcode = [](unsigned Opcode) { 8253 switch (Opcode) { 8254 case Instruction::Add: 8255 case Instruction::And: 8256 case Instruction::AShr: 8257 case Instruction::BitCast: 8258 case Instruction::FAdd: 8259 case Instruction::FCmp: 8260 case Instruction::FDiv: 8261 case Instruction::FMul: 8262 case Instruction::FNeg: 8263 case Instruction::FPExt: 8264 case Instruction::FPToSI: 8265 case Instruction::FPToUI: 8266 case Instruction::FPTrunc: 8267 case Instruction::FRem: 8268 case Instruction::FSub: 8269 case Instruction::ICmp: 8270 case Instruction::IntToPtr: 8271 case Instruction::LShr: 8272 case Instruction::Mul: 8273 case Instruction::Or: 8274 case Instruction::PtrToInt: 8275 case Instruction::SDiv: 8276 case Instruction::Select: 8277 case Instruction::SExt: 8278 case Instruction::Shl: 8279 case Instruction::SIToFP: 8280 case Instruction::SRem: 8281 case Instruction::Sub: 8282 case Instruction::Trunc: 8283 case Instruction::UDiv: 8284 case Instruction::UIToFP: 8285 case Instruction::URem: 8286 case Instruction::Xor: 8287 case Instruction::ZExt: 8288 return true; 8289 } 8290 return false; 8291 }; 8292 8293 if (!IsVectorizableOpcode(I->getOpcode())) 8294 return nullptr; 8295 8296 // Success: widen this instruction. 8297 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 8298 } 8299 8300 VPBasicBlock *VPRecipeBuilder::handleReplication( 8301 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8302 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 8303 VPlanPtr &Plan) { 8304 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8305 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8306 Range); 8307 8308 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8309 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 8310 Range); 8311 8312 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8313 IsUniform, IsPredicated); 8314 setRecipe(I, Recipe); 8315 Plan->addVPValue(I, Recipe); 8316 8317 // Find if I uses a predicated instruction. If so, it will use its scalar 8318 // value. Avoid hoisting the insert-element which packs the scalar value into 8319 // a vector value, as that happens iff all users use the vector value. 8320 for (auto &Op : I->operands()) 8321 if (auto *PredInst = dyn_cast<Instruction>(Op)) 8322 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 8323 PredInst2Recipe[PredInst]->setAlsoPack(false); 8324 8325 // Finalize the recipe for Instr, first if it is not predicated. 8326 if (!IsPredicated) { 8327 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8328 VPBB->appendRecipe(Recipe); 8329 return VPBB; 8330 } 8331 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8332 assert(VPBB->getSuccessors().empty() && 8333 "VPBB has successors when handling predicated replication."); 8334 // Record predicated instructions for above packing optimizations. 
8335 PredInst2Recipe[I] = Recipe; 8336 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8337 VPBlockUtils::insertBlockAfter(Region, VPBB); 8338 auto *RegSucc = new VPBasicBlock(); 8339 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8340 return RegSucc; 8341 } 8342 8343 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8344 VPRecipeBase *PredRecipe, 8345 VPlanPtr &Plan) { 8346 // Instructions marked for predication are replicated and placed under an 8347 // if-then construct to prevent side-effects. 8348 8349 // Generate recipes to compute the block mask for this region. 8350 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8351 8352 // Build the triangular if-then region. 8353 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8354 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8355 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8356 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8357 auto *PHIRecipe = Instr->getType()->isVoidTy() 8358 ? nullptr 8359 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8360 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8361 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8362 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8363 8364 // Note: first set Entry as region entry and then connect successors starting 8365 // from it in order, to propagate the "parent" of each VPBasicBlock. 8366 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8367 VPBlockUtils::connectBlocks(Pred, Exit); 8368 8369 return Region; 8370 } 8371 8372 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8373 VFRange &Range, 8374 VPlanPtr &Plan) { 8375 // First, check for specific widening recipes that deal with calls, memory 8376 // operations, inductions and Phi nodes. 
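  // The checks below are ordered from most to least specific: calls, then
  // loads and stores, then header and non-header phis, then truncates of an
  // induction variable, and only then generic widening of arithmetic, GEPs
  // and selects. If none of them produces a recipe, the caller falls back to
  // replication.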
  if (auto *CI = dyn_cast<CallInst>(Instr))
    return tryToWidenCall(CI, Range, *Plan);

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return tryToWidenMemory(Instr, Range, Plan);

  VPRecipeBase *Recipe;
  if (auto Phi = dyn_cast<PHINode>(Instr)) {
    if (Phi->getParent() != OrigLoop->getHeader())
      return tryToBlend(Phi, Plan);
    if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan)))
      return Recipe;

    if (Legal->isReductionVariable(Phi)) {
      RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
      VPValue *StartV =
          Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue());
      return new VPWidenPHIRecipe(Phi, RdxDesc, *StartV);
    }

    return new VPWidenPHIRecipe(Phi);
  }

  if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
                                    cast<TruncInst>(Instr), Range, *Plan)))
    return Recipe;

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
                                OrigLoop);

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
                                   InvariantCond);
  }

  return tryToWiden(Instr, *Plan);
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const DenseMap<Instruction *, Instruction *> &SinkAfter) {

  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of vector value.
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto applyIG = [IG, this](ElementCount VF) -> bool {
      return (VF.isVector() && // Query is illegal for VF == 1
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
    if (!getDecisionAndClampRange(applyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
    for (unsigned i = 0; i < IG->getFactor(); i++)
      if (Instruction *Member = IG->getMember(i))
        RecipeBuilder.recordRecipeOf(Member);
  }

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
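  // The pre-entry block only serves as an anchor for insertBlockAfter while
  // the plan is being populated; it is disconnected and deleted once the loop
  // body has been visited (see below).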
  auto Plan = std::make_unique<VPlan>();
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  Plan->setEntry(VPBB);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
      if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
        continue;

      if (auto Recipe =
              RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
        for (auto *Def : Recipe->definedValues()) {
          auto *UV = Def->getUnderlyingValue();
          Plan->addVPValue(UV, Def);
        }

        RecipeBuilder.setRecipe(Instr, Recipe);
        VPBB->appendRecipe(Recipe);
        continue;
      }

      // Otherwise, if all widening options failed, Instruction is to be
      // replicated. This may create a successor for VPBB.
      VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
          Instr, Range, VPBB, PredInst2Recipe, Plan);
      if (NextVPBB != VPBB) {
        VPBB = NextVPBB;
        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                    : "");
      }
    }
  }

  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one, VPBB, reflecting original
  // basic-blocks with no recipes.
  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
  VPBlockUtils::disconnectBlocks(PreEntry, Entry);
  delete PreEntry;

  // ---------------------------------------------------------------------------
  // Transform initial VPlan: Apply previously taken decisions, in order, to
  // bring the VPlan to its final state.
  // ---------------------------------------------------------------------------

  // Apply Sink-After legal constraints.
  for (auto &Entry : SinkAfter) {
    VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
    VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
    // If the target is in a replication region, make sure to move Sink to the
    // block after it, not into the replication region itself.
8590 if (auto *Region = 8591 dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { 8592 if (Region->isReplicator()) { 8593 assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); 8594 VPBasicBlock *NextBlock = 8595 cast<VPBasicBlock>(Region->getSuccessors().front()); 8596 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8597 continue; 8598 } 8599 } 8600 Sink->moveAfter(Target); 8601 } 8602 8603 // Interleave memory: for each Interleave Group we marked earlier as relevant 8604 // for this VPlan, replace the Recipes widening its memory instructions with a 8605 // single VPInterleaveRecipe at its insertion point. 8606 for (auto IG : InterleaveGroups) { 8607 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8608 RecipeBuilder.getRecipe(IG->getInsertPos())); 8609 SmallVector<VPValue *, 4> StoredValues; 8610 for (unsigned i = 0; i < IG->getFactor(); ++i) 8611 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 8612 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 8613 8614 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8615 Recipe->getMask()); 8616 VPIG->insertBefore(Recipe); 8617 unsigned J = 0; 8618 for (unsigned i = 0; i < IG->getFactor(); ++i) 8619 if (Instruction *Member = IG->getMember(i)) { 8620 if (!Member->getType()->isVoidTy()) { 8621 VPValue *OriginalV = Plan->getVPValue(Member); 8622 Plan->removeVPValueFor(Member); 8623 Plan->addVPValue(Member, VPIG->getVPValue(J)); 8624 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8625 J++; 8626 } 8627 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8628 } 8629 } 8630 8631 // Adjust the recipes for any inloop reductions. 8632 if (Range.Start.isVector()) 8633 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 8634 8635 // Finally, if tail is folded by masking, introduce selects between the phi 8636 // and the live-out instruction of each reduction, at the end of the latch. 8637 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 8638 Builder.setInsertPoint(VPBB); 8639 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 8640 for (auto &Reduction : Legal->getReductionVars()) { 8641 if (CM.isInLoopReduction(Reduction.first)) 8642 continue; 8643 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 8644 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 8645 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 8646 } 8647 } 8648 8649 std::string PlanName; 8650 raw_string_ostream RSO(PlanName); 8651 ElementCount VF = Range.Start; 8652 Plan->addVF(VF); 8653 RSO << "Initial VPlan for VF={" << VF; 8654 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 8655 Plan->addVF(VF); 8656 RSO << "," << VF; 8657 } 8658 RSO << "},UF>=1"; 8659 RSO.flush(); 8660 Plan->setName(PlanName); 8661 8662 return Plan; 8663 } 8664 8665 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8666 // Outer loop handling: They may require CFG and instruction level 8667 // transformations before even evaluating whether vectorization is profitable. 8668 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8669 // the vectorization pipeline. 
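  // In contrast to the inner-loop path above, the plan built here initially
  // consists of plain VPInstructions wrapping the original IR; it is only
  // lowered to recipes by VPInstructionsToVPRecipes at the end and, when
  // VPlan predication is enabled, is returned right after predication without
  // that lowering.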
8670 assert(!OrigLoop->isInnermost()); 8671 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 8672 8673 // Create new empty VPlan 8674 auto Plan = std::make_unique<VPlan>(); 8675 8676 // Build hierarchical CFG 8677 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 8678 HCFGBuilder.buildHierarchicalCFG(); 8679 8680 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 8681 VF *= 2) 8682 Plan->addVF(VF); 8683 8684 if (EnableVPlanPredication) { 8685 VPlanPredicator VPP(*Plan); 8686 VPP.predicate(); 8687 8688 // Avoid running transformation to recipes until masked code generation in 8689 // VPlan-native path is in place. 8690 return Plan; 8691 } 8692 8693 SmallPtrSet<Instruction *, 1> DeadInstructions; 8694 VPlanTransforms::VPInstructionsToVPRecipes( 8695 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 8696 return Plan; 8697 } 8698 8699 // Adjust the recipes for any inloop reductions. The chain of instructions 8700 // leading from the loop exit instr to the phi need to be converted to 8701 // reductions, with one operand being vector and the other being the scalar 8702 // reduction chain. 8703 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( 8704 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { 8705 for (auto &Reduction : CM.getInLoopReductionChains()) { 8706 PHINode *Phi = Reduction.first; 8707 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 8708 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8709 8710 // ReductionOperations are orders top-down from the phi's use to the 8711 // LoopExitValue. We keep a track of the previous item (the Chain) to tell 8712 // which of the two operands will remain scalar and which will be reduced. 8713 // For minmax the chain will be the select instructions. 8714 Instruction *Chain = Phi; 8715 for (Instruction *R : ReductionOperations) { 8716 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 8717 RecurKind Kind = RdxDesc.getRecurrenceKind(); 8718 8719 VPValue *ChainOp = Plan->getVPValue(Chain); 8720 unsigned FirstOpId; 8721 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 8722 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 8723 "Expected to replace a VPWidenSelectSC"); 8724 FirstOpId = 1; 8725 } else { 8726 assert(isa<VPWidenRecipe>(WidenRecipe) && 8727 "Expected to replace a VPWidenSC"); 8728 FirstOpId = 0; 8729 } 8730 unsigned VecOpId = 8731 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 8732 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 8733 8734 auto *CondOp = CM.foldTailByMasking() 8735 ? 
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 8736 : nullptr; 8737 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 8738 &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI); 8739 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 8740 Plan->removeVPValueFor(R); 8741 Plan->addVPValue(R, RedRecipe); 8742 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 8743 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 8744 WidenRecipe->eraseFromParent(); 8745 8746 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 8747 VPRecipeBase *CompareRecipe = 8748 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 8749 assert(isa<VPWidenRecipe>(CompareRecipe) && 8750 "Expected to replace a VPWidenSC"); 8751 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 8752 "Expected no remaining users"); 8753 CompareRecipe->eraseFromParent(); 8754 } 8755 Chain = R; 8756 } 8757 } 8758 } 8759 8760 Value* LoopVectorizationPlanner::VPCallbackILV:: 8761 getOrCreateVectorValues(Value *V, unsigned Part) { 8762 return ILV.getOrCreateVectorValue(V, Part); 8763 } 8764 8765 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 8766 Value *V, const VPIteration &Instance) { 8767 return ILV.getOrCreateScalarValue(V, Instance); 8768 } 8769 8770 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 8771 VPSlotTracker &SlotTracker) const { 8772 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 8773 IG->getInsertPos()->printAsOperand(O, false); 8774 O << ", "; 8775 getAddr()->printAsOperand(O, SlotTracker); 8776 VPValue *Mask = getMask(); 8777 if (Mask) { 8778 O << ", "; 8779 Mask->printAsOperand(O, SlotTracker); 8780 } 8781 for (unsigned i = 0; i < IG->getFactor(); ++i) 8782 if (Instruction *I = IG->getMember(i)) 8783 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 8784 } 8785 8786 void VPWidenCallRecipe::execute(VPTransformState &State) { 8787 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 8788 *this, State); 8789 } 8790 8791 void VPWidenSelectRecipe::execute(VPTransformState &State) { 8792 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 8793 this, *this, InvariantCond, State); 8794 } 8795 8796 void VPWidenRecipe::execute(VPTransformState &State) { 8797 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 8798 } 8799 8800 void VPWidenGEPRecipe::execute(VPTransformState &State) { 8801 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 8802 *this, State.UF, State.VF, IsPtrLoopInvariant, 8803 IsIndexLoopInvariant, State); 8804 } 8805 8806 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 8807 assert(!State.Instance && "Int or FP induction being replicated."); 8808 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 8809 Trunc); 8810 } 8811 8812 void VPWidenPHIRecipe::execute(VPTransformState &State) { 8813 Value *StartV = 8814 getStartValue() ? getStartValue()->getLiveInIRValue() : nullptr; 8815 State.ILV->widenPHIInstruction(Phi, RdxDesc, StartV, State.UF, State.VF); 8816 } 8817 8818 void VPBlendRecipe::execute(VPTransformState &State) { 8819 State.ILV->setDebugLocFromInst(State.Builder, Phi); 8820 // We know that all PHIs in non-header blocks are converted into 8821 // selects, so we don't have to worry about the insertion order and we 8822 // can just use the builder. 8823 // At this point we generate the predication tree. 
There may be 8824 // duplications since this is a simple recursive scan, but future 8825 // optimizations will clean it up. 8826 8827 unsigned NumIncoming = getNumIncomingValues(); 8828 8829 // Generate a sequence of selects of the form: 8830 // SELECT(Mask3, In3, 8831 // SELECT(Mask2, In2, 8832 // SELECT(Mask1, In1, 8833 // In0))) 8834 // Note that Mask0 is never used: lanes for which no path reaches this phi and 8835 // are essentially undef are taken from In0. 8836 InnerLoopVectorizer::VectorParts Entry(State.UF); 8837 for (unsigned In = 0; In < NumIncoming; ++In) { 8838 for (unsigned Part = 0; Part < State.UF; ++Part) { 8839 // We might have single edge PHIs (blocks) - use an identity 8840 // 'select' for the first PHI operand. 8841 Value *In0 = State.get(getIncomingValue(In), Part); 8842 if (In == 0) 8843 Entry[Part] = In0; // Initialize with the first incoming value. 8844 else { 8845 // Select between the current value and the previous incoming edge 8846 // based on the incoming mask. 8847 Value *Cond = State.get(getMask(In), Part); 8848 Entry[Part] = 8849 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 8850 } 8851 } 8852 } 8853 for (unsigned Part = 0; Part < State.UF; ++Part) 8854 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 8855 } 8856 8857 void VPInterleaveRecipe::execute(VPTransformState &State) { 8858 assert(!State.Instance && "Interleave group being replicated."); 8859 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 8860 getStoredValues(), getMask()); 8861 } 8862 8863 void VPReductionRecipe::execute(VPTransformState &State) { 8864 assert(!State.Instance && "Reduction being replicated."); 8865 for (unsigned Part = 0; Part < State.UF; ++Part) { 8866 RecurKind Kind = RdxDesc->getRecurrenceKind(); 8867 Value *NewVecOp = State.get(getVecOp(), Part); 8868 if (VPValue *Cond = getCondOp()) { 8869 Value *NewCond = State.get(Cond, Part); 8870 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 8871 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 8872 Kind, VecTy->getElementType()); 8873 Constant *IdenVec = 8874 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 8875 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 8876 NewVecOp = Select; 8877 } 8878 Value *NewRed = 8879 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 8880 Value *PrevInChain = State.get(getChainOp(), Part); 8881 Value *NextInChain; 8882 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 8883 NextInChain = 8884 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 8885 NewRed, PrevInChain); 8886 } else { 8887 NextInChain = State.Builder.CreateBinOp( 8888 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 8889 PrevInChain); 8890 } 8891 State.set(this, getUnderlyingInstr(), NextInChain, Part); 8892 } 8893 } 8894 8895 void VPReplicateRecipe::execute(VPTransformState &State) { 8896 if (State.Instance) { // Generate a single instance. 8897 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 8898 State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, 8899 *State.Instance, IsPredicated, State); 8900 // Insert scalar instance packing it into a vector. 8901 if (AlsoPack && State.VF.isVector()) { 8902 // If we're constructing lane 0, initialize to start from poison. 
      if (State.Instance->Lane == 0) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.ValueMap.setVectorValue(getUnderlyingInstr(),
                                      State.Instance->Part, Poison);
      }
      State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
                                           *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
                                      IsPredicated, State);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");

  // By current pack/unpack logic we need to generate only a single phi node:
  // if a vector value for the predicated instruction exists at this point it
  // means the instruction has vector users only, and a phi for the vector
  // value is needed. In this case the recipe of the predicated instruction is
  // marked to also do that packing, thereby "hoisting" the insert-element
  // sequence. Otherwise, a phi node for the scalar value is needed.
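  // Roughly, the generated merge has the form
  //   %predphi = phi [ <bypass value>, PredicatingBB ],
  //                  [ <value produced in the predicated block>, PredicatedBB ]
  // where the bypass value is either the not-yet-updated vector (vector
  // users) or poison (scalar users), matching the two branches below.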
8968 unsigned Part = State.Instance->Part; 8969 Instruction *PredInst = 8970 cast<Instruction>(getOperand(0)->getUnderlyingValue()); 8971 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 8972 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 8973 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 8974 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 8975 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 8976 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 8977 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 8978 } else { 8979 Type *PredInstType = PredInst->getType(); 8980 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 8981 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB); 8982 Phi->addIncoming(ScalarPredInst, PredicatedBB); 8983 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 8984 } 8985 } 8986 8987 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 8988 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 8989 State.ILV->vectorizeMemoryInstruction(&Ingredient, State, 8990 StoredValue ? nullptr : getVPValue(), 8991 getAddr(), StoredValue, getMask()); 8992 } 8993 8994 // Determine how to lower the scalar epilogue, which depends on 1) optimising 8995 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 8996 // predication, and 4) a TTI hook that analyses whether the loop is suitable 8997 // for predication. 8998 static ScalarEpilogueLowering getScalarEpilogueLowering( 8999 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9000 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9001 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9002 LoopVectorizationLegality &LVL) { 9003 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9004 // don't look at hints or options, and don't request a scalar epilogue. 9005 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9006 // LoopAccessInfo (due to code dependency and not being able to reliably get 9007 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9008 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9009 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9010 // back to the old way and vectorize with versioning when forced. See D81345.) 
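  // Note the strict precedence of the checks below: each one returns as soon
  // as it applies, so the OptSize case wins over the command-line directive,
  // which wins over the loop hint, which wins over the TTI preference.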
9011 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9012 PGSOQueryType::IRPass) && 9013 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9014 return CM_ScalarEpilogueNotAllowedOptSize; 9015 9016 // 2) If set, obey the directives 9017 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9018 switch (PreferPredicateOverEpilogue) { 9019 case PreferPredicateTy::ScalarEpilogue: 9020 return CM_ScalarEpilogueAllowed; 9021 case PreferPredicateTy::PredicateElseScalarEpilogue: 9022 return CM_ScalarEpilogueNotNeededUsePredicate; 9023 case PreferPredicateTy::PredicateOrDontVectorize: 9024 return CM_ScalarEpilogueNotAllowedUsePredicate; 9025 }; 9026 } 9027 9028 // 3) If set, obey the hints 9029 switch (Hints.getPredicate()) { 9030 case LoopVectorizeHints::FK_Enabled: 9031 return CM_ScalarEpilogueNotNeededUsePredicate; 9032 case LoopVectorizeHints::FK_Disabled: 9033 return CM_ScalarEpilogueAllowed; 9034 }; 9035 9036 // 4) if the TTI hook indicates this is profitable, request predication. 9037 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9038 LVL.getLAI())) 9039 return CM_ScalarEpilogueNotNeededUsePredicate; 9040 9041 return CM_ScalarEpilogueAllowed; 9042 } 9043 9044 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V, 9045 unsigned Part) { 9046 set(Def, V, Part); 9047 ILV->setVectorValue(IRDef, Part, V); 9048 } 9049 9050 // Process the loop in the VPlan-native vectorization path. This path builds 9051 // VPlan upfront in the vectorization pipeline, which allows to apply 9052 // VPlan-to-VPlan transformations from the very beginning without modifying the 9053 // input LLVM IR. 9054 static bool processLoopInVPlanNativePath( 9055 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9056 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9057 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9058 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9059 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 9060 9061 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9062 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9063 return false; 9064 } 9065 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9066 Function *F = L->getHeader()->getParent(); 9067 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9068 9069 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9070 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 9071 9072 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9073 &Hints, IAI); 9074 // Use the planner for outer loop vectorization. 9075 // TODO: CM is not used at this point inside the planner. Turn CM into an 9076 // optional argument if we don't need it in the future. 9077 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 9078 9079 // Get user vectorization factor. 9080 ElementCount UserVF = Hints.getWidth(); 9081 9082 // Plan how to best vectorize, return the best VF and its cost. 9083 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9084 9085 // If we are stress testing VPlan builds, do not attempt to generate vector 9086 // code. Masked vector code generation support will follow soon. 9087 // Also, do not attempt to vectorize if no vector code will be produced. 
9088 if (VPlanBuildStressTest || EnableVPlanPredication || 9089 VectorizationFactor::Disabled() == VF) 9090 return false; 9091 9092 LVP.setBestPlan(VF.Width, 1); 9093 9094 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 9095 &CM, BFI, PSI); 9096 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9097 << L->getHeader()->getParent()->getName() << "\"\n"); 9098 LVP.executePlan(LB, DT); 9099 9100 // Mark the loop as already vectorized to avoid vectorizing again. 9101 Hints.setAlreadyVectorized(); 9102 9103 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9104 return true; 9105 } 9106 9107 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9108 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9109 !EnableLoopInterleaving), 9110 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9111 !EnableLoopVectorization) {} 9112 9113 bool LoopVectorizePass::processLoop(Loop *L) { 9114 assert((EnableVPlanNativePath || L->isInnermost()) && 9115 "VPlan-native path is not enabled. Only process inner loops."); 9116 9117 #ifndef NDEBUG 9118 const std::string DebugLocStr = getDebugLocString(L); 9119 #endif /* NDEBUG */ 9120 9121 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 9122 << L->getHeader()->getParent()->getName() << "\" from " 9123 << DebugLocStr << "\n"); 9124 9125 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 9126 9127 LLVM_DEBUG( 9128 dbgs() << "LV: Loop hints:" 9129 << " force=" 9130 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9131 ? "disabled" 9132 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9133 ? "enabled" 9134 : "?")) 9135 << " width=" << Hints.getWidth() 9136 << " unroll=" << Hints.getInterleave() << "\n"); 9137 9138 // Function containing loop 9139 Function *F = L->getHeader()->getParent(); 9140 9141 // Looking at the diagnostic output is the only way to determine if a loop 9142 // was vectorized (other than looking at the IR or machine code), so it 9143 // is important to generate an optimization remark for each loop. Most of 9144 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9145 // generated as OptimizationRemark and OptimizationRemarkMissed are 9146 // less verbose reporting vectorized loops and unvectorized loops that may 9147 // benefit from vectorization, respectively. 9148 9149 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9150 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9151 return false; 9152 } 9153 9154 PredicatedScalarEvolution PSE(*SE, *L); 9155 9156 // Check if it is legal to vectorize the loop. 9157 LoopVectorizationRequirements Requirements(*ORE); 9158 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 9159 &Requirements, &Hints, DB, AC, BFI, PSI); 9160 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9161 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9162 Hints.emitRemarkWithHints(); 9163 return false; 9164 } 9165 9166 // Check the function attributes and profiles to find out if this function 9167 // should be optimized for size. 9168 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9169 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 9170 9171 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9172 // here. They may require CFG and instruction level transformations before 9173 // even evaluating whether vectorization is profitable. 
Since we cannot modify 9174 // the incoming IR, we need to build VPlan upfront in the vectorization 9175 // pipeline. 9176 if (!L->isInnermost()) 9177 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9178 ORE, BFI, PSI, Hints); 9179 9180 assert(L->isInnermost() && "Inner loop expected."); 9181 9182 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9183 // count by optimizing for size, to minimize overheads. 9184 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9185 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9186 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9187 << "This loop is worth vectorizing only if no scalar " 9188 << "iteration overheads are incurred."); 9189 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9190 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9191 else { 9192 LLVM_DEBUG(dbgs() << "\n"); 9193 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9194 } 9195 } 9196 9197 // Check the function attributes to see if implicit floats are allowed. 9198 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9199 // an integer loop and the vector instructions selected are purely integer 9200 // vector instructions? 9201 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9202 reportVectorizationFailure( 9203 "Can't vectorize when the NoImplicitFloat attribute is used", 9204 "loop not vectorized due to NoImplicitFloat attribute", 9205 "NoImplicitFloat", ORE, L); 9206 Hints.emitRemarkWithHints(); 9207 return false; 9208 } 9209 9210 // Check if the target supports potentially unsafe FP vectorization. 9211 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9212 // for the target we're vectorizing for, to make sure none of the 9213 // additional fp-math flags can help. 9214 if (Hints.isPotentiallyUnsafe() && 9215 TTI->isFPVectorizationPotentiallyUnsafe()) { 9216 reportVectorizationFailure( 9217 "Potentially unsafe FP op prevents vectorization", 9218 "loop not vectorized due to unsafe FP support.", 9219 "UnsafeFP", ORE, L); 9220 Hints.emitRemarkWithHints(); 9221 return false; 9222 } 9223 9224 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 9225 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 9226 9227 // If an override option has been passed in for interleaved accesses, use it. 9228 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 9229 UseInterleaved = EnableInterleavedMemAccesses; 9230 9231 // Analyze interleaved memory accesses. 9232 if (UseInterleaved) { 9233 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 9234 } 9235 9236 // Use the cost model. 9237 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 9238 F, &Hints, IAI); 9239 CM.collectValuesToIgnore(); 9240 9241 // Use the planner for vectorization. 9242 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); 9243 9244 // Get user vectorization factor and interleave count. 9245 ElementCount UserVF = Hints.getWidth(); 9246 unsigned UserIC = Hints.getInterleave(); 9247 9248 // Plan how to best vectorize, return the best VF and its cost. 9249 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 9250 9251 VectorizationFactor VF = VectorizationFactor::Disabled(); 9252 unsigned IC = 1; 9253 9254 if (MaybeVF) { 9255 VF = *MaybeVF; 9256 // Select the interleave count. 
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
9317 ORE->emit([&]() { 9318 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 9319 L->getStartLoc(), L->getHeader()) 9320 << VecDiagMsg.second; 9321 }); 9322 ORE->emit([&]() { 9323 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 9324 L->getStartLoc(), L->getHeader()) 9325 << IntDiagMsg.second; 9326 }); 9327 return false; 9328 } else if (!VectorizeLoop && InterleaveLoop) { 9329 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9330 ORE->emit([&]() { 9331 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 9332 L->getStartLoc(), L->getHeader()) 9333 << VecDiagMsg.second; 9334 }); 9335 } else if (VectorizeLoop && !InterleaveLoop) { 9336 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9337 << ") in " << DebugLocStr << '\n'); 9338 ORE->emit([&]() { 9339 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 9340 L->getStartLoc(), L->getHeader()) 9341 << IntDiagMsg.second; 9342 }); 9343 } else if (VectorizeLoop && InterleaveLoop) { 9344 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9345 << ") in " << DebugLocStr << '\n'); 9346 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9347 } 9348 9349 LVP.setBestPlan(VF.Width, IC); 9350 9351 using namespace ore; 9352 bool DisableRuntimeUnroll = false; 9353 MDNode *OrigLoopID = L->getLoopID(); 9354 9355 if (!VectorizeLoop) { 9356 assert(IC > 1 && "interleave count should not be 1 or 0"); 9357 // If we decided that it is not legal to vectorize the loop, then 9358 // interleave it. 9359 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, 9360 BFI, PSI); 9361 LVP.executePlan(Unroller, DT); 9362 9363 ORE->emit([&]() { 9364 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 9365 L->getHeader()) 9366 << "interleaved loop (interleaved count: " 9367 << NV("InterleaveCount", IC) << ")"; 9368 }); 9369 } else { 9370 // If we decided that it is *legal* to vectorize the loop, then do it. 9371 9372 // Consider vectorizing the epilogue too if it's profitable. 9373 VectorizationFactor EpilogueVF = 9374 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 9375 if (EpilogueVF.Width.isVector()) { 9376 9377 // The first pass vectorizes the main loop and creates a scalar epilogue 9378 // to be vectorized by executing the plan (potentially with a different 9379 // factor) again shortly afterwards. 9380 EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, 9381 EpilogueVF.Width.getKnownMinValue(), 1); 9382 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, 9383 &LVL, &CM, BFI, PSI); 9384 9385 LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); 9386 LVP.executePlan(MainILV, DT); 9387 ++LoopsVectorized; 9388 9389 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 9390 formLCSSARecursively(*L, *DT, LI, SE); 9391 9392 // Second pass vectorizes the epilogue and adjusts the control flow 9393 // edges from the first pass. 
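      // The planner is re-targeted at the (smaller) epilogue VF/UF here; EPI
      // carries the main loop's trip counts recorded during the first pass so
      // that the epilogue skeleton can be stitched into the control flow
      // created above.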
9394 LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); 9395 EPI.MainLoopVF = EPI.EpilogueVF; 9396 EPI.MainLoopUF = EPI.EpilogueUF; 9397 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 9398 ORE, EPI, &LVL, &CM, BFI, PSI); 9399 LVP.executePlan(EpilogILV, DT); 9400 ++LoopsEpilogueVectorized; 9401 9402 if (!MainILV.areSafetyChecksAdded()) 9403 DisableRuntimeUnroll = true; 9404 } else { 9405 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 9406 &LVL, &CM, BFI, PSI); 9407 LVP.executePlan(LB, DT); 9408 ++LoopsVectorized; 9409 9410 // Add metadata to disable runtime unrolling a scalar loop when there are 9411 // no runtime checks about strides and memory. A scalar loop that is 9412 // rarely used is not worth unrolling. 9413 if (!LB.areSafetyChecksAdded()) 9414 DisableRuntimeUnroll = true; 9415 } 9416 9417 // Report the vectorization decision. 9418 ORE->emit([&]() { 9419 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 9420 L->getHeader()) 9421 << "vectorized loop (vectorization width: " 9422 << NV("VectorizationFactor", VF.Width) 9423 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 9424 }); 9425 } 9426 9427 Optional<MDNode *> RemainderLoopID = 9428 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 9429 LLVMLoopVectorizeFollowupEpilogue}); 9430 if (RemainderLoopID.hasValue()) { 9431 L->setLoopID(RemainderLoopID.getValue()); 9432 } else { 9433 if (DisableRuntimeUnroll) 9434 AddRuntimeUnrollDisableMetaData(L); 9435 9436 // Mark the loop as already vectorized to avoid vectorizing again. 9437 Hints.setAlreadyVectorized(); 9438 } 9439 9440 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9441 return true; 9442 } 9443 9444 LoopVectorizeResult LoopVectorizePass::runImpl( 9445 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 9446 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 9447 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 9448 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 9449 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 9450 SE = &SE_; 9451 LI = &LI_; 9452 TTI = &TTI_; 9453 DT = &DT_; 9454 BFI = &BFI_; 9455 TLI = TLI_; 9456 AA = &AA_; 9457 AC = &AC_; 9458 GetLAA = &GetLAA_; 9459 DB = &DB_; 9460 ORE = &ORE_; 9461 PSI = PSI_; 9462 9463 // Don't attempt if 9464 // 1. the target claims to have no vector registers, and 9465 // 2. interleaving won't help ILP. 9466 // 9467 // The second condition is necessary because, even if the target has no 9468 // vector registers, loop vectorization may still enable scalar 9469 // interleaving. 9470 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 9471 TTI->getMaxInterleaveFactor(1) < 2) 9472 return LoopVectorizeResult(false, false); 9473 9474 bool Changed = false, CFGChanged = false; 9475 9476 // The vectorizer requires loops to be in simplified form. 9477 // Since simplification may add new inner loops, it has to run before the 9478 // legality and profitability checks. This means running the loop vectorizer 9479 // will simplify all loops, regardless of whether anything end up being 9480 // vectorized. 9481 for (auto &L : *LI) 9482 Changed |= CFGChanged |= 9483 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 9484 9485 // Build up a worklist of inner-loops to vectorize. This is necessary as 9486 // the act of vectorizing or partially unrolling a loop creates new loops 9487 // and can invalidate iterators across the loops. 
9488 SmallVector<Loop *, 8> Worklist; 9489 9490 for (Loop *L : *LI) 9491 collectSupportedLoops(*L, LI, ORE, Worklist); 9492 9493 LoopsAnalyzed += Worklist.size(); 9494 9495 // Now walk the identified inner loops. 9496 while (!Worklist.empty()) { 9497 Loop *L = Worklist.pop_back_val(); 9498 9499 // For the inner loops we actually process, form LCSSA to simplify the 9500 // transform. 9501 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 9502 9503 Changed |= CFGChanged |= processLoop(L); 9504 } 9505 9506 // Process each loop nest in the function. 9507 return LoopVectorizeResult(Changed, CFGChanged); 9508 } 9509 9510 PreservedAnalyses LoopVectorizePass::run(Function &F, 9511 FunctionAnalysisManager &AM) { 9512 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 9513 auto &LI = AM.getResult<LoopAnalysis>(F); 9514 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 9515 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 9516 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 9517 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 9518 auto &AA = AM.getResult<AAManager>(F); 9519 auto &AC = AM.getResult<AssumptionAnalysis>(F); 9520 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 9521 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 9522 MemorySSA *MSSA = EnableMSSALoopDependency 9523 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() 9524 : nullptr; 9525 9526 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 9527 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 9528 [&](Loop &L) -> const LoopAccessInfo & { 9529 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 9530 TLI, TTI, nullptr, MSSA}; 9531 return LAM.getResult<LoopAccessAnalysis>(L, AR); 9532 }; 9533 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 9534 ProfileSummaryInfo *PSI = 9535 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 9536 LoopVectorizeResult Result = 9537 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 9538 if (!Result.MadeAnyChange) 9539 return PreservedAnalyses::all(); 9540 PreservedAnalyses PA; 9541 9542 // We currently do not preserve loopinfo/dominator analyses with outer loop 9543 // vectorization. Until this is addressed, mark these analyses as preserved 9544 // only for non-VPlan-native path. 9545 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 9546 if (!EnableVPlanNativePath) { 9547 PA.preserve<LoopAnalysis>(); 9548 PA.preserve<DominatorTreeAnalysis>(); 9549 } 9550 PA.preserve<BasicAA>(); 9551 PA.preserve<GlobalsAA>(); 9552 if (!Result.MadeCFGChange) 9553 PA.preserveSet<CFGAnalyses>(); 9554 return PA; 9555 } 9556